Skip to content

Commit 74a76bf

Browse files
committed
added csvcleaner to datatools, bug fixes
1 parent 3e933c1 commit 74a76bf

File tree

4 files changed

+266
-2
lines changed

4 files changed

+266
-2
lines changed

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ VERSION = $(shell grep -m1 'Version = ' $(PROJECT).go | cut -d\` -f 2)
77

88
BRANCH = $(shell git branch | grep '* ' | cut -d\ -f 2)
99

10-
build: bin/csvcols bin/csvrows bin/csvfind bin/csvjoin bin/jsoncols bin/jsonrange bin/xlsx2json bin/xlsx2csv bin/csv2mdtable bin/csv2xlsx bin/csv2json bin/vcard2json bin/jsonjoin bin/jsonmunge bin/findfile bin/finddir bin/mergepath bin/reldate bin/range bin/timefmt bin/urlparse bin/splitstring
10+
build: bin/csvcols bin/csvrows bin/csvfind bin/csvjoin bin/jsoncols bin/jsonrange bin/xlsx2json bin/xlsx2csv bin/csv2mdtable bin/csv2xlsx bin/csv2json bin/vcard2json bin/jsonjoin bin/jsonmunge bin/findfile bin/finddir bin/mergepath bin/reldate bin/range bin/timefmt bin/urlparse bin/splitstring bin/csvcleaner
1111

1212

1313
bin/csvcols: datatools.go cmds/csvcols/csvcols.go
@@ -76,6 +76,9 @@ bin/urlparse: datatools.go cmds/urlparse/urlparse.go
7676
bin/splitstring: datatools.go cmds/splitstring/splitstring.go
7777
go build -o bin/splitstring cmds/splitstring/splitstring.go
7878

79+
bin/csvcleaner: datatools.go cmds/csvcleaner/csvcleaner.go
80+
go build -o bin/csvcleaner cmds/csvcleaner/csvcleaner.go
81+
7982
test:
8083
go test
8184

@@ -124,6 +127,7 @@ install:
124127
env GOBIN=$(GOPATH)/bin go install cmds/xlsx2json/xlsx2json.go
125128
env GOBIN=$(GOPATH)/bin go install cmds/xlsx2csv/xlsx2csv.go
126129
env GOBIN=$(GOPATH)/bin go install cmds/splitstring/splitstring.go
130+
env GOBIN=$(GOPATH)/bin go install cmds/csvcleaner/csvcleaner.go
127131

128132
dist/linux-amd64:
129133
mkdir -p dist/bin
@@ -149,6 +153,7 @@ dist/linux-amd64:
149153
env GOOS=linux GOARCH=amd64 go build -o dist/bin/timefmt cmds/timefmt/timefmt.go
150154
env GOOS=linux GOARCH=amd64 go build -o dist/bin/urlparse cmds/urlparse/urlparse.go
151155
env GOOS=linux GOARCH=amd64 go build -o dist/bin/splitstring cmds/splitstring/splitstring.go
156+
env GOOS=linux GOARCH=amd64 go build -o dist/bin/csvcleaner cmds/csvcleaner/csvcleaner.go
152157
cd dist && zip -r $(PROJECT)-$(VERSION)-linux-amd64.zip README.md LICENSE INSTALL.md bin/* docs/* how-to/* demos/*
153158
rm -fR dist/bin
154159

@@ -177,6 +182,7 @@ dist/macosx-amd64:
177182
env GOOS=darwin GOARCH=amd64 go build -o dist/bin/timefmt cmds/timefmt/timefmt.go
178183
env GOOS=darwin GOARCH=amd64 go build -o dist/bin/urlparse cmds/urlparse/urlparse.go
179184
env GOOS=darwin GOARCH=amd64 go build -o dist/bin/splitstring cmds/splitstring/splitstring.go
185+
env GOOS=darwin GOARCH=amd64 go build -o dist/bin/csvcleaner cmds/csvcleaner/csvcleaner.go
180186
cd dist && zip -r $(PROJECT)-$(VERSION)-macosx-amd64.zip README.md LICENSE INSTALL.md bin/* docs/* how-to/* demos/*
181187
rm -fR dist/bin
182188

@@ -206,6 +212,7 @@ dist/windows-amd64:
206212
env GOOS=windows GOARCH=amd64 go build -o dist/bin/timefmt.exe cmds/timefmt/timefmt.go
207213
env GOOS=windows GOARCH=amd64 go build -o dist/bin/urlparse.exe cmds/urlparse/urlparse.go
208214
env GOOS=windows GOARCH=amd64 go build -o dist/bin/splitstring.exe cmds/splitstring/splitstring.go
215+
env GOOS=windows GOARCH=amd64 go build -o dist/bin/csvcleaner.exe cmds/csvcleaner/csvcleaner.go
209216
cd dist && zip -r $(PROJECT)-$(VERSION)-windows-amd64.zip README.md LICENSE INSTALL.md bin/* docs/* how-to/* demos/*
210217
rm -fR dist/bin
211218

@@ -236,6 +243,7 @@ dist/raspbian-arm7:
236243
env GOOS=linux GOARCH=arm GOARM=7 go build -o dist/bin/timefmt cmds/timefmt/timefmt.go
237244
env GOOS=linux GOARCH=arm GOARM=7 go build -o dist/bin/urlparse cmds/urlparse/urlparse.go
238245
env GOOS=linux GOARCH=arm GOARM=7 go build -o dist/bin/splitstring cmds/splitstring/splitstring.go
246+
env GOOS=linux GOARCH=arm GOARM=7 go build -o dist/bin/csvcleaner cmds/csvcleaner/csvcleaner.go
239247
cd dist && zip -r $(PROJECT)-$(VERSION)-raspbian-arm7.zip README.md LICENSE INSTALL.md bin/* docs/* how-to/* demos/*
240248
rm -fR dist/bin
241249

cmds/csvcleaner/csvcleaner.go

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
package main
2+
3+
import (
	"encoding/csv"
	"flag"
	"fmt"
	"io"
	"os"
	"path"
	"strings"
	"unicode"

	// Caltech Library Packages
	"github.com/caltechlibrary/cli"
	"github.com/caltechlibrary/datatools"
)
16+
17+
// Package-level state for the csvcleaner command: the help text templates
// and the variables that the command-line flags are bound to in init().
var (
	// usage is the one-line usage template; %s is the program name.
	usage = `USAGE: %s [OPTIONS]`

	// description is the long help text; %s is the program name.
	description = `
%s normalizes a CSV file based on the options selected. It
helps to address issues like variable number of columns, leading/trailing
spaces in columns, and non-UTF-8 encoding issues.

By default input is expected from standard in and output is sent to
standard out (errors to standard error). These can be modified by
appropriate options. The csv file is processed as a stream of rows so
minimal memory is used to operate on the file.

`

	// examples is the example text; it contains four %s verbs, each
	// replaced with the program name. The option names shown here must
	// match the flag names registered in init().
	examples = `

EXAMPLES

Normalizing a spread sheet's column count to 5 padding columns as needed per row.

cat mysheet.csv | %s -fields-per-row=5

Trim leading spaces.

cat mysheet.csv | %s -left-trim-spaces

Trim trailing spaces.

cat mysheet.csv | %s -right-trim-spaces

Trim leading and trailing spaces

cat mysheet.csv | %s -trim-spaces

`

	// Standard Options
	showHelp     bool
	showLicense  bool
	showVersion  bool
	showExamples bool
	inputFName   string // empty means read from standard input
	outputFName  string // empty means write to standard output

	// App Options
	comma             string // input cell delimiter; only the first rune is used
	rowComment        string // comment marker for input rows; only the first rune is used
	fieldsPerRecord   int    // copied to csv.Reader.FieldsPerRecord
	lazyQuotes        bool   // copied to csv.Reader.LazyQuotes
	trailingComma     bool   // NOTE(review): not bound to a flag or read anywhere in this file
	trimSpace         bool   // shorthand that turns on both trimLeadingSpace and trimTrailingSpace
	trimLeadingSpace  bool   // copied to csv.Reader.TrimLeadingSpace
	trimTrailingSpace bool   // trim trailing white space from each cell
	reuseRecord       bool   // copied to csv.Reader.ReuseRecord
	commaOut          string // output cell delimiter; only the first rune is used
	useCRLF           bool   // copied to csv.Writer.UseCRLF
	stopOnError       bool   // exit with status 1 at the first row that fails to parse

	verbose bool // write progress and diagnostics to standard error
)
78+
79+
func init() {
80+
// Standard options
81+
flag.BoolVar(&showHelp, "h", false, "display help")
82+
flag.BoolVar(&showHelp, "help", false, "display help")
83+
flag.BoolVar(&showLicense, "l", false, "display license")
84+
flag.BoolVar(&showLicense, "license", false, "display license")
85+
flag.BoolVar(&showVersion, "v", false, "display version")
86+
flag.BoolVar(&showVersion, "version", false, "display version")
87+
flag.BoolVar(&showExamples, "example", false, "display example(s)")
88+
flag.StringVar(&inputFName, "i", "", "input filename")
89+
flag.StringVar(&inputFName, "input", "", "input filename")
90+
flag.StringVar(&outputFName, "o", "", "output filename")
91+
flag.StringVar(&outputFName, "output", "", "output filename")
92+
93+
// Application specific options
94+
flag.IntVar(&fieldsPerRecord, "fields-per-row", 0, "set the number of columns to output right padding empty cells as needed")
95+
flag.BoolVar(&lazyQuotes, "use-lazy-quoting", false, "If LazyQuotes is true, a quote may appear in an unquoted field and a non-doubled quote may appear in a quoted field.")
96+
flag.BoolVar(&trimSpace, "trim-spaces", false, "If set to true leading and trailing white space in a field is ignored.")
97+
flag.BoolVar(&trimLeadingSpace, "left-trim-spaces", false, "If set to true leading white space in a field is ignored.")
98+
flag.BoolVar(&trimTrailingSpace, "right-trim-spaces", false, "If set to true trailing white space in a field is ignored.")
99+
flag.BoolVar(&reuseRecord, "reuse", true, "if false then a new array is allocated for each row processed, if true the array gets reused")
100+
flag.StringVar(&comma, "comma", "", "if set use this character in place of a comma for delimiting cells")
101+
flag.StringVar(&rowComment, "comment-char", "", "if set, rows starting with this character will be ignored as comments")
102+
flag.StringVar(&commaOut, "output-comma", "", "if set use this character in place of a comma for delimiting output cells")
103+
flag.BoolVar(&useCRLF, "use-crlf", false, "if set use a charage return and line feed in output")
104+
flag.BoolVar(&stopOnError, "stop-on-error", false, "exit on error, useful if you're trying to debug a problematic CSV file")
105+
106+
flag.BoolVar(&verbose, "verbose", false, "write verbose output to standard error")
107+
}
108+
109+
func main() {
110+
appName := path.Base(os.Args[0])
111+
flag.Parse()
112+
args := flag.Args()
113+
114+
cfg := cli.New(appName, "", datatools.Version)
115+
cfg.UsageText = fmt.Sprintf(usage, appName)
116+
cfg.DescriptionText = fmt.Sprintf(description, appName)
117+
cfg.OptionText = "OPTIONS"
118+
cfg.ExampleText = fmt.Sprintf(examples, appName, appName, appName, appName)
119+
120+
if showHelp == true {
121+
if len(args) > 0 {
122+
fmt.Println(cfg.Help(args...))
123+
} else {
124+
fmt.Println(cfg.Usage())
125+
}
126+
os.Exit(0)
127+
}
128+
129+
if showExamples == true {
130+
if len(args) > 0 {
131+
fmt.Println(cfg.Example(args...))
132+
} else {
133+
fmt.Println(cfg.ExampleText)
134+
}
135+
os.Exit(0)
136+
}
137+
138+
if showLicense == true {
139+
fmt.Println(cfg.License())
140+
os.Exit(0)
141+
}
142+
143+
if showVersion == true {
144+
fmt.Println(cfg.Version())
145+
os.Exit(0)
146+
}
147+
148+
in, err := cli.Open(inputFName, os.Stdin)
149+
if err != nil {
150+
fmt.Fprintf(os.Stderr, "%s\n", err)
151+
os.Exit(1)
152+
}
153+
defer cli.CloseFile(inputFName, in)
154+
155+
out, err := cli.Create(outputFName, os.Stdout)
156+
if err != nil {
157+
fmt.Fprintf(os.Stderr, "%s\n", err)
158+
os.Exit(1)
159+
}
160+
defer cli.CloseFile(outputFName, out)
161+
162+
// Loop through input CSV, apply options, write to output CSV
163+
if trimSpace == true {
164+
trimLeadingSpace = true
165+
trimTrailingSpace = true
166+
}
167+
168+
// Setup our CSV reader with any cli options
169+
var rStr []rune
170+
171+
r := csv.NewReader(in)
172+
if comma != "" {
173+
rStr = []rune(comma)
174+
if len(rStr) > 0 {
175+
r.Comma = rStr[0]
176+
}
177+
}
178+
if rowComment != "" {
179+
rStr = []rune(rowComment)
180+
if len(rStr) > 0 {
181+
r.Comment = rStr[0]
182+
}
183+
}
184+
r.FieldsPerRecord = fieldsPerRecord
185+
r.LazyQuotes = lazyQuotes
186+
r.TrimLeadingSpace = trimLeadingSpace
187+
r.ReuseRecord = reuseRecord
188+
189+
w := csv.NewWriter(out)
190+
if commaOut != "" {
191+
rStr = []rune(commaOut)
192+
if len(rStr) > 0 {
193+
w.Comma = rStr[0]
194+
}
195+
}
196+
w.UseCRLF = useCRLF
197+
198+
// i is so we can track row count as we process each streamed in row
199+
expectedCellCount := 0
200+
hasError := false
201+
i := 1
202+
for {
203+
row, err := r.Read()
204+
if err == io.EOF {
205+
break
206+
}
207+
if i == 1 {
208+
expectedCellCount = len(row)
209+
}
210+
if err != nil {
211+
serr := fmt.Sprintf("%s", err)
212+
if strings.HasSuffix(serr, "wrong number of fields in line") == true && fieldsPerRecord >= 0 {
213+
if verbose {
214+
fmt.Fprintf(os.Stderr, "row %d: expected %d, got %d cells\n", i, expectedCellCount, len(row))
215+
}
216+
// Trim trailing cells if needed
217+
if fieldsPerRecord > 0 && len(row) >= fieldsPerRecord {
218+
row = row[0:fieldsPerRecord]
219+
}
220+
// Append cells if needed
221+
for len(row) < expectedCellCount {
222+
row = append(row, "")
223+
}
224+
} else {
225+
hasError = true
226+
if verbose {
227+
fmt.Fprintf(os.Stderr, "%s\n", err)
228+
}
229+
}
230+
}
231+
if trimSpace {
232+
for i := range row {
233+
s := row[i]
234+
row[i] = strings.TrimSpace(s)
235+
}
236+
}
237+
if err := w.Write(row); err != nil {
238+
fmt.Fprintf(os.Stderr, "error writing row %d: %s\n", i, err)
239+
}
240+
i++
241+
if verbose == true && (i%100) == 0 {
242+
fmt.Fprintf(os.Stderr, "Processed %d rows\n", i)
243+
}
244+
if hasError && stopOnError {
245+
os.Exit(1)
246+
}
247+
}
248+
// Finally we need to flush any remaining output...
249+
w.Flush()
250+
if verbose {
251+
fmt.Fprintf(os.Stderr, "Processed %d rows\n", i)
252+
}
253+
}

cmds/splitstring/splitstring.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,19 @@ var (
1818
usage = `USAGE: %s [OPTIONS] [STRING_TO_SPLIT]`
1919

2020
description = `
21+
2122
%s splits a string based on a delimiting string provided. The default
2223
delimiter is a space. You can specify a delimiting string via
2324
the -d or --delimiter option. %s will split the string provided
2425
as a command line argument but can read split string(s) recieved on
2526
stdin in with the -i or --input option. By default the split
2627
strings are render as a JSON array but with the option -nl or
2728
--newline you can render each split string one per line.
29+
2830
`
2931

3032
examples = `
33+
3134
EXAMPLES
3235
3336
Splitting a string that is double pipe delimited rendering

datatools.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ import (
3535
)
3636

3737
const (
38-
Version = `v0.0.16`
38+
Version = `v0.0.17`
3939

4040
LicenseText = `
4141
%s %s

0 commit comments

Comments
 (0)