Skip to content

Commit 24eb061

Browse files
author
R. S. Doiel
committed
commiting to main
1 parent 50947b1 commit 24eb061

File tree

3 files changed

+453
-0
lines changed

3 files changed

+453
-0
lines changed

cmd/csv2jsonl/csv2jsonl.go

Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
// csv2jsonl - is a command line tool that takes CSV input from stdin and
2+
// writes out JSON-L expression. It includes support for using the first
3+
// row as field names or default fieldnames (e.g. col_0, col_1, col_2).
4+
// Each CSV row is written out as an individual JSON object, one
5+
// object per line (see https://jsonlines.org).
6+
//
7+
// @author R. S. Doiel, <[email protected]>
8+
//
9+
// Copyright (c) 2021, Caltech
10+
// All rights not granted herein are expressly reserved by Caltech.
11+
//
12+
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
13+
//
14+
// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
15+
//
16+
// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
17+
//
18+
// 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
19+
//
20+
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21+
package main
22+
23+
import (
24+
"encoding/csv"
25+
"flag"
26+
"fmt"
27+
"io"
28+
"os"
29+
"path"
30+
"strings"
31+
32+
// My packages
33+
"github.com/caltechlibrary/datatools"
34+
)
35+
36+
var (
	// helpText is the manual page source displayed by the -help option.
	// main passes it to datatools.FmtHelp together with the application
	// name, version, release date and release hash; presumably those
	// values replace the {app_name}, {version}, {release_date} and
	// {release_hash} placeholders (confirm against datatools.FmtHelp).
	//
	// Only options that main actually registers are documented here, and
	// the examples use -i because main does not read positional arguments.
	helpText = `%{app_name}(1) user manual | version {version} {release_hash}
% R. S. Doiel
% {release_date}

# NAME

{app_name}

# SYNOPSIS

{app_name} [OPTIONS]

# DESCRIPTION

csv2jsonl reads CSV from stdin and writes a JSON-L to stdout. JSON output
is one object per line. See https://jsonlines.org.

# OPTIONS

-help
: display help

-license
: display license

-version
: display version

-d, -delimiter
: set the delimiter character

-fields-per-record
: Set the number of fields expected in the CSV read, -1 to turn off

-i, -input
: input filename

-nl, -newline
: include trailing newline in output

-o, -output
: output filename

-quiet
: suppress error output

-reuse-record
: reuse the backing array

-trim-leading-space
: trim leading space in fields for CSV input

-use-header
: treat the first row as field names

-use-lazy-quotes
: use lazy quotes for CSV input


# EXAMPLES

Convert data1.csv to data1.jsonl using Unix pipes.

~~~
    cat data1.csv | csv2jsonl > data1.jsonl
~~~

Convert data1.csv to JSON line (one object line per blob)

~~~
    csv2jsonl -i data1.csv
~~~

`

	// Standard Options
	showHelp    bool   // -help: print the manual page and exit
	showLicense bool   // -license: print the license text and exit
	showVersion bool   // -version: print the version line and exit
	inputFName  string // -i, -input: input filename ("" or "-" means stdin)
	outputFName string // -o, -output: output filename ("" or "-" means stdout)
	quiet       bool   // -quiet: suppress per-row encoding error messages
	newLine     bool   // -nl, -newline: terminate each JSON object with "\n"
	eol         string // line terminator written after each object; set by main from newLine

	// Application Options
	useHeader        bool   // -use-header: treat the first CSV row as field names
	asBlobs          bool   // not referenced by main in this file; kept so the package's declared names are unchanged
	delimiter        string // -d, -delimiter: CSV field delimiter ("" keeps the csv.Reader default)
	lazyQuotes       bool   // -use-lazy-quotes: copied to csv.Reader.LazyQuotes
	trimLeadingSpace bool   // -trim-leading-space: copied to csv.Reader.TrimLeadingSpace
	fieldsPerRecord  int    // -fields-per-record: copied to csv.Reader.FieldsPerRecord
	reuseRecord      bool   // -reuse-record: copied to csv.Reader.ReuseRecord
)
134+
135+
func main() {
136+
appName := path.Base(os.Args[0])
137+
version := datatools.Version
138+
license := datatools.LicenseText
139+
releaseDate := datatools.ReleaseDate
140+
releaseHash := datatools.ReleaseHash
141+
142+
// Standard Options
143+
flag.BoolVar(&showHelp, "help", showHelp, "display help")
144+
flag.BoolVar(&showLicense, "license", showLicense, "display license")
145+
flag.BoolVar(&showVersion, "version", showVersion, "display version")
146+
flag.StringVar(&inputFName, "i", "", "input filename")
147+
flag.StringVar(&inputFName, "input", "", "input filename")
148+
flag.StringVar(&outputFName, "o", "", "output filename")
149+
flag.StringVar(&outputFName, "output", "", "output filename")
150+
flag.BoolVar(&quiet, "quiet", false, "suppress error output")
151+
flag.BoolVar(&newLine, "nl", true, "include trailing newline in output")
152+
flag.BoolVar(&newLine, "newline", true, "include trailing newline in output")
153+
154+
// App Options
155+
flag.BoolVar(&useHeader, "use-header", true, "treat the first row as field names")
156+
flag.StringVar(&delimiter, "d", "", "set the delimter character")
157+
flag.StringVar(&delimiter, "delimiter", "", "set the delimter character")
158+
flag.BoolVar(&lazyQuotes, "use-lazy-quotes", false, "use lazy quotes for for CSV input")
159+
flag.BoolVar(&trimLeadingSpace, "trim-leading-space", false, "trim leading space in fields for CSV input")
160+
flag.BoolVar(&reuseRecord, "reuse-record", false, "reuse the backing array")
161+
flag.IntVar(&fieldsPerRecord, "fields-per-record", 0, "Set the number of fields expected in the CSV read, -1 to turn off")
162+
163+
// Parse environment and options
164+
flag.Parse()
165+
166+
// Setup IO
167+
var err error
168+
169+
in := os.Stdin
170+
out := os.Stdout
171+
eout := os.Stderr
172+
173+
if inputFName != "" && inputFName != "-" {
174+
in, err = os.Open(inputFName)
175+
if err != nil {
176+
fmt.Fprintln(eout, err)
177+
os.Exit(1)
178+
}
179+
defer in.Close()
180+
}
181+
if outputFName != "" && outputFName != "-" {
182+
out, err = os.Create(outputFName)
183+
if err != nil {
184+
fmt.Fprintln(eout, err)
185+
os.Exit(1)
186+
}
187+
defer out.Close()
188+
}
189+
190+
// Process options
191+
if showHelp {
192+
fmt.Fprintf(out, "%s\n", datatools.FmtHelp(helpText, appName, version, releaseDate, releaseHash))
193+
os.Exit(0)
194+
}
195+
if showLicense == true {
196+
fmt.Fprintf(out, "%s\n", license)
197+
os.Exit(0)
198+
}
199+
if showVersion == true {
200+
fmt.Fprintf(out, "datatools, %s %s %s\n", appName, version, releaseHash)
201+
os.Exit(0)
202+
}
203+
if newLine {
204+
eol = "\n"
205+
}
206+
207+
rowNo := 0
208+
fieldNames := []string{}
209+
r := csv.NewReader(in)
210+
r.Comment = '#'
211+
r.FieldsPerRecord = fieldsPerRecord
212+
r.LazyQuotes = lazyQuotes
213+
r.TrimLeadingSpace = trimLeadingSpace
214+
r.ReuseRecord = reuseRecord
215+
if delimiter != "" {
216+
r.Comma = datatools.NormalizeDelimiterRune(delimiter)
217+
}
218+
if useHeader == true {
219+
row, err := r.Read()
220+
if err == io.EOF {
221+
fmt.Fprintln(eout, "No data")
222+
os.Exit(1)
223+
}
224+
if err != nil {
225+
fmt.Fprintln(eout, err)
226+
os.Exit(1)
227+
}
228+
for _, val := range row {
229+
fieldNames = append(fieldNames, strings.TrimSpace(val))
230+
}
231+
rowNo++
232+
}
233+
hasError := false
234+
object := map[string]interface{}{}
235+
for {
236+
row, err := r.Read()
237+
if err == io.EOF {
238+
break
239+
}
240+
if err != nil {
241+
fmt.Fprintln(eout, err)
242+
os.Exit(1)
243+
}
244+
245+
// Pad the fieldnames if necessary
246+
object = map[string]interface{}{}
247+
for col, val := range row {
248+
if col < len(fieldNames) {
249+
object[fieldNames[col]] = val
250+
} else {
251+
object[fmt.Sprintf("col_%d", col)] = val
252+
}
253+
}
254+
var src []byte
255+
src, err = datatools.JSONMarshal(object)
256+
if err != nil {
257+
if !quiet {
258+
fmt.Fprintf(eout, "error row %d, %s\n", rowNo, err)
259+
}
260+
hasError = true
261+
}
262+
fmt.Fprintf(out, "%s%s", src, eol)
263+
}
264+
if hasError == true {
265+
os.Exit(1)
266+
}
267+
}

csv2jsonl.1.html

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<title>Caltech Library's Digital Library Development Sandbox</title>
5+
<link href='https://fonts.googleapis.com/css?family=Open+Sans' rel='stylesheet' type='text/css'>
6+
<link rel="stylesheet" href="/css/site.css">
7+
</head>
8+
<body>
9+
<header>
10+
<a href="http://library.caltech.edu"><img src="/assets/liblogo.gif" alt="Caltech Library logo"></a>
11+
</header>
12+
<nav>
13+
<ul>
14+
<li><a href="/">Home</a></li>
15+
<li><a href="./">README</a></li>
16+
<li><a href="LICENSE">LICENSE</a></li>
17+
<li><a href="INSTALL.html">INSTALL</a></li>
18+
<li><a href="user-manual.html">User Manual</a></li>
19+
<li><a href="how-to/">Tutorials</a></li>
20+
<li><a href="search.html">Search Docs</a></li>
21+
<li><a href="about.html">About</a></li>
22+
<li><a href="https://github.com/caltechlibrary/datatools">GitHub</a></li>
23+
</ul>
24+
</nav>
25+
26+
<section>
27+
<h1 id="name">NAME</h1>
28+
<p>csv2jsonl</p>
29+
<h1 id="synopsis">SYNOPSIS</h1>
30+
<p>csv2jsonl [OPTIONS]</p>
31+
<h1 id="description">DESCRIPTION</h1>
32+
<p>csv2jsonl reads CSV from stdin and writes a JSON-L to stdout. JSON
33+
output is one object per line. See https://jsonlines.org.</p>
34+
<h1 id="options">OPTIONS</h1>
35+
<dl>
36+
<dt>-help</dt>
37+
<dd>
38+
display help
39+
</dd>
40+
<dt>-license</dt>
41+
<dd>
42+
display license
43+
</dd>
44+
<dt>-version</dt>
45+
<dd>
46+
display version
47+
</dd>
48+
<dt>-d, -delimiter</dt>
49+
<dd>
50+
set the delimiter character
51+
</dd>
52+
<dt>-examples</dt>
53+
<dd>
54+
display example(s)
55+
</dd>
56+
<dt>-fields-per-record</dt>
57+
<dd>
58+
Set the number of fields expected in the CSV read, -1 to turn off
59+
</dd>
60+
<dt>-i, -input</dt>
61+
<dd>
62+
input filename
63+
</dd>
64+
<dt>-nl, -newline</dt>
65+
<dd>
66+
include trailing newline in output
67+
</dd>
68+
<dt>-o, -output</dt>
69+
<dd>
70+
output filename
71+
</dd>
72+
<dt>-quiet</dt>
73+
<dd>
74+
suppress error output
75+
</dd>
76+
<dt>-reuse-record</dt>
77+
<dd>
78+
reuse the backing array
79+
</dd>
80+
<dt>-trim-leading-space</dt>
81+
<dd>
82+
trim leading space in fields for CSV input
83+
</dd>
84+
<dt>-use-header</dt>
85+
<dd>
86+
treat the first row as field names
87+
</dd>
88+
<dt>-use-lazy-quotes</dt>
89+
<dd>
90+
use lazy quotes for CSV input
91+
</dd>
92+
</dl>
93+
<h1 id="examples">EXAMPLES</h1>
94+
<p>Convert data1.csv to data1.jsonl using Unix pipes.</p>
95+
<pre><code> cat data1.csv | csv2jsonl &gt; data1.jsonl</code></pre>
96+
<p>Convert data1.csv to JSON line (one object line per blob)</p>
97+
<pre><code> csv2jsonl data1.csv</code></pre>
98+
</section>
99+
100+
<footer>
101+
<span><h1><a href="http://caltech.edu">Caltech</a></h1></span>
102+
<span>&copy; 2023 <a href="https://www.library.caltech.edu/copyright">Caltech library</a></span>
103+
<address>1200 E California Blvd, Mail Code 1-32, Pasadena, CA 91125-3200</address>
104+
<span>Phone: <a href="tel:+1-626-395-3405">(626)395-3405</a></span>
105+
<span><a href="mailto:[email protected]">Email Us</a></span>
106+
<a class="cl-hide" href="sitemap.xml">Site Map</a>
107+
</footer>
108+
</body>
109+
</html>

0 commit comments

Comments
 (0)