Skip to content

Commit 80dad79

Browse files
committed
crfs/stargz: add start of package
Basic API, format, tests. Good enough checkpoint. Updates golang/go#30829 Change-Id: Iaec5b205314d64fca5056f6b19a7bae52e5cef94 Reviewed-on: https://go-review.googlesource.com/c/build/+/167769 Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent c5eff76 commit 80dad79

File tree

2 files changed

+637
-0
lines changed

2 files changed

+637
-0
lines changed

crfs/stargz/stargz.go

Lines changed: 345 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,345 @@
1+
// Copyright 2019 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// The stargz package reads & writes tar.gz ("tarball") files in a
6+
// seekable, indexed format called "stargz". A stargz file is still a
7+
// valid tarball, but it's slightly bigger with new gzip streams for
8+
// each new file & throughout large files, and has an index in a magic
9+
// file at the end.
10+
package stargz
11+
12+
import (
13+
"archive/tar"
14+
"bufio"
15+
"bytes"
16+
"compress/gzip"
17+
"encoding/json"
18+
"errors"
19+
"fmt"
20+
"io"
21+
"strconv"
22+
"time"
23+
)
24+
25+
// TOCTarName is the name of the JSON file in the tar archive in the
26+
// table of contents gzip stream.
27+
const TOCTarName = "stargz.index.json"
28+
29+
// FooterSize is the number of bytes in the stargz footer.
30+
//
31+
// The footer is an empty gzip stream with no compression and an Extra
32+
// header of the form "%016xSTARGZ", where the 64 bit hex-encoded
33+
// number is the offset to the gzip stream of JSON TOC.
34+
//
35+
// 47 comes from:
36+
//
37+
// 10 byte gzip header +
38+
// 2 byte (LE16) length of extra, encoding 22 (16 hex digits + len("STARGZ")) == "\x16\x00" +
39+
// 22 bytes of extra (fmt.Sprintf("%016xSTARGZ", tocGzipOffset))
40+
// 5 byte flate header
41+
// 8 byte gzip footer (two little endian uint32s: digest, size)
42+
const FooterSize = 47
43+
44+
// A Reader permits random access reads from a stargz file.
45+
type Reader struct {
46+
sr *io.SectionReader
47+
TOC *TOC
48+
}
49+
50+
// Open opens a stargz file for reading.
51+
func Open(sr *io.SectionReader) (*Reader, error) {
52+
if sr.Size() < FooterSize {
53+
return nil, fmt.Errorf("stargz size %d is smaller than the stargz footer size", sr.Size())
54+
}
55+
// TODO: read a bigger chunk (1MB?) at once here to hopefully
56+
// get the TOC + footer in one go.
57+
var footer [FooterSize]byte
58+
if _, err := sr.ReadAt(footer[:], sr.Size()-FooterSize); err != nil {
59+
return nil, fmt.Errorf("error reading footer: %v", err)
60+
}
61+
tocOff, ok := parseFooter(footer[:])
62+
if !ok {
63+
return nil, fmt.Errorf("error parsing footer")
64+
}
65+
tocTargz := make([]byte, sr.Size()-tocOff-FooterSize)
66+
if _, err := sr.ReadAt(tocTargz, tocOff); err != nil {
67+
return nil, fmt.Errorf("error reading %d byte TOC targz: %v", len(tocTargz), err)
68+
}
69+
zr, err := gzip.NewReader(bytes.NewReader(tocTargz))
70+
if err != nil {
71+
return nil, fmt.Errorf("malformed TOC gzip header: %v", err)
72+
}
73+
zr.Multistream(false)
74+
tr := tar.NewReader(zr)
75+
h, err := tr.Next()
76+
if err != nil {
77+
return nil, fmt.Errorf("failed to find tar header in TOC gzip stream: %v", err)
78+
}
79+
if h.Name != TOCTarName {
80+
return nil, fmt.Errorf("TOC tar entry had name %q; expected %q", h.Name, TOCTarName)
81+
}
82+
toc := new(TOC)
83+
if err := json.NewDecoder(tr).Decode(&toc); err != nil {
84+
return nil, fmt.Errorf("error decoding TOC JSON: %v", err)
85+
}
86+
return &Reader{sr: sr, TOC: toc}, nil
87+
}
88+
89+
// TOCEntry describes one entry in the TOC (Table of Contents) of a
// stargz file.
type TOCEntry struct {
	// Offset is the offset to the gzip stream of the tar entry.
	// It is only set for regular files.
	Offset int64 `json:"offset,omitempty"`

	// Name is the name of the tar entry.
	Name string `json:"name"`

	// Type is the kind of entry: "dir", "reg", TODO.
	Type string `json:"type"`

	// Size is the size of the entry, as recorded in its tar header.
	Size int64 `json:"size,omitempty"`

	// LinkName is the link target, for symlinks.
	LinkName string `json:"linkName,omitempty"`

	// Mode is the permission and mode bits.
	Mode int64 `json:"mode,omitempty"`

	// Uid is the user ID of the owner.
	Uid int `json:"uid,omitempty"`

	// Gid is the group ID of the owner.
	Gid int `json:"gid,omitempty"`

	// Uname is the user name of the owner.
	Uname string `json:"userName,omitempty"`

	// Gname is the group name of the owner.
	Gname string `json:"groupName,omitempty"`

	// ModTime is the modification time of the entry as text.
	ModTime string `json:"modtime,omitempty"`

	// ChunkOffset is non-zero if this entry is a chunk of a large,
	// regular file. In that case Offset is where the gzip header
	// begins for the ChunkSize bytes found at ChunkOffset within the
	// file Name. If both ChunkOffset and ChunkSize are zero, the
	// file contents are completely represented by the tar gzip
	// stream starting at Offset.
	ChunkOffset int64 `json:"chunkOffset,omitempty"`
	ChunkSize   int64 `json:"chunkSize,omitempty"`
}
112+
113+
// TOC is the table of contents index of the files in the stargz file.
114+
type TOC struct {
115+
Version int `json:"version"`
116+
Entries []TOCEntry `json:"entries"`
117+
}
118+
119+
// A Writer writes stargz files.
120+
//
121+
// Use NewWriter to create a new Writer.
122+
type Writer struct {
123+
bw *bufio.Writer
124+
cw *countWriter
125+
toc *TOC
126+
127+
closed bool
128+
gz *gzip.Writer
129+
}
130+
131+
// NewWriter returns a new stargz writer writing to w.
132+
//
133+
// The writer must be closed to write its trailing table of contents.
134+
func NewWriter(w io.Writer) *Writer {
135+
bw := bufio.NewWriter(w)
136+
cw := &countWriter{w: bw}
137+
return &Writer{
138+
bw: bw,
139+
cw: cw,
140+
toc: &TOC{Version: 1},
141+
}
142+
}
143+
144+
// Close writes the stargz's table of contents and flushes all the
145+
// buffers, returning any error.
146+
func (w *Writer) Close() error {
147+
if w.closed {
148+
return nil
149+
}
150+
defer func() { w.closed = true }()
151+
152+
if err := w.closeGz(); err != nil {
153+
return err
154+
}
155+
156+
// Write the TOC index.
157+
tocOff := w.cw.n
158+
w.gz, _ = gzip.NewWriterLevel(w.cw, gzip.BestCompression)
159+
tw := tar.NewWriter(w.gz)
160+
tocJSON, err := json.MarshalIndent(w.toc, "", "\t")
161+
if err != nil {
162+
return err
163+
}
164+
if err := tw.WriteHeader(&tar.Header{
165+
Typeflag: tar.TypeReg,
166+
Name: TOCTarName,
167+
Size: int64(len(tocJSON)),
168+
}); err != nil {
169+
return err
170+
}
171+
if _, err := tw.Write(tocJSON); err != nil {
172+
return err
173+
}
174+
175+
if err := tw.Close(); err != nil {
176+
return err
177+
}
178+
if err := w.closeGz(); err != nil {
179+
return err
180+
}
181+
182+
// And a little footer with pointer to the TOC gzip stream.
183+
if _, err := w.bw.Write(footerBytes(tocOff)); err != nil {
184+
return err
185+
}
186+
187+
if err := w.bw.Flush(); err != nil {
188+
return err
189+
}
190+
191+
return nil
192+
}
193+
194+
func (w *Writer) closeGz() error {
195+
if w.closed {
196+
return errors.New("write on closed Writer")
197+
}
198+
if w.gz != nil {
199+
if err := w.gz.Close(); err != nil {
200+
return err
201+
}
202+
w.gz = nil
203+
}
204+
return nil
205+
}
206+
207+
// AppendTar reads the tar or tar.gz file from r and appends
208+
// each of its contents to w.
209+
//
210+
// The input r can optionally be gzip compressed but the output will
211+
// always be gzip compressed.
212+
func (w *Writer) AppendTar(r io.Reader) error {
213+
br := bufio.NewReader(r)
214+
var tr *tar.Reader
215+
if isGzip(br) {
216+
// NewReader can't fail if isGzip returned true.
217+
zr, _ := gzip.NewReader(br)
218+
tr = tar.NewReader(zr)
219+
} else {
220+
tr = tar.NewReader(br)
221+
}
222+
for {
223+
h, err := tr.Next()
224+
if err == io.EOF {
225+
break
226+
}
227+
if err != nil {
228+
return fmt.Errorf("error reading from source tar: tar.Reader.Next: %v", err)
229+
}
230+
ent := TOCEntry{
231+
Name: h.Name,
232+
Mode: h.Mode,
233+
Uid: h.Uid,
234+
Gid: h.Gid,
235+
Uname: h.Uname,
236+
Gname: h.Gname,
237+
ModTime: formatModtime(h.ModTime),
238+
}
239+
switch h.Typeflag {
240+
case tar.TypeLink:
241+
return fmt.Errorf("TODO: unsupported hardlink %q => %q", h.Name, h.Linkname)
242+
case tar.TypeSymlink:
243+
ent.Type = "symlink"
244+
ent.LinkName = h.Linkname
245+
case tar.TypeDir:
246+
ent.Type = "dir"
247+
case tar.TypeReg:
248+
ent.Offset = w.cw.n
249+
ent.Type = "reg"
250+
ent.Size = h.Size
251+
252+
// Start a new gzip stream for regular files.
253+
if err := w.closeGz(); err != nil {
254+
return err
255+
}
256+
default:
257+
return fmt.Errorf("unsupported input tar entry %q", h.Typeflag)
258+
}
259+
w.toc.Entries = append(w.toc.Entries, ent)
260+
if w.gz == nil {
261+
w.gz, err = gzip.NewWriterLevel(w.cw, gzip.BestCompression)
262+
if err != nil {
263+
return err
264+
}
265+
}
266+
tw := tar.NewWriter(w.gz)
267+
if err := tw.WriteHeader(h); err != nil {
268+
return err
269+
}
270+
if _, err := io.Copy(tw, tr); err != nil {
271+
return err
272+
}
273+
if err := tw.Flush(); err != nil {
274+
return err
275+
}
276+
}
277+
return nil
278+
}
279+
280+
// footerBytes the 47 byte footer.
281+
func footerBytes(tocOff int64) []byte {
282+
buf := bytes.NewBuffer(make([]byte, 0, FooterSize))
283+
gz, _ := gzip.NewWriterLevel(buf, gzip.NoCompression)
284+
gz.Header.Extra = []byte(fmt.Sprintf("%016xSTARGZ", tocOff))
285+
gz.Close()
286+
if buf.Len() != FooterSize {
287+
panic(fmt.Sprintf("footer buffer = %d, not %d", buf.Len(), FooterSize))
288+
}
289+
return buf.Bytes()
290+
}
291+
292+
func parseFooter(p []byte) (tocOffset int64, ok bool) {
293+
if len(p) != FooterSize {
294+
return 0, false
295+
}
296+
zr, err := gzip.NewReader(bytes.NewReader(p))
297+
if err != nil {
298+
return 0, false
299+
}
300+
extra := zr.Header.Extra
301+
if len(extra) != 16+len("STARGZ") {
302+
return 0, false
303+
}
304+
if string(extra[16:]) != "STARGZ" {
305+
return 0, false
306+
}
307+
tocOffset, err = strconv.ParseInt(string(extra[:16]), 16, 64)
308+
return tocOffset, err == nil
309+
}
310+
311+
// formatModtime returns t formatted in UTC for use in a TOC entry.
//
// It returns the empty string if t is the zero time or falls within
// the second starting at the Unix epoch. Otherwise it uses
// time.RFC3339 for times on a whole second and time.RFC3339Nano for
// times with a fractional second.
func formatModtime(t time.Time) string {
	if t.IsZero() || t.Unix() == 0 {
		return ""
	}
	utc := t.UTC()
	layout := time.RFC3339Nano
	if utc.Nanosecond() == 0 {
		// No fractional second to preserve.
		layout = time.RFC3339
	}
	return utc.Format(layout)
}
321+
322+
// countWriter is an io.Writer that forwards writes to another
// io.Writer and keeps a running total of the bytes written.
type countWriter struct {
	w io.Writer // destination of every write
	n int64     // total number of bytes w has reported as written
}

// Write passes p to the wrapped writer and adds the number of bytes
// it reports as written to the running total, even when the write
// also returns an error.
func (cw *countWriter) Write(p []byte) (int, error) {
	written, err := cw.w.Write(p)
	cw.n += int64(written)
	return written, err
}
334+
335+
// isGzip reports whether the next bytes of br look like the start of
// a gzip stream: the two gzip ID bytes followed by the deflate
// compression method. It only peeks, so no bytes are consumed from br.
func isGzip(br *bufio.Reader) bool {
	// ID1, ID2 and the compression method (8 = deflate).
	magic := [...]byte{0x1f, 0x8b, 8}

	head, _ := br.Peek(len(magic))
	if len(head) < len(magic) {
		// Input too short to hold a gzip header.
		return false
	}
	for i, want := range magic {
		if head[i] != want {
			return false
		}
	}
	return true
}

0 commit comments

Comments
 (0)