|
| 1 | +// Copyright 2019 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +// The stargz package reads & writes tar.gz ("tarball") files in a |
| 6 | +// seekable, indexed format called "stargz". A stargz file is still a |
| 7 | +// valid tarball, but it's slightly bigger with new gzip streams for |
| 8 | +// each new file & throughout large files, and has an index in a magic |
| 9 | +// file at the end. |
| 10 | +package stargz |
| 11 | + |
| 12 | +import ( |
| 13 | + "archive/tar" |
| 14 | + "bufio" |
| 15 | + "bytes" |
| 16 | + "compress/gzip" |
| 17 | + "encoding/json" |
| 18 | + "errors" |
| 19 | + "fmt" |
| 20 | + "io" |
| 21 | + "strconv" |
| 22 | + "time" |
| 23 | +) |
| 24 | + |
// TOCTarName is the name of the JSON file in the tar archive in the
// table of contents gzip stream.
const TOCTarName = "stargz.index.json"

// FooterSize is the number of bytes in the stargz footer.
//
// The footer is an empty gzip stream with no compression and an Extra
// header of the form "%016xSTARGZ", where the 64 bit hex-encoded
// number is the offset to the gzip stream of JSON TOC.
//
// 47 comes from:
//
//   10 byte gzip header +
//   2 byte (LE16) length of extra, encoding 22 (16 hex digits + len("STARGZ")) == "\x16\x00" +
//   22 bytes of extra (fmt.Sprintf("%016xSTARGZ", tocGzipOffset))
//   5 byte flate header
//   8 byte gzip footer (two little endian uint32s: digest, size)
const FooterSize = 47
| 43 | + |
// A Reader permits random access reads from a stargz file.
type Reader struct {
	sr  *io.SectionReader // the underlying stargz file
	TOC *TOC              // index parsed from the end of the file by Open
}
| 49 | + |
| 50 | +// Open opens a stargz file for reading. |
| 51 | +func Open(sr *io.SectionReader) (*Reader, error) { |
| 52 | + if sr.Size() < FooterSize { |
| 53 | + return nil, fmt.Errorf("stargz size %d is smaller than the stargz footer size", sr.Size()) |
| 54 | + } |
| 55 | + // TODO: read a bigger chunk (1MB?) at once here to hopefully |
| 56 | + // get the TOC + footer in one go. |
| 57 | + var footer [FooterSize]byte |
| 58 | + if _, err := sr.ReadAt(footer[:], sr.Size()-FooterSize); err != nil { |
| 59 | + return nil, fmt.Errorf("error reading footer: %v", err) |
| 60 | + } |
| 61 | + tocOff, ok := parseFooter(footer[:]) |
| 62 | + if !ok { |
| 63 | + return nil, fmt.Errorf("error parsing footer") |
| 64 | + } |
| 65 | + tocTargz := make([]byte, sr.Size()-tocOff-FooterSize) |
| 66 | + if _, err := sr.ReadAt(tocTargz, tocOff); err != nil { |
| 67 | + return nil, fmt.Errorf("error reading %d byte TOC targz: %v", len(tocTargz), err) |
| 68 | + } |
| 69 | + zr, err := gzip.NewReader(bytes.NewReader(tocTargz)) |
| 70 | + if err != nil { |
| 71 | + return nil, fmt.Errorf("malformed TOC gzip header: %v", err) |
| 72 | + } |
| 73 | + zr.Multistream(false) |
| 74 | + tr := tar.NewReader(zr) |
| 75 | + h, err := tr.Next() |
| 76 | + if err != nil { |
| 77 | + return nil, fmt.Errorf("failed to find tar header in TOC gzip stream: %v", err) |
| 78 | + } |
| 79 | + if h.Name != TOCTarName { |
| 80 | + return nil, fmt.Errorf("TOC tar entry had name %q; expected %q", h.Name, TOCTarName) |
| 81 | + } |
| 82 | + toc := new(TOC) |
| 83 | + if err := json.NewDecoder(tr).Decode(&toc); err != nil { |
| 84 | + return nil, fmt.Errorf("error decoding TOC JSON: %v", err) |
| 85 | + } |
| 86 | + return &Reader{sr: sr, TOC: toc}, nil |
| 87 | +} |
| 88 | + |
// TOCEntry is an entry in the stargz file's TOC (Table of Contents).
// It describes one tar entry (or one chunk of a large regular file)
// and where its gzip stream lives in the stargz file.
type TOCEntry struct {
	Offset   int64  `json:"offset,omitempty"`    // offset to gzip stream of tar entry (for regular files only)
	Name     string `json:"name"`                // tar entry path
	Type     string `json:"type"`                // "dir", "reg", "symlink" (set by AppendTar), TODO: more
	Size     int64  `json:"size,omitempty"`      // size in bytes (regular files only)
	LinkName string `json:"linkName,omitempty"`  // for symlinks: the link target
	Mode     int64  `json:"mode,omitempty"`      // Permission and mode bits
	Uid      int    `json:"uid,omitempty"`       // User ID of owner
	Gid      int    `json:"gid,omitempty"`       // Group ID of owner
	Uname    string `json:"userName,omitempty"`  // User name of owner
	Gname    string `json:"groupName,omitempty"` // Group name of owner
	ModTime  string `json:"modtime,omitempty"`   // RFC 3339 UTC modification time; "" if zero/epoch (see formatModtime)

	// ChunkOffset is non-zero if this is a chunk of a large,
	// regular file. If so, the Offset is where the gzip header of
	// ChunkSize bytes at ChunkOffset in Name begin. If both
	// ChunkOffset and ChunkSize are zero, the file contents are
	// completely represented at the tar gzip stream starting at
	// Offset.
	ChunkOffset int64 `json:"chunkOffset,omitempty"`
	ChunkSize   int64 `json:"chunkSize,omitempty"`
}
| 112 | + |
// TOC is the table of contents index of the files in the stargz file.
// It is serialized as JSON into the TOCTarName entry at the end of
// the archive.
type TOC struct {
	Version int        `json:"version"` // TOC format version; NewWriter currently writes 1
	Entries []TOCEntry `json:"entries"` // one entry per tar entry, in archive order
}
| 118 | + |
// A Writer writes stargz files.
//
// Use NewWriter to create a new Writer.
type Writer struct {
	bw  *bufio.Writer // buffers the caller-supplied destination writer
	cw  *countWriter  // wraps bw; cw.n is the current output offset recorded in TOC entries
	toc *TOC          // index accumulated by AppendTar, written by Close

	closed bool         // set by Close; further writes are rejected
	gz     *gzip.Writer // currently open gzip stream, or nil if none
}
| 130 | + |
| 131 | +// NewWriter returns a new stargz writer writing to w. |
| 132 | +// |
| 133 | +// The writer must be closed to write its trailing table of contents. |
| 134 | +func NewWriter(w io.Writer) *Writer { |
| 135 | + bw := bufio.NewWriter(w) |
| 136 | + cw := &countWriter{w: bw} |
| 137 | + return &Writer{ |
| 138 | + bw: bw, |
| 139 | + cw: cw, |
| 140 | + toc: &TOC{Version: 1}, |
| 141 | + } |
| 142 | +} |
| 143 | + |
| 144 | +// Close writes the stargz's table of contents and flushes all the |
| 145 | +// buffers, returning any error. |
| 146 | +func (w *Writer) Close() error { |
| 147 | + if w.closed { |
| 148 | + return nil |
| 149 | + } |
| 150 | + defer func() { w.closed = true }() |
| 151 | + |
| 152 | + if err := w.closeGz(); err != nil { |
| 153 | + return err |
| 154 | + } |
| 155 | + |
| 156 | + // Write the TOC index. |
| 157 | + tocOff := w.cw.n |
| 158 | + w.gz, _ = gzip.NewWriterLevel(w.cw, gzip.BestCompression) |
| 159 | + tw := tar.NewWriter(w.gz) |
| 160 | + tocJSON, err := json.MarshalIndent(w.toc, "", "\t") |
| 161 | + if err != nil { |
| 162 | + return err |
| 163 | + } |
| 164 | + if err := tw.WriteHeader(&tar.Header{ |
| 165 | + Typeflag: tar.TypeReg, |
| 166 | + Name: TOCTarName, |
| 167 | + Size: int64(len(tocJSON)), |
| 168 | + }); err != nil { |
| 169 | + return err |
| 170 | + } |
| 171 | + if _, err := tw.Write(tocJSON); err != nil { |
| 172 | + return err |
| 173 | + } |
| 174 | + |
| 175 | + if err := tw.Close(); err != nil { |
| 176 | + return err |
| 177 | + } |
| 178 | + if err := w.closeGz(); err != nil { |
| 179 | + return err |
| 180 | + } |
| 181 | + |
| 182 | + // And a little footer with pointer to the TOC gzip stream. |
| 183 | + if _, err := w.bw.Write(footerBytes(tocOff)); err != nil { |
| 184 | + return err |
| 185 | + } |
| 186 | + |
| 187 | + if err := w.bw.Flush(); err != nil { |
| 188 | + return err |
| 189 | + } |
| 190 | + |
| 191 | + return nil |
| 192 | +} |
| 193 | + |
| 194 | +func (w *Writer) closeGz() error { |
| 195 | + if w.closed { |
| 196 | + return errors.New("write on closed Writer") |
| 197 | + } |
| 198 | + if w.gz != nil { |
| 199 | + if err := w.gz.Close(); err != nil { |
| 200 | + return err |
| 201 | + } |
| 202 | + w.gz = nil |
| 203 | + } |
| 204 | + return nil |
| 205 | +} |
| 206 | + |
| 207 | +// AppendTar reads the tar or tar.gz file from r and appends |
| 208 | +// each of its contents to w. |
| 209 | +// |
| 210 | +// The input r can optionally be gzip compressed but the output will |
| 211 | +// always be gzip compressed. |
| 212 | +func (w *Writer) AppendTar(r io.Reader) error { |
| 213 | + br := bufio.NewReader(r) |
| 214 | + var tr *tar.Reader |
| 215 | + if isGzip(br) { |
| 216 | + // NewReader can't fail if isGzip returned true. |
| 217 | + zr, _ := gzip.NewReader(br) |
| 218 | + tr = tar.NewReader(zr) |
| 219 | + } else { |
| 220 | + tr = tar.NewReader(br) |
| 221 | + } |
| 222 | + for { |
| 223 | + h, err := tr.Next() |
| 224 | + if err == io.EOF { |
| 225 | + break |
| 226 | + } |
| 227 | + if err != nil { |
| 228 | + return fmt.Errorf("error reading from source tar: tar.Reader.Next: %v", err) |
| 229 | + } |
| 230 | + ent := TOCEntry{ |
| 231 | + Name: h.Name, |
| 232 | + Mode: h.Mode, |
| 233 | + Uid: h.Uid, |
| 234 | + Gid: h.Gid, |
| 235 | + Uname: h.Uname, |
| 236 | + Gname: h.Gname, |
| 237 | + ModTime: formatModtime(h.ModTime), |
| 238 | + } |
| 239 | + switch h.Typeflag { |
| 240 | + case tar.TypeLink: |
| 241 | + return fmt.Errorf("TODO: unsupported hardlink %q => %q", h.Name, h.Linkname) |
| 242 | + case tar.TypeSymlink: |
| 243 | + ent.Type = "symlink" |
| 244 | + ent.LinkName = h.Linkname |
| 245 | + case tar.TypeDir: |
| 246 | + ent.Type = "dir" |
| 247 | + case tar.TypeReg: |
| 248 | + ent.Offset = w.cw.n |
| 249 | + ent.Type = "reg" |
| 250 | + ent.Size = h.Size |
| 251 | + |
| 252 | + // Start a new gzip stream for regular files. |
| 253 | + if err := w.closeGz(); err != nil { |
| 254 | + return err |
| 255 | + } |
| 256 | + default: |
| 257 | + return fmt.Errorf("unsupported input tar entry %q", h.Typeflag) |
| 258 | + } |
| 259 | + w.toc.Entries = append(w.toc.Entries, ent) |
| 260 | + if w.gz == nil { |
| 261 | + w.gz, err = gzip.NewWriterLevel(w.cw, gzip.BestCompression) |
| 262 | + if err != nil { |
| 263 | + return err |
| 264 | + } |
| 265 | + } |
| 266 | + tw := tar.NewWriter(w.gz) |
| 267 | + if err := tw.WriteHeader(h); err != nil { |
| 268 | + return err |
| 269 | + } |
| 270 | + if _, err := io.Copy(tw, tr); err != nil { |
| 271 | + return err |
| 272 | + } |
| 273 | + if err := tw.Flush(); err != nil { |
| 274 | + return err |
| 275 | + } |
| 276 | + } |
| 277 | + return nil |
| 278 | +} |
| 279 | + |
| 280 | +// footerBytes the 47 byte footer. |
| 281 | +func footerBytes(tocOff int64) []byte { |
| 282 | + buf := bytes.NewBuffer(make([]byte, 0, FooterSize)) |
| 283 | + gz, _ := gzip.NewWriterLevel(buf, gzip.NoCompression) |
| 284 | + gz.Header.Extra = []byte(fmt.Sprintf("%016xSTARGZ", tocOff)) |
| 285 | + gz.Close() |
| 286 | + if buf.Len() != FooterSize { |
| 287 | + panic(fmt.Sprintf("footer buffer = %d, not %d", buf.Len(), FooterSize)) |
| 288 | + } |
| 289 | + return buf.Bytes() |
| 290 | +} |
| 291 | + |
| 292 | +func parseFooter(p []byte) (tocOffset int64, ok bool) { |
| 293 | + if len(p) != FooterSize { |
| 294 | + return 0, false |
| 295 | + } |
| 296 | + zr, err := gzip.NewReader(bytes.NewReader(p)) |
| 297 | + if err != nil { |
| 298 | + return 0, false |
| 299 | + } |
| 300 | + extra := zr.Header.Extra |
| 301 | + if len(extra) != 16+len("STARGZ") { |
| 302 | + return 0, false |
| 303 | + } |
| 304 | + if string(extra[16:]) != "STARGZ" { |
| 305 | + return 0, false |
| 306 | + } |
| 307 | + tocOffset, err = strconv.ParseInt(string(extra[:16]), 16, 64) |
| 308 | + return tocOffset, err == nil |
| 309 | +} |
| 310 | + |
| 311 | +func formatModtime(t time.Time) string { |
| 312 | + if t.IsZero() || t.Unix() == 0 { |
| 313 | + return "" |
| 314 | + } |
| 315 | + t = t.UTC() |
| 316 | + if t.Equal(t.Round(time.Second)) { |
| 317 | + return t.UTC().Format(time.RFC3339) |
| 318 | + } |
| 319 | + return t.UTC().Format(time.RFC3339Nano) |
| 320 | +} |
| 321 | + |
| 322 | +// countWriter counts how many bytes have been written to its wrapped |
| 323 | +// io.Writer. |
| 324 | +type countWriter struct { |
| 325 | + w io.Writer |
| 326 | + n int64 |
| 327 | +} |
| 328 | + |
| 329 | +func (cw *countWriter) Write(p []byte) (n int, err error) { |
| 330 | + n, err = cw.w.Write(p) |
| 331 | + cw.n += int64(n) |
| 332 | + return |
| 333 | +} |
| 334 | + |
| 335 | +// isGzip reports whether br is positioned right before an upcoming gzip stream. |
| 336 | +// It does not consume any bytes from br. |
| 337 | +func isGzip(br *bufio.Reader) bool { |
| 338 | + const ( |
| 339 | + gzipID1 = 0x1f |
| 340 | + gzipID2 = 0x8b |
| 341 | + gzipDeflate = 8 |
| 342 | + ) |
| 343 | + peek, _ := br.Peek(3) |
| 344 | + return len(peek) >= 3 && peek[0] == gzipID1 && peek[1] == gzipID2 && peek[2] == gzipDeflate |
| 345 | +} |
0 commit comments