Skip to content

Commit 76d7602

Browse files
rscharfe authored and gitster committed
archive-tar: add internal gzip implementation
Git uses zlib for its own object store, but calls gzip when creating tgz archives. Add an option to perform the gzip compression for the latter using zlib, without depending on the external gzip binary.

Plug it in by making write_block a function pointer and switching to a compressing variant if the filter command has the magic value "git archive gzip". Does that indirection slow down tar creation? Not really, at least not in this test:

    $ hyperfine -w3 -L rev HEAD,origin/main -p 'git checkout {rev} && make' \
      './git -C ../linux archive --format=tar HEAD # {rev}'
    Benchmark #1: ./git -C ../linux archive --format=tar HEAD # HEAD
      Time (mean ± σ): 4.044 s ± 0.007 s [User: 3.901 s, System: 0.137 s]
      Range (min … max): 4.038 s … 4.059 s 10 runs

    Benchmark #2: ./git -C ../linux archive --format=tar HEAD # origin/main
      Time (mean ± σ): 4.047 s ± 0.009 s [User: 3.903 s, System: 0.138 s]
      Range (min … max): 4.038 s … 4.066 s 10 runs

How does tgz creation perform?

    $ hyperfine -w3 -L command 'gzip -cn','git archive gzip' \
      './git -c tar.tgz.command="{command}" -C ../linux archive --format=tgz HEAD'
    Benchmark #1: ./git -c tar.tgz.command="gzip -cn" -C ../linux archive --format=tgz HEAD
      Time (mean ± σ): 20.404 s ± 0.006 s [User: 23.943 s, System: 0.401 s]
      Range (min … max): 20.395 s … 20.414 s 10 runs

    Benchmark #2: ./git -c tar.tgz.command="git archive gzip" -C ../linux archive --format=tgz HEAD
      Time (mean ± σ): 23.807 s ± 0.023 s [User: 23.655 s, System: 0.145 s]
      Range (min … max): 23.782 s … 23.857 s 10 runs

    Summary
      './git -c tar.tgz.command="gzip -cn" -C ../linux archive --format=tgz HEAD' ran
        1.17 ± 0.00 times faster than './git -c tar.tgz.command="git archive gzip" -C ../linux archive --format=tgz HEAD'

So the internal implementation takes 17% longer on the Linux repo, but uses 2% less CPU time. That's because the external gzip can run in parallel on its own processor, while the internal one works sequentially and avoids the inter-process communication overhead.

What are the benefits? Only an internal sequential implementation can offer this eco mode, and it allows avoiding the gzip(1) requirement.

This implementation uses the helper functions from our zlib.c instead of the convenient gz* functions from zlib, because the latter doesn't give the control over the generated gzip header that the next patch requires.

Original-patch-by: Rohit Ashiwal <[email protected]>
Signed-off-by: René Scharfe <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent dfce118 commit 76d7602

File tree

3 files changed

+62
-2
lines changed

3 files changed

+62
-2
lines changed

Documentation/git-archive.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ tar.<format>.command::
148148
to the command (e.g., `-9`).
149149
+
150150
The `tar.gz` and `tgz` formats are defined automatically and use the
151-
command `gzip -cn` by default.
151+
command `gzip -cn` by default. An internal gzip implementation can be
152+
used by specifying the value `git archive gzip`.
152153

153154
tar.<format>.remote::
154155
If true, enable the format for use by remote clients via

archive-tar.c

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,13 @@ static int write_tar_filter_archive(const struct archiver *ar,
3838
#define USTAR_MAX_MTIME 077777777777ULL
3939
#endif
4040

41-
static void write_block(const void *buf)
41+
static void tar_write_block(const void *buf)
4242
{
4343
write_or_die(1, buf, BLOCKSIZE);
4444
}
4545

46+
static void (*write_block)(const void *) = tar_write_block;
47+
4648
/* writes out the whole block, but only if it is full */
4749
static void write_if_needed(void)
4850
{
@@ -430,6 +432,34 @@ static int write_tar_archive(const struct archiver *ar,
430432
return err;
431433
}
432434

435+
static git_zstream gzstream;
436+
static unsigned char outbuf[16384];
437+
438+
static void tgz_deflate(int flush)
439+
{
440+
while (gzstream.avail_in || flush == Z_FINISH) {
441+
int status = git_deflate(&gzstream, flush);
442+
if (!gzstream.avail_out || status == Z_STREAM_END) {
443+
write_or_die(1, outbuf, gzstream.next_out - outbuf);
444+
gzstream.next_out = outbuf;
445+
gzstream.avail_out = sizeof(outbuf);
446+
if (status == Z_STREAM_END)
447+
break;
448+
}
449+
if (status != Z_OK && status != Z_BUF_ERROR)
450+
die(_("deflate error (%d)"), status);
451+
}
452+
}
453+
454+
static void tgz_write_block(const void *data)
455+
{
456+
gzstream.next_in = (void *)data;
457+
gzstream.avail_in = BLOCKSIZE;
458+
tgz_deflate(Z_NO_FLUSH);
459+
}
460+
461+
static const char internal_gzip_command[] = "git archive gzip";
462+
433463
static int write_tar_filter_archive(const struct archiver *ar,
434464
struct archiver_args *args)
435465
{
@@ -440,6 +470,19 @@ static int write_tar_filter_archive(const struct archiver *ar,
440470
if (!ar->filter_command)
441471
BUG("tar-filter archiver called with no filter defined");
442472

473+
if (!strcmp(ar->filter_command, internal_gzip_command)) {
474+
write_block = tgz_write_block;
475+
git_deflate_init_gzip(&gzstream, args->compression_level);
476+
gzstream.next_out = outbuf;
477+
gzstream.avail_out = sizeof(outbuf);
478+
479+
r = write_tar_archive(ar, args);
480+
481+
tgz_deflate(Z_FINISH);
482+
git_deflate_end(&gzstream);
483+
return r;
484+
}
485+
443486
strbuf_addstr(&cmd, ar->filter_command);
444487
if (args->compression_level >= 0)
445488
strbuf_addf(&cmd, " -%d", args->compression_level);

t/t5000-tar-tree.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,22 @@ test_expect_success GZIP 'remote tar.gz can be disabled' '
374374
>remote.tar.gz
375375
'
376376

377+
test_expect_success 'git archive --format=tgz (internal gzip)' '
378+
test_config tar.tgz.command "git archive gzip" &&
379+
git archive --format=tgz HEAD >internal_gzip.tgz
380+
'
381+
382+
test_expect_success 'git archive --format=tar.gz (internal gzip)' '
383+
test_config tar.tar.gz.command "git archive gzip" &&
384+
git archive --format=tar.gz HEAD >internal_gzip.tar.gz &&
385+
test_cmp_bin internal_gzip.tgz internal_gzip.tar.gz
386+
'
387+
388+
test_expect_success GZIP 'extract tgz file (internal gzip)' '
389+
gzip -d -c <internal_gzip.tgz >internal_gzip.tar &&
390+
test_cmp_bin b.tar internal_gzip.tar
391+
'
392+
377393
test_expect_success 'archive and :(glob)' '
378394
git archive -v HEAD -- ":(glob)**/sh" >/dev/null 2>actual &&
379395
cat >expect <<EOF &&

0 commit comments

Comments
 (0)