Skip to content

Commit 6ebbb85

Browse files
authored
zstd: Copy literal in 16 byte blocks when possible (#592)
Also reduces literal overalloc when full allocs are allowed. ``` benchmark old ns/op new ns/op delta BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32 14572 13898 -4.63% BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32 3946 3682 -6.69% BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32 45150 43296 -4.11% BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32 33525 36679 +9.41% BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32 11952 10496 -12.18% BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32 14081 13339 -5.27% BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32 12111 11745 -3.02% BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32 1073 1037 -3.36% BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32 1759 1841 +4.66% BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32 43722 39755 -9.07% BenchmarkDecoder_DecodeAllParallel/html.zst-32 4144 3756 -9.36% BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32 1240 1240 +0.00% BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-32 250426 240012 -4.16% BenchmarkDecoder_DecodeAll/geo.protodata.zst-32 71861 65548 -8.79% BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-32 829878 736934 -11.20% BenchmarkDecoder_DecodeAll/lcet10.txt.zst-32 609402 683505 +12.16% BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-32 231636 189146 -18.34% BenchmarkDecoder_DecodeAll/alice29.txt.zst-32 245022 226451 -7.58% BenchmarkDecoder_DecodeAll/html_x_4.zst-32 229709 216421 -5.78% BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-32 18400 17850 -2.99% BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-32 9682 9801 +1.23% BenchmarkDecoder_DecodeAll/urls.10K.zst-32 924472 796913 -13.80% BenchmarkDecoder_DecodeAll/html.zst-32 77728 66831 -14.02% BenchmarkDecoder_DecodeAll/comp-data.bin.zst-32 7985 7432 -6.93% Benchmark_seqdec_execute/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 130498 106559 -18.34% Benchmark_seqdec_execute/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 136475 121699 -10.83% 
Benchmark_seqdec_execute/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 43119 33598 -22.08% Benchmark_seqdec_execute/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 15723 14472 -7.96% Benchmark_seqdec_execute/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 25968 19734 -24.01% Benchmark_seqdec_execute/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 88906 79506 -10.57% Benchmark_seqdec_execute/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 7385 7269 -1.57% Benchmark_seqdec_execute/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 83133 64295 -22.66% Benchmark_seqdec_execute/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 2899 2881 -0.62% Benchmark_seqdec_execute/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 3951 3961 +0.25% Benchmark_seqdec_execute/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 7063 6809 -3.60% Benchmark_seqdec_execute/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 14045 14050 +0.04% Benchmark_seqdec_execute/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 19679 18611 -5.43% Benchmark_seqdec_execute/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 48841 45545 -6.75% Benchmark_seqdec_decodeSync/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 276464 273620 -1.03% Benchmark_seqdec_decodeSync/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 270905 269049 -0.69% Benchmark_seqdec_decodeSync/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 146061 145878 -0.13% Benchmark_seqdec_decodeSync/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 30686 27367 -10.82% Benchmark_seqdec_decodeSync/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 88493 87167 -1.50% Benchmark_seqdec_decodeSync/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 195326 195764 +0.22% Benchmark_seqdec_decodeSync/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 14081 13925 -1.11% 
Benchmark_seqdec_decodeSync/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 297178 298192 +0.34% Benchmark_seqdec_decodeSync/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 2935 2921 -0.48% Benchmark_seqdec_decodeSync/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 4856 4467 -8.01% Benchmark_seqdec_decodeSync/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 14059 14050 -0.06% Benchmark_seqdec_decodeSync/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 35636 33427 -6.20% Benchmark_seqdec_decodeSync/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 88618 85660 -3.34% Benchmark_seqdec_decodeSync/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 162282 160568 -1.06% ``` `lcet10.txt` doesn't like it, otherwise mostly positive. Streams before/after: ``` BenchmarkDecoderEnwik9-32 1 1288277200 ns/op 776.23 MB/s 59552 B/op 44 allocs/op BenchmarkDecoderEnwik9/multithreaded-writer-32 1 1191034000 ns/op 839.61 MB/s 13993224 B/op 113 allocs/op BenchmarkDecoderSilesia-32 5 209913160 ns/op 1009.69 MB/s 46715 B/op 38 allocs/op BenchmarkDecoderSilesia/multithreaded-writer-32 5 201394480 ns/op 1052.40 MB/s 5129462 B/op 77 allocs/op ```
1 parent fbaccdc commit 6ebbb85

File tree

6 files changed

+381
-112
lines changed

6 files changed

+381
-112
lines changed

zstd/_generate/gen.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ func main() {
6666
safeMem: false,
6767
}
6868
exec.generateProcedure("sequenceDecs_executeSimple_amd64")
69+
exec.safeMem = true
70+
exec.generateProcedure("sequenceDecs_executeSimple_safe_amd64")
6971

7072
decodeSync := decodeSync{}
7173
decodeSync.setBMI2(false)
@@ -1032,7 +1034,11 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle
10321034
TESTQ(ll, ll)
10331035
JZ(LabelRef("check_offset"))
10341036
// TODO: Investigate if it is possible to consistently overallocate literals.
1035-
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
1037+
if e.safeMem {
1038+
e.copyMemoryPrecise("1", c.literals, c.outBase, ll)
1039+
} else {
1040+
e.copyMemoryND("1", c.literals, c.outBase, ll)
1041+
}
10361042
ADDQ(ll, c.literals)
10371043
ADDQ(ll, c.outBase)
10381044
ADDQ(ll, c.outPosition)
@@ -1188,6 +1194,26 @@ func (e executeSimple) copyMemory(suffix string, src, dst, length reg.GPVirtual)
11881194
JHI(LabelRef(label))
11891195
}
11901196

1197+
// copyMemoryND will copy memory in blocks of 16 bytes,
1198+
// reading up to 15 bytes past the end of src and overwriting up to 15 extra bytes in dst.
1199+
// All parameters are preserved.
1200+
func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtual) {
1201+
label := "copy_" + suffix
1202+
1203+
ofs := GP64()
1204+
s := Mem{Base: src, Index: ofs, Scale: 1}
1205+
d := Mem{Base: dst, Index: ofs, Scale: 1}
1206+
1207+
XORQ(ofs, ofs)
1208+
Label(label)
1209+
t := XMM()
1210+
MOVUPS(s, t)
1211+
MOVUPS(t, d)
1212+
ADDQ(U8(16), ofs)
1213+
CMPQ(ofs, length)
1214+
JB(LabelRef(label))
1215+
}
1216+
11911217
// copyMemoryPrecise will copy memory in blocks of 16 bytes,
11921218
// without overwriting nor overreading.
11931219
func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) {

zstd/blockdec.go

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,8 @@ const (
4949
// Maximum possible block size (all Raw+Uncompressed).
5050
maxBlockSize = (1 << 21) - 1
5151

52-
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
53-
maxCompressedLiteralSize = 1 << 18
54-
maxRLELiteralSize = 1 << 20
55-
maxMatchLen = 131074
56-
maxSequences = 0x7f00 + 0xffff
52+
maxMatchLen = 131074
53+
maxSequences = 0x7f00 + 0xffff
5754

5855
// We support slightly less than the reference decoder to be able to
5956
// use ints on 32 bit archs.
@@ -368,14 +365,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
368365
}
369366
if cap(b.literalBuf) < litRegenSize {
370367
if b.lowMem {
371-
b.literalBuf = make([]byte, litRegenSize)
368+
b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
372369
} else {
373-
if litRegenSize > maxCompressedLiteralSize {
374-
// Exceptional
375-
b.literalBuf = make([]byte, litRegenSize)
376-
} else {
377-
b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
378-
}
370+
b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
379371
}
380372
}
381373
literals = b.literalBuf[:litRegenSize]
@@ -405,14 +397,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
405397
// Ensure we have space to store it.
406398
if cap(b.literalBuf) < litRegenSize {
407399
if b.lowMem {
408-
b.literalBuf = make([]byte, 0, litRegenSize)
400+
b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
409401
} else {
410-
b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
402+
b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
411403
}
412404
}
413405
var err error
414406
// Use our out buffer.
415-
huff.MaxDecodedSize = maxCompressedBlockSize
407+
huff.MaxDecodedSize = litRegenSize
416408
if fourStreams {
417409
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
418410
} else {
@@ -437,9 +429,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
437429
// Ensure we have space to store it.
438430
if cap(b.literalBuf) < litRegenSize {
439431
if b.lowMem {
440-
b.literalBuf = make([]byte, 0, litRegenSize)
432+
b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
441433
} else {
442-
b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
434+
b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
443435
}
444436
}
445437
huff := hist.huffTree
@@ -456,7 +448,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
456448
return in, err
457449
}
458450
hist.huffTree = huff
459-
huff.MaxDecodedSize = maxCompressedBlockSize
451+
huff.MaxDecodedSize = litRegenSize
460452
// Use our out buffer.
461453
if fourStreams {
462454
literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -471,6 +463,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
471463
if len(literals) != litRegenSize {
472464
return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
473465
}
466+
// Re-slice from b.literalBuf so literals regains the buffer's full capacity (the decoder output was capped at litRegenSize).
467+
literals = b.literalBuf[:len(literals)]
474468
if debugDecoder {
475469
printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
476470
}

zstd/decoder_test.go

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,12 +1402,7 @@ func benchmarkDecoderWithFile(path string, b *testing.B) {
14021402
if err != nil {
14031403
b.Fatal(err)
14041404
}
1405-
dec, err := NewReader(nil, WithDecoderLowmem(false))
1406-
if err != nil {
1407-
b.Fatal(err)
1408-
}
1409-
defer dec.Close()
1410-
err = dec.Reset(bytes.NewBuffer(data))
1405+
dec, err := NewReader(bytes.NewBuffer(data), WithDecoderLowmem(false), WithDecoderConcurrency(1))
14111406
if err != nil {
14121407
b.Fatal(err)
14131408
}
@@ -1416,19 +1411,69 @@ func benchmarkDecoderWithFile(path string, b *testing.B) {
14161411
b.Fatal(err)
14171412
}
14181413

1419-
b.SetBytes(n)
1420-
b.ReportAllocs()
1421-
b.ResetTimer()
1422-
for i := 0; i < b.N; i++ {
1423-
err = dec.Reset(bytes.NewBuffer(data))
1414+
b.Run("multithreaded-writer", func(b *testing.B) {
1415+
dec, err := NewReader(nil)
14241416
if err != nil {
14251417
b.Fatal(err)
14261418
}
1427-
_, err := io.CopyN(ioutil.Discard, dec, n)
1419+
1420+
b.SetBytes(n)
1421+
b.ReportAllocs()
1422+
b.ResetTimer()
1423+
for i := 0; i < b.N; i++ {
1424+
err = dec.Reset(bytes.NewBuffer(data))
1425+
if err != nil {
1426+
b.Fatal(err)
1427+
}
1428+
_, err := io.CopyN(ioutil.Discard, dec, n)
1429+
if err != nil {
1430+
b.Fatal(err)
1431+
}
1432+
}
1433+
})
1434+
1435+
b.Run("singlethreaded-writer", func(b *testing.B) {
1436+
dec, err := NewReader(nil, WithDecoderConcurrency(1))
14281437
if err != nil {
14291438
b.Fatal(err)
14301439
}
1431-
}
1440+
1441+
b.SetBytes(n)
1442+
b.ReportAllocs()
1443+
b.ResetTimer()
1444+
for i := 0; i < b.N; i++ {
1445+
err = dec.Reset(bytes.NewBuffer(data))
1446+
if err != nil {
1447+
b.Fatal(err)
1448+
}
1449+
_, err := io.CopyN(ioutil.Discard, dec, n)
1450+
if err != nil {
1451+
b.Fatal(err)
1452+
}
1453+
}
1454+
})
1455+
1456+
b.Run("singlethreaded-writerto", func(b *testing.B) {
1457+
dec, err := NewReader(nil, WithDecoderConcurrency(1))
1458+
if err != nil {
1459+
b.Fatal(err)
1460+
}
1461+
1462+
b.SetBytes(n)
1463+
b.ReportAllocs()
1464+
b.ResetTimer()
1465+
for i := 0; i < b.N; i++ {
1466+
err = dec.Reset(bytes.NewBuffer(data))
1467+
if err != nil {
1468+
b.Fatal(err)
1469+
}
1470+
// io.Copy will use io.WriterTo
1471+
_, err := io.Copy(ioutil.Discard, dec)
1472+
if err != nil {
1473+
b.Fatal(err)
1474+
}
1475+
}
1476+
})
14321477
}
14331478

14341479
func BenchmarkDecoderSilesia(b *testing.B) {

zstd/seqdec_amd64.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
6262
if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
6363
useSafe = true
6464
}
65+
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
66+
useSafe = true
67+
}
68+
6569
br := s.br
6670

6771
maxBlockSize := maxCompressedBlockSize
@@ -301,6 +305,10 @@ type executeAsmContext struct {
301305
//go:noescape
302306
func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
303307

308+
// sequenceDecs_executeSimple_safe_amd64 is the same as sequenceDecs_executeSimple_amd64, but with safe memcopies
309+
//go:noescape
310+
func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
311+
304312
// executeSimple handles cases when dictionary is not used.
305313
func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
306314
// Ensure we have enough output size...
@@ -327,8 +335,12 @@ func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
327335
literals: s.literals,
328336
windowSize: s.windowSize,
329337
}
330-
331-
ok := sequenceDecs_executeSimple_amd64(&ctx)
338+
var ok bool
339+
if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
340+
ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
341+
} else {
342+
ok = sequenceDecs_executeSimple_amd64(&ctx)
343+
}
332344
if !ok {
333345
return fmt.Errorf("match offset (%d) bigger than current history (%d)",
334346
seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))

0 commit comments

Comments
 (0)