Skip to content

Commit 1f0baa2

Browse files
committed
zstd: x86 assembler implementation of sequenceDecs.decode
Differences from the Go implementation: ml and mo are checked in the main loop; s.seqSize and litRemain are checked at the end.
1 parent 6c9bcdc commit 1f0baa2

File tree

8 files changed

+1396
-144
lines changed

8 files changed

+1396
-144
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,10 @@ _testmain.go
2323
*.test
2424
*.prof
2525
/s2/cmd/_s2sx/sfx-exe
26+
27+
# Linux perf files
28+
perf.data
29+
perf.data.old
30+
31+
# gdb history
32+
.gdb_history

zstd/autogen.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
package zstd
2+
3+
//go:generate go run generate.go
4+
//go:generate asmfmt -w seqdec_amd64.s

zstd/generate.go

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
//go:build ignore
2+
// +build ignore
3+
4+
package main
5+
6+
import (
7+
"log"
8+
"os"
9+
"path"
10+
"text/template"
11+
)
12+
13+
func main() {
14+
mapping := []struct {
15+
template string
16+
output string
17+
}{{
18+
template: "seqdec_amd64.s.in",
19+
output: "seqdec_amd64.s",
20+
},
21+
}
22+
23+
for i := range mapping {
24+
25+
state := make(map[string]string)
26+
27+
funcMap := template.FuncMap{
28+
"var": func(name string) string { return state[name] },
29+
"set": func(name, value string) string {
30+
state[name] = value
31+
return ""
32+
},
33+
}
34+
35+
input := mapping[i].template
36+
output := mapping[i].output
37+
if !shouldRegenerate(input, output) {
38+
log.Printf("%q is up to date", output)
39+
continue
40+
}
41+
42+
tmpl, err := template.New(path.Base(input)).Funcs(funcMap).ParseFiles(input)
43+
die(err)
44+
45+
out, err := os.Create(output)
46+
die(err)
47+
defer out.Close()
48+
49+
log.Printf("Generating %q from %q", output, input)
50+
err = tmpl.Execute(out, nil)
51+
die(err)
52+
}
53+
}
54+
55+
// die aborts the program when err is non-nil and does nothing otherwise.
func die(err error) {
	if err != nil {
		// log.Fatal prints the error and then calls os.Exit(1) itself,
		// so no explicit exit is needed after it.
		log.Fatal(err)
	}
}
61+
62+
// shouldRegenerate reports whether the file at dstpath has to be rebuilt
// from the file at srcpath.
//
// It answers false only when both files can be stat'ed and the source is
// not newer than the destination. A failing stat on either path yields
// true, leaving it to the caller to hit the actual I/O error later.
func shouldRegenerate(srcpath, dstpath string) bool {
	if srcInfo, err := os.Stat(srcpath); err == nil {
		if dstInfo, err := os.Stat(dstpath); err == nil {
			return dstInfo.ModTime().Before(srcInfo.ModTime())
		}
	}
	return true
}

zstd/seqdec.go

Lines changed: 0 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -98,150 +98,6 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
9898
return nil
9999
}
100100

101-
// decode sequences from the stream with the provided history.
102-
func (s *sequenceDecs) decode(seqs []seqVals) error {
103-
br := s.br
104-
105-
// Grab full sizes tables, to avoid bounds checks.
106-
llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
107-
llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
108-
s.seqSize = 0
109-
litRemain := len(s.literals)
110-
maxBlockSize := maxCompressedBlockSize
111-
if s.windowSize < maxBlockSize {
112-
maxBlockSize = s.windowSize
113-
}
114-
for i := range seqs {
115-
var ll, mo, ml int
116-
if br.off > 4+((maxOffsetBits+16+16)>>3) {
117-
// inlined function:
118-
// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
119-
120-
// Final will not read from stream.
121-
var llB, mlB, moB uint8
122-
ll, llB = llState.final()
123-
ml, mlB = mlState.final()
124-
mo, moB = ofState.final()
125-
126-
// extra bits are stored in reverse order.
127-
br.fillFast()
128-
mo += br.getBits(moB)
129-
if s.maxBits > 32 {
130-
br.fillFast()
131-
}
132-
ml += br.getBits(mlB)
133-
ll += br.getBits(llB)
134-
135-
if moB > 1 {
136-
s.prevOffset[2] = s.prevOffset[1]
137-
s.prevOffset[1] = s.prevOffset[0]
138-
s.prevOffset[0] = mo
139-
} else {
140-
// mo = s.adjustOffset(mo, ll, moB)
141-
// Inlined for rather big speedup
142-
if ll == 0 {
143-
// There is an exception though, when current sequence's literals_length = 0.
144-
// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
145-
// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
146-
mo++
147-
}
148-
149-
if mo == 0 {
150-
mo = s.prevOffset[0]
151-
} else {
152-
var temp int
153-
if mo == 3 {
154-
temp = s.prevOffset[0] - 1
155-
} else {
156-
temp = s.prevOffset[mo]
157-
}
158-
159-
if temp == 0 {
160-
// 0 is not valid; input is corrupted; force offset to 1
161-
println("WARNING: temp was 0")
162-
temp = 1
163-
}
164-
165-
if mo != 1 {
166-
s.prevOffset[2] = s.prevOffset[1]
167-
}
168-
s.prevOffset[1] = s.prevOffset[0]
169-
s.prevOffset[0] = temp
170-
mo = temp
171-
}
172-
}
173-
br.fillFast()
174-
} else {
175-
if br.overread() {
176-
if debugDecoder {
177-
printf("reading sequence %d, exceeded available data\n", i)
178-
}
179-
return io.ErrUnexpectedEOF
180-
}
181-
ll, mo, ml = s.next(br, llState, mlState, ofState)
182-
br.fill()
183-
}
184-
185-
if debugSequences {
186-
println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
187-
}
188-
// Evaluate.
189-
// We might be doing this async, so do it early.
190-
if mo == 0 && ml > 0 {
191-
return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
192-
}
193-
if ml > maxMatchLen {
194-
return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
195-
}
196-
s.seqSize += ll + ml
197-
if s.seqSize > maxBlockSize {
198-
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
199-
}
200-
litRemain -= ll
201-
if litRemain < 0 {
202-
return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
203-
}
204-
seqs[i] = seqVals{
205-
ll: ll,
206-
ml: ml,
207-
mo: mo,
208-
}
209-
if i == len(seqs)-1 {
210-
// This is the last sequence, so we shouldn't update state.
211-
break
212-
}
213-
214-
// Manually inlined, ~ 5-20% faster
215-
// Update all 3 states at once. Approx 20% faster.
216-
nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
217-
if nBits == 0 {
218-
llState = llTable[llState.newState()&maxTableMask]
219-
mlState = mlTable[mlState.newState()&maxTableMask]
220-
ofState = ofTable[ofState.newState()&maxTableMask]
221-
} else {
222-
bits := br.get32BitsFast(nBits)
223-
lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
224-
llState = llTable[(llState.newState()+lowBits)&maxTableMask]
225-
226-
lowBits = uint16(bits >> (ofState.nbBits() & 31))
227-
lowBits &= bitMask[mlState.nbBits()&15]
228-
mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
229-
230-
lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
231-
ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
232-
}
233-
}
234-
s.seqSize += litRemain
235-
if s.seqSize > maxBlockSize {
236-
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
237-
}
238-
err := br.close()
239-
if err != nil {
240-
printf("Closing sequences: %v, %+v\n", err, *br)
241-
}
242-
return err
243-
}
244-
245101
// execute will execute the decoded sequence with the provided history.
246102
// The sequence must be evaluated before being sent.
247103
func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {

zstd/seqdec_amd64.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
//go:build amd64 && !appengine && !noasm && gc
2+
// +build amd64,!appengine,!noasm,gc
3+
4+
package zstd
5+
6+
import (
7+
"fmt"
8+
)
9+
10+
type decodeAsmContext struct {
11+
llTable []decSymbol
12+
mlTable []decSymbol
13+
ofTable []decSymbol
14+
llState decSymbol
15+
mlState decSymbol
16+
ofState decSymbol
17+
iteration int
18+
seqs []seqVals
19+
litRemain int
20+
}
21+
22+
// Non-zero codes returned by sequenceDecs_decode_amd64.
const (
	// errorMatchLenOfsMismatch is reported when mo == 0 && ml > 0.
	errorMatchLenOfsMismatch = 1

	// errorMatchLenTooBig is reported when ml > maxMatchLen.
	errorMatchLenTooBig = 2
)
27+
28+
// sequenceDecs_decode_amd64 implements the main loop of sequenceDecs.decode in x86 asm.
29+
//
30+
// Please refer to seqdec_generic.go for the reference implementation.
31+
func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
32+
33+
// decode sequences from the stream without the provided history.
34+
func (s *sequenceDecs) decode(seqs []seqVals) error {
35+
br := s.br
36+
37+
maxBlockSize := maxCompressedBlockSize
38+
if s.windowSize < maxBlockSize {
39+
maxBlockSize = s.windowSize
40+
}
41+
42+
ctx := decodeAsmContext{
43+
llTable: s.litLengths.fse.dt[:maxTablesize],
44+
mlTable: s.matchLengths.fse.dt[:maxTablesize],
45+
ofTable: s.offsets.fse.dt[:maxTablesize],
46+
llState: s.litLengths.state.state,
47+
mlState: s.matchLengths.state.state,
48+
ofState: s.offsets.state.state,
49+
seqs: seqs,
50+
iteration: len(seqs) - 1,
51+
litRemain: len(s.literals),
52+
}
53+
54+
s.seqSize = 0
55+
56+
errCode := sequenceDecs_decode_amd64(s, br, &ctx)
57+
if errCode != 0 {
58+
i := len(s.literals) - ctx.iteration
59+
switch errCode {
60+
case errorMatchLenOfsMismatch:
61+
ml := ctx.seqs[i].ml
62+
return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
63+
64+
case errorMatchLenTooBig:
65+
ml := ctx.seqs[i].ml
66+
return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
67+
}
68+
69+
return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
70+
}
71+
72+
if ctx.litRemain < 0 {
73+
return fmt.Errorf("literal count is too big: total available %d, total requested %d",
74+
len(s.literals), len(s.literals)-ctx.litRemain)
75+
}
76+
77+
s.seqSize += ctx.litRemain
78+
if s.seqSize > maxBlockSize {
79+
return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
80+
}
81+
err := br.close()
82+
if err != nil {
83+
printf("Closing sequences: %v, %+v\n", err, *br)
84+
}
85+
return err
86+
}

0 commit comments

Comments
 (0)