Skip to content

Commit 60b19fa

Browse files
authored
flate: Improve decompression speed 5-10% (#483)
* flate: Improve decompression speed 5-10%

```
benchmark                             old ns/op     new ns/op     delta
BenchmarkDecodeDigitsSpeed1e4-32      49461         44204         -10.63%
BenchmarkDecodeDigitsSpeed1e5-32      520488        509001        -2.21%
BenchmarkDecodeDigitsSpeed1e6-32      5152811       5000738       -2.95%
BenchmarkDecodeDigitsDefault1e4-32    50983         47693         -6.45%
BenchmarkDecodeDigitsDefault1e5-32    494800        488243        -1.33%
BenchmarkDecodeDigitsDefault1e6-32    4990322       4752297       -4.77%
BenchmarkDecodeDigitsCompress1e4-32   49973         43992         -11.97%
BenchmarkDecodeDigitsCompress1e5-32   515033        467616        -9.21%
BenchmarkDecodeDigitsCompress1e6-32   5128402       4659296       -9.15%
BenchmarkDecodeTwainSpeed1e4-32       51740         48324         -6.60%
BenchmarkDecodeTwainSpeed1e5-32       532690        513209        -3.66%
BenchmarkDecodeTwainSpeed1e6-32       5304535       5129081       -3.31%
BenchmarkDecodeTwainDefault1e4-32     50613         48007         -5.15%
BenchmarkDecodeTwainDefault1e5-32     488404        476945        -2.35%
BenchmarkDecodeTwainDefault1e6-32     4881062       4710812       -3.49%
BenchmarkDecodeTwainCompress1e4-32    49583         45632         -7.97%
BenchmarkDecodeTwainCompress1e5-32    458843        445645        -2.88%
BenchmarkDecodeTwainCompress1e6-32    4544787       4392530       -3.35%
BenchmarkDecodeRandomSpeed1e4-32      298           305           +2.21%
BenchmarkDecodeRandomSpeed1e5-32      1909          1909          +0.00%
BenchmarkDecodeRandomSpeed1e6-32      19987         19809         -0.89%

benchmark                             old MB/s      new MB/s      speedup
BenchmarkDecodeDigitsSpeed1e4-32      202.18        226.23        1.12x
BenchmarkDecodeDigitsSpeed1e5-32      192.13        196.46        1.02x
BenchmarkDecodeDigitsSpeed1e6-32      194.07        199.97        1.03x
BenchmarkDecodeDigitsDefault1e4-32    196.15        209.68        1.07x
BenchmarkDecodeDigitsDefault1e5-32    202.10        204.82        1.01x
BenchmarkDecodeDigitsDefault1e6-32    200.39        210.42        1.05x
BenchmarkDecodeDigitsCompress1e4-32   200.11        227.31        1.14x
BenchmarkDecodeDigitsCompress1e5-32   194.16        213.85        1.10x
BenchmarkDecodeDigitsCompress1e6-32   194.99        214.62        1.10x
BenchmarkDecodeTwainSpeed1e4-32       193.27        206.94        1.07x
BenchmarkDecodeTwainSpeed1e5-32       187.73        194.85        1.04x
BenchmarkDecodeTwainSpeed1e6-32       188.52        194.97        1.03x
BenchmarkDecodeTwainDefault1e4-32     197.58        208.30        1.05x
BenchmarkDecodeTwainDefault1e5-32     204.75        209.67        1.02x
BenchmarkDecodeTwainDefault1e6-32     204.87        212.28        1.04x
BenchmarkDecodeTwainCompress1e4-32    201.68        219.14        1.09x
BenchmarkDecodeTwainCompress1e5-32    217.94        224.39        1.03x
BenchmarkDecodeTwainCompress1e6-32    220.03        227.66        1.03x
BenchmarkDecodeRandomSpeed1e4-32      33551.69      32828.68      0.98x
BenchmarkDecodeRandomSpeed1e5-32      52391.84      52395.57      1.00x
BenchmarkDecodeRandomSpeed1e6-32      50031.69      50482.80      1.01x
```
1 parent 61f58c1 commit 60b19fa

File tree

3 files changed

+536
-455
lines changed

3 files changed

+536
-455
lines changed

flate/_gen/gen_inflate.go

Lines changed: 60 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
//go:build generate
22
// +build generate
33

4-
//go:generate go run $GOFILE && gofmt -w ../inflate_gen.go
4+
//go:generate go run $GOFILE
5+
//go:generate go fmt ../inflate_gen.go
56

67
package main
78

@@ -16,9 +17,9 @@ func main() {
1617
panic(err)
1718
}
1819
defer f.Close()
19-
types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
20-
names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
21-
imports := []string{"bytes", "bufio", "io", "strings", "math/bits"}
20+
types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader", "Reader"}
21+
names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader", "GenericReader"}
22+
imports := []string{"bytes", "bufio", "fmt", "strings", "math/bits"}
2223
f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.
2324
2425
package flate
@@ -44,6 +45,11 @@ func (f *decompressor) $FUNCNAME$() {
4445
)
4546
fr := f.r.($TYPE$)
4647
48+
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
49+
// but is smart enough to keep local variables in registers, so use fnb and fb,
50+
// inline call to moreBits and reassign fb,fnb back to f on return.
51+
fnb, fb := f.nb, f.b
52+
4753
switch f.stepState {
4854
case stateInit:
4955
goto readLiteral
@@ -62,41 +68,35 @@ readLiteral:
6268
// cases, the chunks slice will be 0 for the invalid sequence, leading it
6369
// to satisfy the n == 0 check below.
6470
n := uint(f.hl.maxRead)
65-
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
66-
// but is smart enough to keep local variables in registers, so use nb and b,
67-
// inline call to moreBits and reassign b,nb back to f on return.
68-
nb, b := f.nb, f.b
6971
for {
70-
for nb < n {
72+
for fnb < n {
7173
c, err := fr.ReadByte()
7274
if err != nil {
73-
f.b = b
74-
f.nb = nb
75+
f.b, f.nb = fb, fnb
7576
f.err = noEOF(err)
7677
return
7778
}
7879
f.roffset++
79-
b |= uint32(c) << (nb & regSizeMaskUint32)
80-
nb += 8
80+
fb |= uint32(c) << (fnb & regSizeMaskUint32)
81+
fnb += 8
8182
}
82-
chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
83+
chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
8384
n = uint(chunk & huffmanCountMask)
8485
if n > huffmanChunkBits {
85-
chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
86+
chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
8687
n = uint(chunk & huffmanCountMask)
8788
}
88-
if n <= nb {
89+
if n <= fnb {
8990
if n == 0 {
90-
f.b = b
91-
f.nb = nb
91+
f.b, f.nb = fb, fnb
9292
if debugDecode {
9393
fmt.Println("huffsym: n==0")
9494
}
9595
f.err = CorruptInputError(f.roffset)
9696
return
9797
}
98-
f.b = b >> (n & regSizeMaskUint32)
99-
f.nb = nb - n
98+
fb = fb >> (n & regSizeMaskUint32)
99+
fnb = fnb - n
100100
v = int(chunk >> huffmanValueShift)
101101
break
102102
}
@@ -111,10 +111,12 @@ readLiteral:
111111
f.toRead = f.dict.readFlush()
112112
f.step = (*decompressor).$FUNCNAME$
113113
f.stepState = stateInit
114+
f.b, f.nb = fb, fnb
114115
return
115116
}
116117
goto readLiteral
117118
case v == 256:
119+
f.b, f.nb = fb, fnb
118120
f.finishBlock()
119121
return
120122
// otherwise, reference to older data
@@ -124,48 +126,51 @@ readLiteral:
124126
val := decCodeToLen[(v - 257)]
125127
length = int(val.length) + 3
126128
n := uint(val.extra)
127-
for f.nb < n {
129+
for fnb < n {
128130
c, err := fr.ReadByte()
129131
if err != nil {
132+
f.b, f.nb = fb, fnb
130133
if debugDecode {
131134
fmt.Println("morebits n>0:", err)
132135
}
133136
f.err = err
134137
return
135138
}
136139
f.roffset++
137-
f.b |= uint32(c) << f.nb
138-
f.nb += 8
140+
fb |= uint32(c) << (fnb&regSizeMaskUint32)
141+
fnb += 8
139142
}
140-
length += int(f.b & uint32(1<<(n&regSizeMaskUint32)-1))
141-
f.b >>= n & regSizeMaskUint32
142-
f.nb -= n
143+
length += int(fb & bitMask32[n])
144+
fb >>= n & regSizeMaskUint32
145+
fnb -= n
143146
default:
144147
if debugDecode {
145148
fmt.Println(v, ">= maxNumLit")
146149
}
147150
f.err = CorruptInputError(f.roffset)
151+
f.b, f.nb = fb, fnb
148152
return
149153
}
150154
151155
var dist uint32
152156
if f.hd == nil {
153-
for f.nb < 5 {
157+
for fnb < 5 {
154158
c, err := fr.ReadByte()
155159
if err != nil {
160+
f.b, f.nb = fb, fnb
156161
if debugDecode {
157162
fmt.Println("morebits f.nb<5:", err)
158163
}
159164
f.err = err
160165
return
161166
}
162167
f.roffset++
163-
f.b |= uint32(c) << f.nb
164-
f.nb += 8
168+
fb |= uint32(c) << (fnb&regSizeMaskUint32)
169+
fnb += 8
165170
}
166-
dist = uint32(bits.Reverse8(uint8(f.b & 0x1F << 3)))
167-
f.b >>= 5
168-
f.nb -= 5
171+
dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
172+
fb >>= 5
173+
fnb -= 5
169174
} else {
170175
// Since a huffmanDecoder can be empty or be composed of a degenerate tree
171176
// with single element, huffSym must error on these two edge cases. In both
@@ -175,38 +180,35 @@ readLiteral:
175180
// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
176181
// but is smart enough to keep local variables in registers, so use fnb and fb,
177182
// inline call to moreBits and reassign fb,fnb back to f on return.
178-
nb, b := f.nb, f.b
179183
for {
180-
for nb < n {
184+
for fnb < n {
181185
c, err := fr.ReadByte()
182186
if err != nil {
183-
f.b = b
184-
f.nb = nb
187+
f.b, f.nb = fb, fnb
185188
f.err = noEOF(err)
186189
return
187190
}
188191
f.roffset++
189-
b |= uint32(c) << (nb & regSizeMaskUint32)
190-
nb += 8
192+
fb |= uint32(c) << (fnb & regSizeMaskUint32)
193+
fnb += 8
191194
}
192-
chunk := f.hd.chunks[b&(huffmanNumChunks-1)]
195+
chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
193196
n = uint(chunk & huffmanCountMask)
194197
if n > huffmanChunkBits {
195-
chunk = f.hd.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hd.linkMask]
198+
chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
196199
n = uint(chunk & huffmanCountMask)
197200
}
198-
if n <= nb {
201+
if n <= fnb {
199202
if n == 0 {
200-
f.b = b
201-
f.nb = nb
203+
f.b, f.nb = fb, fnb
202204
if debugDecode {
203205
fmt.Println("huffsym: n==0")
204206
}
205207
f.err = CorruptInputError(f.roffset)
206208
return
207209
}
208-
f.b = b >> (n & regSizeMaskUint32)
209-
f.nb = nb - n
210+
fb = fb >> (n & regSizeMaskUint32)
211+
fnb = fnb - n
210212
dist = uint32(chunk >> huffmanValueShift)
211213
break
212214
}
@@ -220,24 +222,27 @@ readLiteral:
220222
nb := uint(dist-2) >> 1
221223
// have 1 bit in bottom of dist, need nb more.
222224
extra := (dist & 1) << (nb & regSizeMaskUint32)
223-
for f.nb < nb {
225+
for fnb < nb {
224226
c, err := fr.ReadByte()
225227
if err != nil {
228+
f.b, f.nb = fb, fnb
226229
if debugDecode {
227230
fmt.Println("morebits f.nb<nb:", err)
228231
}
229232
f.err = err
230233
return
231234
}
232235
f.roffset++
233-
f.b |= uint32(c) << f.nb
234-
f.nb += 8
236+
fb |= uint32(c) << (fnb&regSizeMaskUint32)
237+
fnb += 8
235238
}
236-
extra |= f.b & uint32(1<<(nb&regSizeMaskUint32)-1)
237-
f.b >>= nb & regSizeMaskUint32
238-
f.nb -= nb
239+
extra |= fb & bitMask32[nb]
240+
fb >>= nb & regSizeMaskUint32
241+
fnb -= nb
239242
dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
243+
// slower: dist = bitMask32[nb+1] + 2 + extra
240244
default:
245+
f.b, f.nb = fb, fnb
241246
if debugDecode {
242247
fmt.Println("dist too big:", dist, maxNumDist)
243248
}
@@ -247,6 +252,7 @@ readLiteral:
247252
248253
// No check on length; encoding can be prescient.
249254
if dist > uint32(f.dict.histSize()) {
255+
f.b, f.nb = fb, fnb
250256
if debugDecode {
251257
fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
252258
}
@@ -271,10 +277,12 @@ copyHistory:
271277
f.toRead = f.dict.readFlush()
272278
f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
273279
f.stepState = stateDict
280+
f.b, f.nb = fb, fnb
274281
return
275282
}
276283
goto readLiteral
277284
}
285+
// Not reached
278286
}
279287
280288
`
@@ -290,6 +298,6 @@ copyHistory:
290298
f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
291299
}
292300
f.WriteString("\t\tdefault:\n")
293-
f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
301+
f.WriteString("\t\t\treturn f.huffmanGenericReader\n")
294302
f.WriteString("\t}\n}\n")
295303
}

0 commit comments

Comments
 (0)