Skip to content

Commit 2ee598c

Browse files
committed
zstd: Add delta encoding support
This adds support for delta encoding, compatible with the --patch-from option that was introduced in zstd reference v1.4.5: https://github.com/facebook/zstd/wiki/Zstandard-as-a-patching-engine
1 parent 8b191e4 commit 2ee598c

File tree

11 files changed

+163
-34
lines changed

11 files changed

+163
-34
lines changed

zstd/decoder.go

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -341,15 +341,8 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
341341
}
342342
return dst, err
343343
}
344-
if frame.DictionaryID != nil {
345-
dict, ok := d.dicts[*frame.DictionaryID]
346-
if !ok {
347-
return nil, ErrUnknownDictionary
348-
}
349-
if debugDecoder {
350-
println("setting dict", frame.DictionaryID)
351-
}
352-
frame.history.setDict(&dict)
344+
if err = d.setDict(frame); err != nil {
345+
return nil, err
353346
}
354347
if frame.WindowSize > d.o.maxWindowSize {
355348
if debugDecoder {
@@ -495,18 +488,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
495488
if !d.syncStream.inFrame {
496489
d.frame.history.reset()
497490
d.current.err = d.frame.reset(&d.syncStream.br)
491+
if d.current.err == nil {
492+
d.current.err = d.setDict(d.frame)
493+
}
498494
if d.current.err != nil {
499495
return false
500496
}
501-
if d.frame.DictionaryID != nil {
502-
dict, ok := d.dicts[*d.frame.DictionaryID]
503-
if !ok {
504-
d.current.err = ErrUnknownDictionary
505-
return false
506-
} else {
507-
d.frame.history.setDict(&dict)
508-
}
509-
}
510497
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
511498
d.current.err = ErrDecoderSizeExceeded
512499
return false
@@ -865,13 +852,8 @@ decodeStream:
865852
if debugDecoder && err != nil {
866853
println("Frame decoder returned", err)
867854
}
868-
if err == nil && frame.DictionaryID != nil {
869-
dict, ok := d.dicts[*frame.DictionaryID]
870-
if !ok {
871-
err = ErrUnknownDictionary
872-
} else {
873-
frame.history.setDict(&dict)
874-
}
855+
if err == nil {
856+
err = d.setDict(frame)
875857
}
876858
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
877859
if debugDecoder {
@@ -953,3 +935,20 @@ decodeStream:
953935
hist.reset()
954936
d.frame.history.b = frameHistCache
955937
}
938+
939+
func (d *Decoder) setDict(frame *frameDec) (err error) {
940+
dict, ok := d.dicts[frame.DictionaryID]
941+
if ok {
942+
if debugDecoder {
943+
println("setting dict", frame.DictionaryID)
944+
}
945+
frame.history.setDict(&dict)
946+
} else if frame.DictionaryID != 0 {
947+
// A zero or missing dictionary id is ambiguous:
948+
// either dictionary zero, or no dictionary. In particular,
949+
// zstd --patch-from uses this id for the source file,
950+
// so only return an error if the dictionary id is not zero.
951+
err = ErrUnknownDictionary
952+
}
953+
return err
954+
}

zstd/decoder_options.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ package zstd
66

77
import (
88
"errors"
9+
"fmt"
10+
"math/bits"
911
"runtime"
1012
)
1113

@@ -85,7 +87,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
8587
}
8688

8789
// WithDecoderDicts allows to register one or more dictionaries for the decoder.
88-
// If several dictionaries with the same ID is provided the last one will be used.
90+
//
91+
// Each slice in dict must be in the [dictionary format] produced by
92+
// "zstd --train" from the Zstandard reference implementation.
93+
//
94+
// If several dictionaries with the same ID are provided, the last one will be used.
95+
//
96+
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
8997
func WithDecoderDicts(dicts ...[]byte) DOption {
9098
return func(o *decoderOptions) error {
9199
for _, b := range dicts {
@@ -99,6 +107,18 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
99107
}
100108
}
101109

110+
// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
111+
// The slice content can be arbitrary data.
112+
func WithDecoderDictRaw(id uint32, content []byte) DOption {
113+
return func(o *decoderOptions) error {
114+
if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
115+
return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
116+
}
117+
o.dicts = append(o.dicts, dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
118+
return nil
119+
}
120+
}
121+
102122
// WithDecoderMaxWindow allows to set a maximum window size for decodes.
103123
// This allows rejecting packets that will cause big memory usage.
104124
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.

zstd/dict.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ type dict struct {
2121

2222
const dictMagic = "\x37\xa4\x30\xec"
2323

24+
// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
25+
const dictMaxLength = 1 << 31
26+
2427
// ID returns the dictionary id or 0 if d is nil.
2528
func (d *dict) ID() uint32 {
2629
if d == nil {

zstd/dict_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,3 +459,38 @@ func readDicts(tb testing.TB, zr *zip.Reader) [][]byte {
459459
}
460460
return dicts
461461
}
462+
463+
// Test decoding of zstd --patch-from output.
464+
func TestDecoderRawDict(t *testing.T) {
465+
t.Parallel()
466+
467+
dict, err := os.ReadFile("testdata/delta/source.txt")
468+
if err != nil {
469+
t.Fatal(err)
470+
}
471+
472+
delta, err := os.Open("testdata/delta/target.txt.zst")
473+
if err != nil {
474+
t.Fatal(err)
475+
}
476+
defer delta.Close()
477+
478+
dec, err := NewReader(delta, WithDecoderDictRaw(0, dict))
479+
if err != nil {
480+
t.Fatal(err)
481+
}
482+
483+
out, err := io.ReadAll(dec)
484+
if err != nil {
485+
t.Fatal(err)
486+
}
487+
488+
ref, err := os.ReadFile("testdata/delta/target.txt")
489+
if err != nil {
490+
t.Fatal(err)
491+
}
492+
493+
if !bytes.Equal(out, ref) {
494+
t.Errorf("mismatch: got %q, wanted %q", out, ref)
495+
}
496+
}

zstd/encoder_options.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"errors"
55
"fmt"
66
"math"
7+
"math/bits"
78
"runtime"
89
"strings"
910
)
@@ -305,7 +306,13 @@ func WithLowerEncoderMem(b bool) EOption {
305306
}
306307

307308
// WithEncoderDict allows to register a dictionary that will be used for the encode.
309+
//
310+
// The slice dict must be in the [dictionary format] produced by
311+
// "zstd --train" from the Zstandard reference implementation.
312+
//
308313
// The encoder *may* choose to use no dictionary instead for certain payloads.
314+
//
315+
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
309316
func WithEncoderDict(dict []byte) EOption {
310317
return func(o *encoderOptions) error {
311318
d, err := loadDict(dict)
@@ -316,3 +323,17 @@ func WithEncoderDict(dict []byte) EOption {
316323
return nil
317324
}
318325
}
326+
327+
// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
328+
//
329+
// The slice content may contain arbitrary data. It will be used as an initial
330+
// history.
331+
func WithEncoderDictRaw(id uint32, content []byte) EOption {
332+
return func(o *encoderOptions) error {
333+
if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
334+
return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
335+
}
336+
o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}
337+
return nil
338+
}
339+
}

zstd/example_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package zstd_test
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
7+
"github.com/klauspost/compress/zstd"
8+
)
9+
10+
func ExampleWithEncoderDictRaw() {
11+
// "Raw" dictionaries can be used for compressed delta encoding.
12+
13+
source := []byte(`
14+
This is the source file. Compression of the target file with
15+
the source file as the dictionary will produce a compressed
16+
delta encoding of the target file.`)
17+
target := []byte(`
18+
This is the target file. Decompression of the delta encoding with
19+
the source file as the dictionary will produce this file.`)
20+
21+
// The dictionary id is arbitrary. We use zero for compatibility
22+
// with zstd --patch-from, but applications can use any id
23+
// not in the range [32768, 1<<31).
24+
const id = 0
25+
26+
bestLevel := zstd.WithEncoderLevel(zstd.SpeedBestCompression)
27+
28+
w, _ := zstd.NewWriter(nil, bestLevel,
29+
zstd.WithEncoderDictRaw(id, source))
30+
delta := w.EncodeAll(target, nil)
31+
32+
r, _ := zstd.NewReader(nil, zstd.WithDecoderDictRaw(id, source))
33+
out, err := r.DecodeAll(delta, nil)
34+
if err != nil || !bytes.Equal(out, target) {
35+
panic("decoding error")
36+
}
37+
38+
// Ordinary compression, for reference.
39+
w, _ = zstd.NewWriter(nil, bestLevel)
40+
compressed := w.EncodeAll(target, nil)
41+
42+
// Check that the delta is at most half as big as the compressed file.
43+
fmt.Println(len(delta) < len(compressed)/2)
44+
// Output:
45+
// true
46+
}

zstd/framedec.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ type frameDec struct {
2929

3030
FrameContentSize uint64
3131

32-
DictionaryID *uint32
32+
DictionaryID uint32
3333
HasCheckSum bool
3434
SingleSegment bool
3535
}
@@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {
155155

156156
// Read Dictionary_ID
157157
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
158-
d.DictionaryID = nil
158+
d.DictionaryID = 0
159159
if size := fhd & 3; size != 0 {
160160
if size == 3 {
161161
size = 4
@@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
178178
if debugDecoder {
179179
println("Dict size", size, "ID:", id)
180180
}
181-
if id > 0 {
182-
// ID 0 means "sorry, no dictionary anyway".
183-
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
184-
d.DictionaryID = &id
185-
}
181+
d.DictionaryID = id
186182
}
187183

188184
// Read Frame_Content_Size

zstd/testdata/delta/source.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
0000000000000000
2+
3+
This file is to be used as the dictionary for compressing target.txt:
4+
5+
zstd -19 --patch-from=source.txt target.txt

zstd/testdata/delta/target.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
0000000000000000
2+
3+
This file is to be compressed with source.txt as the dictionary:
4+
5+
zstd -19 --patch-from=source.txt target.txt

zstd/testdata/delta/target.txt.zst

39 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)