Skip to content

Commit 13ca912

Browse files
committed
zstd: Add delta encoding support
This adds support for delta encoding, compatible with the --patch-from option that was introduced in zstd reference v1.4.5: https://github.com/facebook/zstd/wiki/Zstandard-as-a-patching-engine
1 parent 8b191e4 commit 13ca912

File tree

10 files changed

+151
-34
lines changed

10 files changed

+151
-34
lines changed

zstd/decoder.go

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -341,15 +341,8 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
341341
}
342342
return dst, err
343343
}
344-
if frame.DictionaryID != nil {
345-
dict, ok := d.dicts[*frame.DictionaryID]
346-
if !ok {
347-
return nil, ErrUnknownDictionary
348-
}
349-
if debugDecoder {
350-
println("setting dict", frame.DictionaryID)
351-
}
352-
frame.history.setDict(&dict)
344+
if err = d.setDict(frame); err != nil {
345+
return nil, err
353346
}
354347
if frame.WindowSize > d.o.maxWindowSize {
355348
if debugDecoder {
@@ -495,18 +488,12 @@ func (d *Decoder) nextBlockSync() (ok bool) {
495488
if !d.syncStream.inFrame {
496489
d.frame.history.reset()
497490
d.current.err = d.frame.reset(&d.syncStream.br)
491+
if d.current.err == nil {
492+
d.current.err = d.setDict(d.frame)
493+
}
498494
if d.current.err != nil {
499495
return false
500496
}
501-
if d.frame.DictionaryID != nil {
502-
dict, ok := d.dicts[*d.frame.DictionaryID]
503-
if !ok {
504-
d.current.err = ErrUnknownDictionary
505-
return false
506-
} else {
507-
d.frame.history.setDict(&dict)
508-
}
509-
}
510497
if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
511498
d.current.err = ErrDecoderSizeExceeded
512499
return false
@@ -865,13 +852,8 @@ decodeStream:
865852
if debugDecoder && err != nil {
866853
println("Frame decoder returned", err)
867854
}
868-
if err == nil && frame.DictionaryID != nil {
869-
dict, ok := d.dicts[*frame.DictionaryID]
870-
if !ok {
871-
err = ErrUnknownDictionary
872-
} else {
873-
frame.history.setDict(&dict)
874-
}
855+
if err == nil {
856+
err = d.setDict(frame)
875857
}
876858
if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
877859
if debugDecoder {
@@ -953,3 +935,20 @@ decodeStream:
953935
hist.reset()
954936
d.frame.history.b = frameHistCache
955937
}
938+
939+
func (d *Decoder) setDict(frame *frameDec) (err error) {
940+
dict, ok := d.dicts[frame.DictionaryID]
941+
if ok {
942+
if debugDecoder {
943+
println("setting dict", frame.DictionaryID)
944+
}
945+
frame.history.setDict(&dict)
946+
} else if frame.DictionaryID != 0 {
947+
// A zero or missing dictionary id is ambiguous:
948+
// either dictionary zero, or no dictionary. In particular,
949+
// zstd --patch-from uses this id for the source file,
950+
// so only return an error if the dictionary id is not zero.
951+
err = ErrUnknownDictionary
952+
}
953+
return err
954+
}

zstd/decoder_options.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,13 @@ func WithDecoderMaxMemory(n uint64) DOption {
8585
}
8686

8787
// WithDecoderDicts registers one or more dictionaries for the decoder.
88-
// If several dictionaries with the same ID is provided the last one will be used.
88+
//
89+
// Each slice in dict must be in the [dictionary format] produced by
90+
// "zstd --train" from the Zstandard reference implementation.
91+
//
92+
// If several dictionaries with the same ID are provided, the last one will be used.
93+
//
94+
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
8995
func WithDecoderDicts(dicts ...[]byte) DOption {
9096
return func(o *decoderOptions) error {
9197
for _, b := range dicts {
@@ -99,6 +105,15 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
99105
}
100106
}
101107

108+
// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
109+
// The slice content can be arbitrary data.
110+
func WithDecoderDictRaw(id uint32, content []byte) DOption {
111+
return func(o *decoderOptions) error {
112+
o.dicts = append(o.dicts, dict{id: id, content: content})
113+
return nil
114+
}
115+
}
116+
102117
// WithDecoderMaxWindow allows to set a maximum window size for decodes.
103118
// This allows rejecting packets that will cause big memory usage.
104119
// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.

zstd/dict_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,3 +459,38 @@ func readDicts(tb testing.TB, zr *zip.Reader) [][]byte {
459459
}
460460
return dicts
461461
}
462+
463+
// Test decoding of zstd --patch-from output.
464+
func TestDecoderRawDict(t *testing.T) {
465+
t.Parallel()
466+
467+
dict, err := os.ReadFile("testdata/delta/source.txt")
468+
if err != nil {
469+
t.Fatal(err)
470+
}
471+
472+
delta, err := os.Open("testdata/delta/target.txt.zst")
473+
if err != nil {
474+
t.Fatal(err)
475+
}
476+
defer delta.Close()
477+
478+
dec, err := NewReader(delta, WithDecoderDictRaw(0, dict))
479+
if err != nil {
480+
t.Fatal(err)
481+
}
482+
483+
out, err := io.ReadAll(dec)
484+
if err != nil {
485+
t.Fatal(err)
486+
}
487+
488+
ref, err := os.ReadFile("testdata/delta/target.txt")
489+
if err != nil {
490+
t.Fatal(err)
491+
}
492+
493+
if !bytes.Equal(out, ref) {
494+
t.Errorf("mismatch: got %q, wanted %q", out, ref)
495+
}
496+
}

zstd/encoder_options.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,13 @@ func WithLowerEncoderMem(b bool) EOption {
305305
}
306306

307307
// WithEncoderDict registers a dictionary that will be used for encoding.
308+
//
309+
// The slice dict must be in the [dictionary format] produced by
310+
// "zstd --train" from the Zstandard reference implementation.
311+
//
308312
// The encoder *may* choose to use no dictionary instead for certain payloads.
313+
//
314+
// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
309315
func WithEncoderDict(dict []byte) EOption {
310316
return func(o *encoderOptions) error {
311317
d, err := loadDict(dict)
@@ -316,3 +322,14 @@ func WithEncoderDict(dict []byte) EOption {
316322
return nil
317323
}
318324
}
325+
326+
// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
327+
//
328+
// The slice content may contain arbitrary data. It will be used as an initial
329+
// history.
330+
func WithEncoderDictRaw(id uint32, content []byte) EOption {
331+
return func(o *encoderOptions) error {
332+
o.dict = &dict{id: id, content: content}
333+
return nil
334+
}
335+
}

zstd/example_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package zstd_test
2+
3+
import (
4+
"bytes"
5+
"fmt"
6+
7+
"github.com/klauspost/compress/zstd"
8+
)
9+
10+
func ExampleWithEncoderDictRaw() {
11+
// "Raw" dictionaries can be used for compressed delta encoding.
12+
13+
source := []byte(`
14+
This is the source file. Compression of the target file with
15+
the source file as the dictionary will produce a compressed
16+
delta encoding of the target file.`)
17+
target := []byte(`
18+
This is the target file. Decompression of the delta encoding with
19+
the source file as the dictionary will produce this file.`)
20+
21+
// The dictionary id is arbitrary. We use zero for compatibility
22+
// with zstd --patch-from, but applications can use any id
23+
// not in the range [32768, 1<<31).
24+
const id = 0
25+
26+
bestLevel := zstd.WithEncoderLevel(zstd.SpeedBestCompression)
27+
28+
w, _ := zstd.NewWriter(nil, bestLevel,
29+
zstd.WithEncoderDictRaw(id, source))
30+
delta := w.EncodeAll(target, nil)
31+
32+
r, _ := zstd.NewReader(nil, zstd.WithDecoderDictRaw(id, source))
33+
out, err := r.DecodeAll(delta, nil)
34+
if err != nil || !bytes.Equal(out, target) {
35+
panic("decoding error")
36+
}
37+
38+
// Ordinary compression, for reference.
39+
w, _ = zstd.NewWriter(nil, bestLevel)
40+
compressed := w.EncodeAll(target, nil)
41+
42+
// Check that the delta is less than half the size of the compressed file.
43+
fmt.Println(len(delta) < len(compressed)/2)
44+
// Output:
45+
// true
46+
}

zstd/framedec.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ type frameDec struct {
2929

3030
FrameContentSize uint64
3131

32-
DictionaryID *uint32
32+
DictionaryID uint32
3333
HasCheckSum bool
3434
SingleSegment bool
3535
}
@@ -155,7 +155,7 @@ func (d *frameDec) reset(br byteBuffer) error {
155155

156156
// Read Dictionary_ID
157157
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
158-
d.DictionaryID = nil
158+
d.DictionaryID = 0
159159
if size := fhd & 3; size != 0 {
160160
if size == 3 {
161161
size = 4
@@ -178,11 +178,7 @@ func (d *frameDec) reset(br byteBuffer) error {
178178
if debugDecoder {
179179
println("Dict size", size, "ID:", id)
180180
}
181-
if id > 0 {
182-
// ID 0 means "sorry, no dictionary anyway".
183-
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
184-
d.DictionaryID = &id
185-
}
181+
d.DictionaryID = id
186182
}
187183

188184
// Read Frame_Content_Size

zstd/testdata/delta/source.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
0000000000000000
2+
3+
This file is to be used as the dictionary for compressing target.txt:
4+
5+
zstd -19 --patch-from=source.txt target.txt

zstd/testdata/delta/target.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
0000000000000000
2+
3+
This file is to be compressed with source.txt as the dictionary:
4+
5+
zstd -19 --patch-from=source.txt target.txt

zstd/testdata/delta/target.txt.zst

39 Bytes
Binary file not shown.

zstd/zstd.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ var (
7272
ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
7373

7474
// ErrUnknownDictionary is returned if the dictionary ID is unknown.
75-
// For the time being dictionaries are not supported.
7675
ErrUnknownDictionary = errors.New("unknown dictionary")
7776

7877
// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.

0 commit comments

Comments
 (0)