aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/s2/dict.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/dict.go')
-rw-r--r--vendor/github.com/klauspost/compress/s2/dict.go350
1 files changed, 350 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/dict.go b/vendor/github.com/klauspost/compress/s2/dict.go
new file mode 100644
index 0000000..f125ad0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/dict.go
@@ -0,0 +1,350 @@
1// Copyright (c) 2022+ Klaus Post. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package s2
6
7import (
8 "bytes"
9 "encoding/binary"
10 "sync"
11)
12
13const (
14 // MinDictSize is the minimum dictionary size when repeat has been read.
15 MinDictSize = 16
16
17 // MaxDictSize is the maximum dictionary size when repeat has been read.
18 MaxDictSize = 65536
19
20 // MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
21 MaxDictSrcOffset = 65535
22)
23
24// Dict contains a dictionary that can be used for encoding and decoding s2
25type Dict struct {
26 dict []byte
27 repeat int // Repeat as index of dict
28
29 fast, better, best sync.Once
30 fastTable *[1 << 14]uint16
31
32 betterTableShort *[1 << 14]uint16
33 betterTableLong *[1 << 17]uint16
34
35 bestTableShort *[1 << 16]uint32
36 bestTableLong *[1 << 19]uint32
37}
38
39// NewDict will read a dictionary.
40// It will return nil if the dictionary is invalid.
41func NewDict(dict []byte) *Dict {
42 if len(dict) == 0 {
43 return nil
44 }
45 var d Dict
46 // Repeat is the first value of the dict
47 r, n := binary.Uvarint(dict)
48 if n <= 0 {
49 return nil
50 }
51 dict = dict[n:]
52 d.dict = dict
53 if cap(d.dict) < len(d.dict)+16 {
54 d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
55 }
56 if len(dict) < MinDictSize || len(dict) > MaxDictSize {
57 return nil
58 }
59 d.repeat = int(r)
60 if d.repeat > len(dict) {
61 return nil
62 }
63 return &d
64}
65
66// Bytes will return a serialized version of the dictionary.
67// The output can be sent to NewDict.
68func (d *Dict) Bytes() []byte {
69 dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
70 return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
71}
72
73// MakeDict will create a dictionary.
74// 'data' must be at least MinDictSize.
75// If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
76// If searchStart is set the start repeat value will be set to the last
77// match of this content.
78// If no matches are found, it will attempt to find shorter matches.
79// This content should match the typical start of a block.
80// If at least 4 bytes cannot be matched, repeat is set to start of block.
81func MakeDict(data []byte, searchStart []byte) *Dict {
82 if len(data) == 0 {
83 return nil
84 }
85 if len(data) > MaxDictSize {
86 data = data[len(data)-MaxDictSize:]
87 }
88 var d Dict
89 dict := data
90 d.dict = dict
91 if cap(d.dict) < len(d.dict)+16 {
92 d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
93 }
94 if len(dict) < MinDictSize {
95 return nil
96 }
97
98 // Find the longest match possible, last entry if multiple.
99 for s := len(searchStart); s > 4; s-- {
100 if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
101 d.repeat = idx
102 break
103 }
104 }
105
106 return &d
107}
108
109// MakeDictManual will create a dictionary.
110// 'data' must be at least MinDictSize and less than or equal to MaxDictSize.
111// A manual first repeat index into data must be provided.
112// It must be less than len(data)-8.
113func MakeDictManual(data []byte, firstIdx uint16) *Dict {
114 if len(data) < MinDictSize || int(firstIdx) >= len(data)-8 || len(data) > MaxDictSize {
115 return nil
116 }
117 var d Dict
118 dict := data
119 d.dict = dict
120 if cap(d.dict) < len(d.dict)+16 {
121 d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
122 }
123
124 d.repeat = int(firstIdx)
125 return &d
126}
127
128// Encode returns the encoded form of src. The returned slice may be a sub-
129// slice of dst if dst was large enough to hold the entire encoded block.
130// Otherwise, a newly allocated slice will be returned.
131//
132// The dst and src must not overlap. It is valid to pass a nil dst.
133//
134// The blocks will require the same amount of memory to decode as encoding,
135// and does not make for concurrent decoding.
136// Also note that blocks do not contain CRC information, so corruption may be undetected.
137//
138// If you need to encode larger amounts of data, consider using
139// the streaming interface which gives all of these features.
140func (d *Dict) Encode(dst, src []byte) []byte {
141 if n := MaxEncodedLen(len(src)); n < 0 {
142 panic(ErrTooLarge)
143 } else if cap(dst) < n {
144 dst = make([]byte, n)
145 } else {
146 dst = dst[:n]
147 }
148
149 // The block starts with the varint-encoded length of the decompressed bytes.
150 dstP := binary.PutUvarint(dst, uint64(len(src)))
151
152 if len(src) == 0 {
153 return dst[:dstP]
154 }
155 if len(src) < minNonLiteralBlockSize {
156 dstP += emitLiteral(dst[dstP:], src)
157 return dst[:dstP]
158 }
159 n := encodeBlockDictGo(dst[dstP:], src, d)
160 if n > 0 {
161 dstP += n
162 return dst[:dstP]
163 }
164 // Not compressible
165 dstP += emitLiteral(dst[dstP:], src)
166 return dst[:dstP]
167}
168
169// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
170// slice of dst if dst was large enough to hold the entire encoded block.
171// Otherwise, a newly allocated slice will be returned.
172//
173// EncodeBetter compresses better than Encode but typically with a
174// 10-40% speed decrease on both compression and decompression.
175//
176// The dst and src must not overlap. It is valid to pass a nil dst.
177//
178// The blocks will require the same amount of memory to decode as encoding,
179// and does not make for concurrent decoding.
180// Also note that blocks do not contain CRC information, so corruption may be undetected.
181//
182// If you need to encode larger amounts of data, consider using
183// the streaming interface which gives all of these features.
184func (d *Dict) EncodeBetter(dst, src []byte) []byte {
185 if n := MaxEncodedLen(len(src)); n < 0 {
186 panic(ErrTooLarge)
187 } else if len(dst) < n {
188 dst = make([]byte, n)
189 }
190
191 // The block starts with the varint-encoded length of the decompressed bytes.
192 dstP := binary.PutUvarint(dst, uint64(len(src)))
193
194 if len(src) == 0 {
195 return dst[:dstP]
196 }
197 if len(src) < minNonLiteralBlockSize {
198 dstP += emitLiteral(dst[dstP:], src)
199 return dst[:dstP]
200 }
201 n := encodeBlockBetterDict(dst[dstP:], src, d)
202 if n > 0 {
203 dstP += n
204 return dst[:dstP]
205 }
206 // Not compressible
207 dstP += emitLiteral(dst[dstP:], src)
208 return dst[:dstP]
209}
210
211// EncodeBest returns the encoded form of src. The returned slice may be a sub-
212// slice of dst if dst was large enough to hold the entire encoded block.
213// Otherwise, a newly allocated slice will be returned.
214//
215// EncodeBest compresses as good as reasonably possible but with a
216// big speed decrease.
217//
218// The dst and src must not overlap. It is valid to pass a nil dst.
219//
220// The blocks will require the same amount of memory to decode as encoding,
221// and does not make for concurrent decoding.
222// Also note that blocks do not contain CRC information, so corruption may be undetected.
223//
224// If you need to encode larger amounts of data, consider using
225// the streaming interface which gives all of these features.
226func (d *Dict) EncodeBest(dst, src []byte) []byte {
227 if n := MaxEncodedLen(len(src)); n < 0 {
228 panic(ErrTooLarge)
229 } else if len(dst) < n {
230 dst = make([]byte, n)
231 }
232
233 // The block starts with the varint-encoded length of the decompressed bytes.
234 dstP := binary.PutUvarint(dst, uint64(len(src)))
235
236 if len(src) == 0 {
237 return dst[:dstP]
238 }
239 if len(src) < minNonLiteralBlockSize {
240 dstP += emitLiteral(dst[dstP:], src)
241 return dst[:dstP]
242 }
243 n := encodeBlockBest(dst[dstP:], src, d)
244 if n > 0 {
245 dstP += n
246 return dst[:dstP]
247 }
248 // Not compressible
249 dstP += emitLiteral(dst[dstP:], src)
250 return dst[:dstP]
251}
252
253// Decode returns the decoded form of src. The returned slice may be a sub-
254// slice of dst if dst was large enough to hold the entire decoded block.
255// Otherwise, a newly allocated slice will be returned.
256//
257// The dst and src must not overlap. It is valid to pass a nil dst.
258func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
259 dLen, s, err := decodedLen(src)
260 if err != nil {
261 return nil, err
262 }
263 if dLen <= cap(dst) {
264 dst = dst[:dLen]
265 } else {
266 dst = make([]byte, dLen)
267 }
268 if s2DecodeDict(dst, src[s:], d) != 0 {
269 return nil, ErrCorrupt
270 }
271 return dst, nil
272}
273
274func (d *Dict) initFast() {
275 d.fast.Do(func() {
276 const (
277 tableBits = 14
278 maxTableSize = 1 << tableBits
279 )
280
281 var table [maxTableSize]uint16
282 // We stop so any entry of length 8 can always be read.
283 for i := 0; i < len(d.dict)-8-2; i += 3 {
284 x0 := load64(d.dict, i)
285 h0 := hash6(x0, tableBits)
286 h1 := hash6(x0>>8, tableBits)
287 h2 := hash6(x0>>16, tableBits)
288 table[h0] = uint16(i)
289 table[h1] = uint16(i + 1)
290 table[h2] = uint16(i + 2)
291 }
292 d.fastTable = &table
293 })
294}
295
296func (d *Dict) initBetter() {
297 d.better.Do(func() {
298 const (
299 // Long hash matches.
300 lTableBits = 17
301 maxLTableSize = 1 << lTableBits
302
303 // Short hash matches.
304 sTableBits = 14
305 maxSTableSize = 1 << sTableBits
306 )
307
308 var lTable [maxLTableSize]uint16
309 var sTable [maxSTableSize]uint16
310
311 // We stop so any entry of length 8 can always be read.
312 for i := 0; i < len(d.dict)-8; i++ {
313 cv := load64(d.dict, i)
314 lTable[hash7(cv, lTableBits)] = uint16(i)
315 sTable[hash4(cv, sTableBits)] = uint16(i)
316 }
317 d.betterTableShort = &sTable
318 d.betterTableLong = &lTable
319 })
320}
321
322func (d *Dict) initBest() {
323 d.best.Do(func() {
324 const (
325 // Long hash matches.
326 lTableBits = 19
327 maxLTableSize = 1 << lTableBits
328
329 // Short hash matches.
330 sTableBits = 16
331 maxSTableSize = 1 << sTableBits
332 )
333
334 var lTable [maxLTableSize]uint32
335 var sTable [maxSTableSize]uint32
336
337 // We stop so any entry of length 8 can always be read.
338 for i := 0; i < len(d.dict)-8; i++ {
339 cv := load64(d.dict, i)
340 hashL := hash8(cv, lTableBits)
341 hashS := hash4(cv, sTableBits)
342 candidateL := lTable[hashL]
343 candidateS := sTable[hashS]
344 lTable[hashL] = uint32(i) | candidateL<<16
345 sTable[hashS] = uint32(i) | candidateS<<16
346 }
347 d.bestTableShort = &sTable
348 d.bestTableLong = &lTable
349 })
350}