aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost/compress/s2/decode_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_amd64.s')
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_amd64.s568
1 files changed, 568 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
new file mode 100644
index 0000000..9b105e0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
@@ -0,0 +1,568 @@
1// Copyright 2016 The Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// +build !appengine
7// +build gc
8// +build !noasm
9
10#include "textflag.h"
11
12#define R_TMP0 AX
13#define R_TMP1 BX
14#define R_LEN CX
15#define R_OFF DX
16#define R_SRC SI
17#define R_DST DI
18#define R_DBASE R8
19#define R_DLEN R9
20#define R_DEND R10
21#define R_SBASE R11
22#define R_SLEN R12
23#define R_SEND R13
24#define R_TMP2 R14
25#define R_TMP3 R15
26
27// The asm code generally follows the pure Go code in decode_other.go, except
28// where marked with a "!!!".
29
30// func decode(dst, src []byte) int
31//
32// All local variables fit into registers. The non-zero stack size is only to
33// spill registers and push args when issuing a CALL. The register allocation:
34// - R_TMP0 scratch
35// - R_TMP1 scratch
36// - R_LEN length or x (shared)
37// - R_OFF offset
38// - R_SRC &src[s]
39// - R_DST &dst[d]
40// + R_DBASE dst_base
41// + R_DLEN dst_len
42// + R_DEND dst_base + dst_len
43// + R_SBASE src_base
44// + R_SLEN src_len
45// + R_SEND src_base + src_len
46// - R_TMP2 used by doCopy
47// - R_TMP3 used by doCopy
48//
49// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
50// function, and after a CALL returns, and are not otherwise modified.
51//
52// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
53// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
54TEXT ·s2Decode(SB), NOSPLIT, $48-56
55 // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
56 MOVQ dst_base+0(FP), R_DBASE
57 MOVQ dst_len+8(FP), R_DLEN
58 MOVQ R_DBASE, R_DST
59 MOVQ R_DBASE, R_DEND
60 ADDQ R_DLEN, R_DEND
61 MOVQ src_base+24(FP), R_SBASE
62 MOVQ src_len+32(FP), R_SLEN
63 MOVQ R_SBASE, R_SRC
64 MOVQ R_SBASE, R_SEND
65 ADDQ R_SLEN, R_SEND
66 XORQ R_OFF, R_OFF
67
68loop:
69 // for s < len(src)
70 CMPQ R_SRC, R_SEND
71 JEQ end
72
73 // R_LEN = uint32(src[s])
74 //
75 // switch src[s] & 0x03
76 MOVBLZX (R_SRC), R_LEN
77 MOVL R_LEN, R_TMP1
78 ANDL $3, R_TMP1
79 CMPL R_TMP1, $1
80 JAE tagCopy
81
82 // ----------------------------------------
83 // The code below handles literal tags.
84
85 // case tagLiteral:
86 // x := uint32(src[s] >> 2)
87 // switch
88 SHRL $2, R_LEN
89 CMPL R_LEN, $60
90 JAE tagLit60Plus
91
92 // case x < 60:
93 // s++
94 INCQ R_SRC
95
96doLit:
97 // This is the end of the inner "switch", when we have a literal tag.
98 //
99 // We assume that R_LEN == x and x fits in a uint32, where x is the variable
100 // used in the pure Go decode_other.go code.
101
102 // length = int(x) + 1
103 //
104 // Unlike the pure Go code, we don't need to check if length <= 0 because
105 // R_LEN can hold 64 bits, so the increment cannot overflow.
106 INCQ R_LEN
107
108 // Prepare to check if copying length bytes will run past the end of dst or
109 // src.
110 //
111 // R_TMP0 = len(dst) - d
112 // R_TMP1 = len(src) - s
113 MOVQ R_DEND, R_TMP0
114 SUBQ R_DST, R_TMP0
115 MOVQ R_SEND, R_TMP1
116 SUBQ R_SRC, R_TMP1
117
118 // !!! Try a faster technique for short (16 or fewer bytes) copies.
119 //
120 // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
121 // goto callMemmove // Fall back on calling runtime·memmove.
122 // }
123 //
124 // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
125 // against 21 instead of 16, because it cannot assume that all of its input
126 // is contiguous in memory and so it needs to leave enough source bytes to
127 // read the next tag without refilling buffers, but Go's Decode assumes
128 // contiguousness (the src argument is a []byte).
129 CMPQ R_LEN, $16
130 JGT callMemmove
131 CMPQ R_TMP0, $16
132 JLT callMemmove
133 CMPQ R_TMP1, $16
134 JLT callMemmove
135
136 // !!! Implement the copy from src to dst as a 16-byte load and store.
137 // (Decode's documentation says that dst and src must not overlap.)
138 //
139 // This always copies 16 bytes, instead of only length bytes, but that's
140 // OK. If the input is a valid Snappy encoding then subsequent iterations
141 // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
142 // non-nil error), so the overrun will be ignored.
143 //
144 // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
145 // 16-byte loads and stores. This technique probably wouldn't be as
146 // effective on architectures that are fussier about alignment.
147 MOVOU 0(R_SRC), X0
148 MOVOU X0, 0(R_DST)
149
150 // d += length
151 // s += length
152 ADDQ R_LEN, R_DST
153 ADDQ R_LEN, R_SRC
154 JMP loop
155
156callMemmove:
157 // if length > len(dst)-d || length > len(src)-s { etc }
158 CMPQ R_LEN, R_TMP0
159 JGT errCorrupt
160 CMPQ R_LEN, R_TMP1
161 JGT errCorrupt
162
163 // copy(dst[d:], src[s:s+length])
164 //
165 // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
166 // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
167 // three registers to the stack, to save local variables across the CALL.
168 MOVQ R_DST, 0(SP)
169 MOVQ R_SRC, 8(SP)
170 MOVQ R_LEN, 16(SP)
171 MOVQ R_DST, 24(SP)
172 MOVQ R_SRC, 32(SP)
173 MOVQ R_LEN, 40(SP)
174 MOVQ R_OFF, 48(SP)
175 CALL runtime·memmove(SB)
176
177 // Restore local variables: unspill registers from the stack and
178 // re-calculate R_DBASE-R_SEND.
179 MOVQ 24(SP), R_DST
180 MOVQ 32(SP), R_SRC
181 MOVQ 40(SP), R_LEN
182 MOVQ 48(SP), R_OFF
183 MOVQ dst_base+0(FP), R_DBASE
184 MOVQ dst_len+8(FP), R_DLEN
185 MOVQ R_DBASE, R_DEND
186 ADDQ R_DLEN, R_DEND
187 MOVQ src_base+24(FP), R_SBASE
188 MOVQ src_len+32(FP), R_SLEN
189 MOVQ R_SBASE, R_SEND
190 ADDQ R_SLEN, R_SEND
191
192 // d += length
193 // s += length
194 ADDQ R_LEN, R_DST
195 ADDQ R_LEN, R_SRC
196 JMP loop
197
198tagLit60Plus:
199 // !!! This fragment does the
200 //
201 // s += x - 58; if uint(s) > uint(len(src)) { etc }
202 //
203 // checks. In the asm version, we code it once instead of once per switch case.
204 ADDQ R_LEN, R_SRC
205 SUBQ $58, R_SRC
206 CMPQ R_SRC, R_SEND
207 JA errCorrupt
208
209 // case x == 60:
210 CMPL R_LEN, $61
211 JEQ tagLit61
212 JA tagLit62Plus
213
214 // x = uint32(src[s-1])
215 MOVBLZX -1(R_SRC), R_LEN
216 JMP doLit
217
218tagLit61:
219 // case x == 61:
220 // x = uint32(src[s-2]) | uint32(src[s-1])<<8
221 MOVWLZX -2(R_SRC), R_LEN
222 JMP doLit
223
224tagLit62Plus:
225 CMPL R_LEN, $62
226 JA tagLit63
227
228 // case x == 62:
229 // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
230 // We read one byte, safe to read one back, since we are just reading tag.
231 // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
232 MOVL -4(R_SRC), R_LEN
233 SHRL $8, R_LEN
234 JMP doLit
235
236tagLit63:
237 // case x == 63:
238 // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
239 MOVL -4(R_SRC), R_LEN
240 JMP doLit
241
242// The code above handles literal tags.
243// ----------------------------------------
244// The code below handles copy tags.
245
246tagCopy4:
247 // case tagCopy4:
248 // s += 5
249 ADDQ $5, R_SRC
250
251 // if uint(s) > uint(len(src)) { etc }
252 CMPQ R_SRC, R_SEND
253 JA errCorrupt
254
255 // length = 1 + int(src[s-5])>>2
256 SHRQ $2, R_LEN
257 INCQ R_LEN
258
259 // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
260 MOVLQZX -4(R_SRC), R_OFF
261 JMP doCopy
262
263tagCopy2:
264 // case tagCopy2:
265 // s += 3
266 ADDQ $3, R_SRC
267
268 // if uint(s) > uint(len(src)) { etc }
269 CMPQ R_SRC, R_SEND
270 JA errCorrupt
271
272 // length = 1 + int(src[s-3])>>2
273 SHRQ $2, R_LEN
274 INCQ R_LEN
275
276 // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
277 MOVWQZX -2(R_SRC), R_OFF
278 JMP doCopy
279
280tagCopy:
281 // We have a copy tag. We assume that:
282 // - R_TMP1 == src[s] & 0x03
283 // - R_LEN == src[s]
284 CMPQ R_TMP1, $2
285 JEQ tagCopy2
286 JA tagCopy4
287
288 // case tagCopy1:
289 // s += 2
290 ADDQ $2, R_SRC
291
292 // if uint(s) > uint(len(src)) { etc }
293 CMPQ R_SRC, R_SEND
294 JA errCorrupt
295
296 // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
297 // length = 4 + int(src[s-2])>>2&0x7
298 MOVBQZX -1(R_SRC), R_TMP1
299 MOVQ R_LEN, R_TMP0
300 SHRQ $2, R_LEN
301 ANDQ $0xe0, R_TMP0
302 ANDQ $7, R_LEN
303 SHLQ $3, R_TMP0
304 ADDQ $4, R_LEN
305 ORQ R_TMP1, R_TMP0
306
307 // check if repeat code, ZF set by ORQ.
308 JZ repeatCode
309
310 // This is a regular copy, transfer our temporary value to R_OFF (length)
311 MOVQ R_TMP0, R_OFF
312 JMP doCopy
313
314// This is a repeat code.
315repeatCode:
316 // If length < 9, reuse last offset, with the length already calculated.
317 CMPQ R_LEN, $9
318 JL doCopyRepeat
319
320 // Read additional bytes for length.
321 JE repeatLen1
322
323 // Rare, so the extra branch shouldn't hurt too much.
324 CMPQ R_LEN, $10
325 JE repeatLen2
326 JMP repeatLen3
327
328// Read repeat lengths.
329repeatLen1:
330 // s ++
331 ADDQ $1, R_SRC
332
333 // if uint(s) > uint(len(src)) { etc }
334 CMPQ R_SRC, R_SEND
335 JA errCorrupt
336
337 // length = src[s-1] + 8
338 MOVBQZX -1(R_SRC), R_LEN
339 ADDL $8, R_LEN
340 JMP doCopyRepeat
341
342repeatLen2:
343 // s +=2
344 ADDQ $2, R_SRC
345
346 // if uint(s) > uint(len(src)) { etc }
347 CMPQ R_SRC, R_SEND
348 JA errCorrupt
349
350 // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
351 MOVWQZX -2(R_SRC), R_LEN
352 ADDL $260, R_LEN
353 JMP doCopyRepeat
354
355repeatLen3:
356 // s +=3
357 ADDQ $3, R_SRC
358
359 // if uint(s) > uint(len(src)) { etc }
360 CMPQ R_SRC, R_SEND
361 JA errCorrupt
362
363 // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
364 // Read one byte further back (just part of the tag, shifted out)
365 MOVL -4(R_SRC), R_LEN
366 SHRL $8, R_LEN
367 ADDL $65540, R_LEN
368 JMP doCopyRepeat
369
370doCopy:
371 // This is the end of the outer "switch", when we have a copy tag.
372 //
373 // We assume that:
374 // - R_LEN == length && R_LEN > 0
375 // - R_OFF == offset
376
377 // if d < offset { etc }
378 MOVQ R_DST, R_TMP1
379 SUBQ R_DBASE, R_TMP1
380 CMPQ R_TMP1, R_OFF
381 JLT errCorrupt
382
383 // Repeat values can skip the test above, since any offset > 0 will be in dst.
384doCopyRepeat:
385 // if offset <= 0 { etc }
386 CMPQ R_OFF, $0
387 JLE errCorrupt
388
389 // if length > len(dst)-d { etc }
390 MOVQ R_DEND, R_TMP1
391 SUBQ R_DST, R_TMP1
392 CMPQ R_LEN, R_TMP1
393 JGT errCorrupt
394
395 // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
396 //
397 // Set:
398 // - R_TMP2 = len(dst)-d
399 // - R_TMP3 = &dst[d-offset]
400 MOVQ R_DEND, R_TMP2
401 SUBQ R_DST, R_TMP2
402 MOVQ R_DST, R_TMP3
403 SUBQ R_OFF, R_TMP3
404
405 // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
406 //
407 // First, try using two 8-byte load/stores, similar to the doLit technique
408 // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
409 // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
410 // and not one 16-byte load/store, and the first store has to be before the
411 // second load, due to the overlap if offset is in the range [8, 16).
412 //
413 // if length > 16 || offset < 8 || len(dst)-d < 16 {
414 // goto slowForwardCopy
415 // }
416 // copy 16 bytes
417 // d += length
418 CMPQ R_LEN, $16
419 JGT slowForwardCopy
420 CMPQ R_OFF, $8
421 JLT slowForwardCopy
422 CMPQ R_TMP2, $16
423 JLT slowForwardCopy
424 MOVQ 0(R_TMP3), R_TMP0
425 MOVQ R_TMP0, 0(R_DST)
426 MOVQ 8(R_TMP3), R_TMP1
427 MOVQ R_TMP1, 8(R_DST)
428 ADDQ R_LEN, R_DST
429 JMP loop
430
431slowForwardCopy:
432 // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
433 // can still try 8-byte load stores, provided we can overrun up to 10 extra
434 // bytes. As above, the overrun will be fixed up by subsequent iterations
435 // of the outermost loop.
436 //
437 // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
438 // commentary says:
439 //
440 // ----
441 //
442 // The main part of this loop is a simple copy of eight bytes at a time
443 // until we've copied (at least) the requested amount of bytes. However,
444 // if d and d-offset are less than eight bytes apart (indicating a
445 // repeating pattern of length < 8), we first need to expand the pattern in
446 // order to get the correct results. For instance, if the buffer looks like
447 // this, with the eight-byte <d-offset> and <d> patterns marked as
448 // intervals:
449 //
450 // abxxxxxxxxxxxx
451 // [------] d-offset
452 // [------] d
453 //
454 // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
455 // once, after which we can move <d> two bytes without moving <d-offset>:
456 //
457 // ababxxxxxxxxxx
458 // [------] d-offset
459 // [------] d
460 //
461 // and repeat the exercise until the two no longer overlap.
462 //
463 // This allows us to do very well in the special case of one single byte
464 // repeated many times, without taking a big hit for more general cases.
465 //
466 // The worst case of extra writing past the end of the match occurs when
467 // offset == 1 and length == 1; the last copy will read from byte positions
468 // [0..7] and write to [4..11], whereas it was only supposed to write to
469 // position 1. Thus, ten excess bytes.
470 //
471 // ----
472 //
473 // That "10 byte overrun" worst case is confirmed by Go's
474 // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
475 // and finishSlowForwardCopy algorithm.
476 //
477 // if length > len(dst)-d-10 {
478 // goto verySlowForwardCopy
479 // }
480 SUBQ $10, R_TMP2
481 CMPQ R_LEN, R_TMP2
482 JGT verySlowForwardCopy
483
484 // We want to keep the offset, so we use R_TMP2 from here.
485 MOVQ R_OFF, R_TMP2
486
487makeOffsetAtLeast8:
488 // !!! As above, expand the pattern so that offset >= 8 and we can use
489 // 8-byte load/stores.
490 //
491 // for offset < 8 {
492 // copy 8 bytes from dst[d-offset:] to dst[d:]
493 // length -= offset
494 // d += offset
495 // offset += offset
496 // // The two previous lines together means that d-offset, and therefore
497 // // R_TMP3, is unchanged.
498 // }
499 CMPQ R_TMP2, $8
500 JGE fixUpSlowForwardCopy
501 MOVQ (R_TMP3), R_TMP1
502 MOVQ R_TMP1, (R_DST)
503 SUBQ R_TMP2, R_LEN
504 ADDQ R_TMP2, R_DST
505 ADDQ R_TMP2, R_TMP2
506 JMP makeOffsetAtLeast8
507
508fixUpSlowForwardCopy:
509 // !!! Add length (which might be negative now) to d (implied by R_DST being
510 // &dst[d]) so that d ends up at the right place when we jump back to the
511 // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
512 // length is positive, copying the remaining length bytes will write to the
513 // right place.
514 MOVQ R_DST, R_TMP0
515 ADDQ R_LEN, R_DST
516
517finishSlowForwardCopy:
518 // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
519 // length means that we overrun, but as above, that will be fixed up by
520 // subsequent iterations of the outermost loop.
521 CMPQ R_LEN, $0
522 JLE loop
523 MOVQ (R_TMP3), R_TMP1
524 MOVQ R_TMP1, (R_TMP0)
525 ADDQ $8, R_TMP3
526 ADDQ $8, R_TMP0
527 SUBQ $8, R_LEN
528 JMP finishSlowForwardCopy
529
530verySlowForwardCopy:
531 // verySlowForwardCopy is a simple implementation of forward copy. In C
532 // parlance, this is a do/while loop instead of a while loop, since we know
533 // that length > 0. In Go syntax:
534 //
535 // for {
536 // dst[d] = dst[d - offset]
537 // d++
538 // length--
539 // if length == 0 {
540 // break
541 // }
542 // }
543 MOVB (R_TMP3), R_TMP1
544 MOVB R_TMP1, (R_DST)
545 INCQ R_TMP3
546 INCQ R_DST
547 DECQ R_LEN
548 JNZ verySlowForwardCopy
549 JMP loop
550
551// The code above handles copy tags.
552// ----------------------------------------
553
554end:
555 // This is the end of the "for s < len(src)".
556 //
557 // if d != len(dst) { etc }
558 CMPQ R_DST, R_DEND
559 JNE errCorrupt
560
561 // return 0
562 MOVQ $0, ret+48(FP)
563 RET
564
565errCorrupt:
566 // return decodeErrCodeCorrupt
567 MOVQ $1, ret+48(FP)
568 RET