Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_arm64.s')
-rw-r--r--  vendor/github.com/klauspost/compress/s2/decode_arm64.s  574
1 file changed, 574 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
new file mode 100644
index 0000000..4b63d50
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
@@ -0,0 +1,574 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

#define R_TMP0 R2
#define R_TMP1 R3
#define R_LEN R4
#define R_OFF R5
#define R_SRC R6
#define R_DST R7
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// TEST_SRC will check that R_SRC is <= R_SEND, branching to errCorrupt otherwise.
#define TEST_SRC() \
    CMP R_SEND, R_SRC \
    BGT errCorrupt

// MOVD R_SRC, R_TMP1
// SUB R_SBASE, R_TMP1, R_TMP1
// CMP R_SLEN, R_TMP1
// BGT errCorrupt

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
// - R_TMP0 scratch
// - R_TMP1 scratch
// - R_LEN length or x
// - R_OFF offset
// - R_SRC &src[s]
// - R_DST &dst[d]
// + R_DBASE dst_base
// + R_DLEN dst_len
// + R_DEND dst_base + dst_len
// + R_SBASE src_base
// + R_SLEN src_len
// + R_SEND src_base + src_len
// - R_TMP2 used by doCopy
// - R_TMP3 used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
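//
// As a reading aid, here is a rough pure-Go sketch of the loop this file
// implements. It is illustrative only: the authoritative Go version is
// s2Decode in decode_other.go, and literalLength, copyTag and forwardCopy
// below are hypothetical helpers standing in for the inlined tag handling
// in the asm. All bounds checks are omitted.
//
//    func decodeSketch(dst, src []byte) int {
//        var d, s int
//        for s < len(src) {
//            if src[s]&0x03 == 0x00 { // literal tag
//                length, n := literalLength(src[s:]) // hypothetical helper
//                s += n
//                copy(dst[d:], src[s:s+length])
//                d, s = d+length, s+length
//            } else { // copy tags, including the S2 repeat codes
//                length, offset, n := copyTag(src[s:]) // hypothetical helper
//                s += n
//                forwardCopy(dst, d, d-offset, length) // overlapping copy within dst
//                d += length
//            }
//        }
//        if d != len(dst) {
//            return 1 // decodeErrCodeCorrupt
//        }
//        return 0
//    }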
TEXT ·s2Decode(SB), NOSPLIT, $56-64
    // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
    MOVD dst_base+0(FP), R_DBASE
    MOVD dst_len+8(FP), R_DLEN
    MOVD R_DBASE, R_DST
    MOVD R_DBASE, R_DEND
    ADD R_DLEN, R_DEND, R_DEND
    MOVD src_base+24(FP), R_SBASE
    MOVD src_len+32(FP), R_SLEN
    MOVD R_SBASE, R_SRC
    MOVD R_SBASE, R_SEND
    ADD R_SLEN, R_SEND, R_SEND
    MOVD $0, R_OFF

loop:
    // for s < len(src)
    CMP R_SEND, R_SRC
    BEQ end

    // R_LEN = uint32(src[s])
    //
    // switch src[s] & 0x03
    MOVBU (R_SRC), R_LEN
    MOVW R_LEN, R_TMP1
    ANDW $3, R_TMP1
    MOVW $1, R1
    CMPW R1, R_TMP1
    BGE tagCopy

    // ----------------------------------------
    // The code below handles literal tags.

    // case tagLiteral:
    // x := uint32(src[s] >> 2)
    // switch
    MOVW $60, R1
    LSRW $2, R_LEN, R_LEN
    CMPW R_LEN, R1
    BLS tagLit60Plus

    // case x < 60:
    // s++
    ADD $1, R_SRC, R_SRC

doLit:
    // This is the end of the inner "switch", when we have a literal tag.
    //
    // We assume that R_LEN == x and x fits in a uint32, where x is the variable
    // used in the pure Go decode_other.go code.

    // length = int(x) + 1
    //
    // Unlike the pure Go code, we don't need to check if length <= 0 because
    // R_LEN can hold 64 bits, so the increment cannot overflow.
    ADD $1, R_LEN, R_LEN

    // Prepare to check if copying length bytes will run past the end of dst or
    // src.
    //
    // R_TMP0 = len(dst) - d
    // R_TMP1 = len(src) - s
    MOVD R_DEND, R_TMP0
    SUB R_DST, R_TMP0, R_TMP0
    MOVD R_SEND, R_TMP1
    SUB R_SRC, R_TMP1, R_TMP1

    // !!! Try a faster technique for short (16 or fewer bytes) copies.
    //
    // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
    //   goto callMemmove // Fall back on calling runtime·memmove.
    // }
    //
    // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
    // against 21 instead of 16, because it cannot assume that all of its input
    // is contiguous in memory and so it needs to leave enough source bytes to
    // read the next tag without refilling buffers, but Go's Decode assumes
    // contiguousness (the src argument is a []byte).
    CMP $16, R_LEN
    BGT callMemmove
    CMP $16, R_TMP0
    BLT callMemmove
    CMP $16, R_TMP1
    BLT callMemmove

    // !!! Implement the copy from src to dst as a 16-byte load and store.
    // (Decode's documentation says that dst and src must not overlap.)
    //
    // This always copies 16 bytes, instead of only length bytes, but that's
    // OK. If the input is a valid Snappy encoding then subsequent iterations
    // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
    // non-nil error), so the overrun will be ignored.
    //
    // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
    // 16-byte loads and stores. This technique probably wouldn't be as
    // effective on architectures that are fussier about alignment.
    LDP 0(R_SRC), (R_TMP2, R_TMP3)
    STP (R_TMP2, R_TMP3), 0(R_DST)

    // d += length
    // s += length
    ADD R_LEN, R_DST, R_DST
    ADD R_LEN, R_SRC, R_SRC
    B loop
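    // In Go terms, the fast path just taken looks roughly like the fragment
    // below (a sketch of one iteration of the main loop; the 16-byte copy
    // corresponds to the LDP/STP pair above):
    //
    //    if length <= 16 && len(dst)-d >= 16 && len(src)-s >= 16 {
    //        copy(dst[d:d+16], src[s:s+16]) // may write past d+length; later iterations overwrite it
    //        d += length
    //        s += length
    //        continue
    //    }
    //    // otherwise: fall through to the memmove path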

callMemmove:
    // if length > len(dst)-d || length > len(src)-s { etc }
    CMP R_TMP0, R_LEN
    BGT errCorrupt
    CMP R_TMP1, R_LEN
    BGT errCorrupt

    // copy(dst[d:], src[s:s+length])
    //
    // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
    // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
    // three registers to the stack, to save local variables across the CALL.
    MOVD R_DST, 8(RSP)
    MOVD R_SRC, 16(RSP)
    MOVD R_LEN, 24(RSP)
    MOVD R_DST, 32(RSP)
    MOVD R_SRC, 40(RSP)
    MOVD R_LEN, 48(RSP)
    MOVD R_OFF, 56(RSP)
    CALL runtime·memmove(SB)

    // Restore local variables: unspill registers from the stack and
    // re-calculate R_DBASE-R_SEND.
    MOVD 32(RSP), R_DST
    MOVD 40(RSP), R_SRC
    MOVD 48(RSP), R_LEN
    MOVD 56(RSP), R_OFF
    MOVD dst_base+0(FP), R_DBASE
    MOVD dst_len+8(FP), R_DLEN
    MOVD R_DBASE, R_DEND
    ADD R_DLEN, R_DEND, R_DEND
    MOVD src_base+24(FP), R_SBASE
    MOVD src_len+32(FP), R_SLEN
    MOVD R_SBASE, R_SEND
    ADD R_SLEN, R_SEND, R_SEND

    // d += length
    // s += length
    ADD R_LEN, R_DST, R_DST
    ADD R_LEN, R_SRC, R_SRC
    B loop
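    // The Go equivalent of this fallback is just a bounds check plus a copy
    // (sketch; decode_other.go returns decodeErrCodeCorrupt on failure):
    //
    //    if length > len(dst)-d || length > len(src)-s {
    //        return 1 // corrupt input
    //    }
    //    copy(dst[d:], src[s:s+length])
    //    d += length
    //    s += length
    //
    // The extra work in the asm version is only the spill/restore around the
    // CALL, since R_DBASE-R_SEND must be recomputed after any CALL returns.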

tagLit60Plus:
    // !!! This fragment does the
    //
    // s += x - 58; if uint(s) > uint(len(src)) { etc }
    //
    // checks. In the asm version, we code it once instead of once per switch case.
    ADD R_LEN, R_SRC, R_SRC
    SUB $58, R_SRC, R_SRC
    TEST_SRC()

    // case x == 60:
    MOVW $61, R1
    CMPW R1, R_LEN
    BEQ tagLit61
    BGT tagLit62Plus

    // x = uint32(src[s-1])
    MOVBU -1(R_SRC), R_LEN
    B doLit

tagLit61:
    // case x == 61:
    // x = uint32(src[s-2]) | uint32(src[s-1])<<8
    MOVHU -2(R_SRC), R_LEN
    B doLit

tagLit62Plus:
    CMPW $62, R_LEN
    BHI tagLit63

    // case x == 62:
    // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
    MOVHU -3(R_SRC), R_LEN
    MOVBU -1(R_SRC), R_TMP1
    ORR R_TMP1<<16, R_LEN
    B doLit

tagLit63:
    // case x == 63:
    // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
    MOVWU -4(R_SRC), R_LEN
    B doLit
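    // Taken together, the four cases above decode the extended literal length
    // from 1-4 trailing little-endian bytes. A pure-Go sketch (s has already
    // been advanced past those bytes by the "s += x - 58" trick):
    //
    //    switch x {
    //    case 60:
    //        x = uint32(src[s-1])
    //    case 61:
    //        x = uint32(src[s-2]) | uint32(src[s-1])<<8
    //    case 62:
    //        x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
    //    case 63:
    //        x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
    //    }
    //    length = int(x) + 1 // done at doLit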

    // The code above handles literal tags.
    // ----------------------------------------
    // The code below handles copy tags.

tagCopy4:
    // case tagCopy4:
    // s += 5
    ADD $5, R_SRC, R_SRC

    // if uint(s) > uint(len(src)) { etc }
    MOVD R_SRC, R_TMP1
    SUB R_SBASE, R_TMP1, R_TMP1
    CMP R_SLEN, R_TMP1
    BGT errCorrupt

    // length = 1 + int(src[s-5])>>2
    MOVD $1, R1
    ADD R_LEN>>2, R1, R_LEN

    // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
    MOVWU -4(R_SRC), R_OFF
    B doCopy

tagCopy2:
    // case tagCopy2:
    // s += 3
    ADD $3, R_SRC, R_SRC

    // if uint(s) > uint(len(src)) { etc }
    TEST_SRC()

    // length = 1 + int(src[s-3])>>2
    MOVD $1, R1
    ADD R_LEN>>2, R1, R_LEN

    // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
    MOVHU -2(R_SRC), R_OFF
    B doCopy
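    // For reference, the two explicit-offset copy forms decode as follows in
    // Go (mirroring the comments above; bounds checks omitted):
    //
    //    // tagCopy2: 3-byte tag, 16-bit little-endian offset.
    //    length = 1 + int(src[s-3])>>2
    //    offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
    //
    //    // tagCopy4: 5-byte tag, 32-bit little-endian offset.
    //    length = 1 + int(src[s-5])>>2
    //    offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)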

tagCopy:
    // We have a copy tag. We assume that:
    // - R_TMP1 == src[s] & 0x03
    // - R_LEN == src[s]
    CMP $2, R_TMP1
    BEQ tagCopy2
    BGT tagCopy4

    // case tagCopy1:
    // s += 2
    ADD $2, R_SRC, R_SRC

    // if uint(s) > uint(len(src)) { etc }
    TEST_SRC()

    // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
    // Calculate offset in R_TMP0 in case it is a repeat.
    MOVD R_LEN, R_TMP0
    AND $0xe0, R_TMP0
    MOVBU -1(R_SRC), R_TMP1
    ORR R_TMP0<<3, R_TMP1, R_TMP0

    // length = 4 + int(src[s-2])>>2&0x7
    MOVD $7, R1
    AND R_LEN>>2, R1, R_LEN
    ADD $4, R_LEN, R_LEN

    // check if repeat code with offset 0.
    CMP $0, R_TMP0
    BEQ repeatCode

    // This is a regular copy, transfer our temporary value to R_OFF (offset)
    MOVD R_TMP0, R_OFF
    B doCopy
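    // Sketch of the tagCopy1 case in Go, including the S2 extension: an
    // encoded offset of 0 is not a real copy but a "repeat" code that reuses
    // the previous offset (bounds checks omitted):
    //
    //    length = 4 + int(src[s-2])>>2&0x7
    //    offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
    //    if offset == 0 {
    //        // repeat code: keep the last offset; length may be extended by
    //        // 1-3 extra bytes, handled at repeatLen1..repeatLen3 below.
    //    }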

    // This is a repeat code.
repeatCode:
    // If length < 9, reuse last offset, with the length already calculated.
    CMP $9, R_LEN
    BLT doCopyRepeat
    BEQ repeatLen1
    CMP $10, R_LEN
    BEQ repeatLen2

repeatLen3:
    // s += 3
    ADD $3, R_SRC, R_SRC

    // if uint(s) > uint(len(src)) { etc }
    TEST_SRC()

    // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
    MOVBU -1(R_SRC), R_TMP0
    MOVHU -3(R_SRC), R_LEN
    ORR R_TMP0<<16, R_LEN, R_LEN
    ADD $65540, R_LEN, R_LEN
    B doCopyRepeat

repeatLen2:
    // s += 2
    ADD $2, R_SRC, R_SRC

    // if uint(s) > uint(len(src)) { etc }
    TEST_SRC()

    // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
    MOVHU -2(R_SRC), R_LEN
    ADD $260, R_LEN, R_LEN
    B doCopyRepeat

repeatLen1:
    // s += 1
    ADD $1, R_SRC, R_SRC

    // if uint(s) > uint(len(src)) { etc }
    TEST_SRC()

    // length = src[s-1] + 8
    MOVBU -1(R_SRC), R_LEN
    ADD $8, R_LEN, R_LEN
    B doCopyRepeat
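    // The three length extensions above, as a Go sketch. At this point length
    // holds the 4..11 value computed at tagCopy1; 9, 10 and 11 select one,
    // two or three extra little-endian length bytes (the asm reads them after
    // advancing s, which is equivalent):
    //
    //    switch {
    //    case length < 9:
    //        // 4..8: use length as is, with the previous offset
    //    case length == 9:
    //        length = int(src[s]) + 8
    //        s += 1
    //    case length == 10:
    //        length = int(uint32(src[s]) | uint32(src[s+1])<<8) + 260
    //        s += 2
    //    default: // length == 11
    //        length = int(uint32(src[s]) | uint32(src[s+1])<<8 | uint32(src[s+2])<<16) + 65540
    //        s += 3
    //    }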

doCopy:
    // This is the end of the outer "switch", when we have a copy tag.
    //
    // We assume that:
    // - R_LEN == length && R_LEN > 0
    // - R_OFF == offset

    // if d < offset { etc }
    MOVD R_DST, R_TMP1
    SUB R_DBASE, R_TMP1, R_TMP1
    CMP R_OFF, R_TMP1
    BLT errCorrupt

    // Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:

    // if offset <= 0 { etc }
    CMP $0, R_OFF
    BLE errCorrupt

    // if length > len(dst)-d { etc }
    MOVD R_DEND, R_TMP1
    SUB R_DST, R_TMP1, R_TMP1
    CMP R_TMP1, R_LEN
    BGT errCorrupt

    // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
    //
    // Set:
    // - R_TMP2 = len(dst)-d
    // - R_TMP3 = &dst[d-offset]
    MOVD R_DEND, R_TMP2
    SUB R_DST, R_TMP2, R_TMP2
    MOVD R_DST, R_TMP3
    SUB R_OFF, R_TMP3, R_TMP3

    // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
    //
    // First, try using two 8-byte load/stores, similar to the doLit technique
    // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
    // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
    // and not one 16-byte load/store, and the first store has to be before the
    // second load, due to the overlap if offset is in the range [8, 16).
    //
    // if length > 16 || offset < 8 || len(dst)-d < 16 {
    //   goto slowForwardCopy
    // }
    // copy 16 bytes
    // d += length
    CMP $16, R_LEN
    BGT slowForwardCopy
    CMP $8, R_OFF
    BLT slowForwardCopy
    CMP $16, R_TMP2
    BLT slowForwardCopy
    MOVD 0(R_TMP3), R_TMP0
    MOVD R_TMP0, 0(R_DST)
    MOVD 8(R_TMP3), R_TMP1
    MOVD R_TMP1, 8(R_DST)
    ADD R_LEN, R_DST, R_DST
    B loop
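    // In Go, the condition guarding this fast path reads as below; offset >= 8
    // guarantees that neither 8-byte chunk reads bytes that the same chunk is
    // about to write (sketch, as part of the main loop):
    //
    //    if length <= 16 && offset >= 8 && len(dst)-d >= 16 {
    //        copy(dst[d:d+8], dst[d-offset:d-offset+8])
    //        copy(dst[d+8:d+16], dst[d-offset+8:d-offset+16])
    //        d += length // the 16-byte write may overrun d+length; later writes fix it up
    //        continue
    //    }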

slowForwardCopy:
    // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
    // can still try 8-byte load stores, provided we can overrun up to 10 extra
    // bytes. As above, the overrun will be fixed up by subsequent iterations
    // of the outermost loop.
    //
    // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
    // commentary says:
    //
    // ----
    //
    // The main part of this loop is a simple copy of eight bytes at a time
    // until we've copied (at least) the requested amount of bytes. However,
    // if d and d-offset are less than eight bytes apart (indicating a
    // repeating pattern of length < 8), we first need to expand the pattern in
    // order to get the correct results. For instance, if the buffer looks like
    // this, with the eight-byte <d-offset> and <d> patterns marked as
    // intervals:
    //
    //   abxxxxxxxxxxxx
    //   [------]           d-offset
    //     [------]         d
    //
    // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
    // once, after which we can move <d> two bytes without moving <d-offset>:
    //
    //   ababxxxxxxxxxx
    //   [------]           d-offset
    //       [------]       d
    //
    // and repeat the exercise until the two no longer overlap.
    //
    // This allows us to do very well in the special case of one single byte
    // repeated many times, without taking a big hit for more general cases.
    //
    // The worst case of extra writing past the end of the match occurs when
    // offset == 1 and length == 1; the last copy will read from byte positions
    // [0..7] and write to [4..11], whereas it was only supposed to write to
    // position 1. Thus, ten excess bytes.
    //
    // ----
    //
    // That "10 byte overrun" worst case is confirmed by Go's
    // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
    // and finishSlowForwardCopy algorithm.
    //
    // if length > len(dst)-d-10 {
    //   goto verySlowForwardCopy
    // }
    SUB $10, R_TMP2, R_TMP2
    CMP R_TMP2, R_LEN
    BGT verySlowForwardCopy

    // We want to keep the offset, so we use R_TMP2 from here.
    MOVD R_OFF, R_TMP2

makeOffsetAtLeast8:
    // !!! As above, expand the pattern so that offset >= 8 and we can use
    // 8-byte load/stores.
    //
    // for offset < 8 {
    //   copy 8 bytes from dst[d-offset:] to dst[d:]
    //   length -= offset
    //   d      += offset
    //   offset += offset
    //   // The two previous lines together means that d-offset, and therefore
    //   // R_TMP3, is unchanged.
    // }
    CMP $8, R_TMP2
    BGE fixUpSlowForwardCopy
    MOVD (R_TMP3), R_TMP1
    MOVD R_TMP1, (R_DST)
    SUB R_TMP2, R_LEN, R_LEN
    ADD R_TMP2, R_DST, R_DST
    ADD R_TMP2, R_TMP2, R_TMP2
    B makeOffsetAtLeast8

fixUpSlowForwardCopy:
    // !!! Add length (which might be negative now) to d (implied by R_DST being
    // &dst[d]) so that d ends up at the right place when we jump back to the
    // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
    // length is positive, copying the remaining length bytes will write to the
    // right place.
    MOVD R_DST, R_TMP0
    ADD R_LEN, R_DST, R_DST

finishSlowForwardCopy:
    // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
    // length means that we overrun, but as above, that will be fixed up by
    // subsequent iterations of the outermost loop.
    MOVD $0, R1
    CMP R1, R_LEN
    BLE loop
    MOVD (R_TMP3), R_TMP1
    MOVD R_TMP1, (R_TMP0)
    ADD $8, R_TMP3, R_TMP3
    ADD $8, R_TMP0, R_TMP0
    SUB $8, R_LEN, R_LEN
    B finishSlowForwardCopy
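    // A Go sketch of the two loops above, assuming the length <= len(dst)-d-10
    // guard has already passed. The first loop doubles the offset until it is
    // at least 8; the second finishes with plain 8-byte chunks, overrunning by
    // up to 10 bytes, which later iterations (or the error path) make harmless:
    //
    //    pos := d - offset // read position inside dst; the first loop never moves it
    //    for offset < 8 {
    //        copy(dst[d:d+8], dst[pos:pos+8])
    //        length -= offset
    //        d += offset
    //        offset += offset // doubling keeps d-offset, i.e. pos, unchanged
    //    }
    //    w := d      // write cursor for the tail; may run past d+length
    //    d += length // d itself advances by exactly length
    //    for ; length > 0; length -= 8 {
    //        copy(dst[w:w+8], dst[pos:pos+8])
    //        pos += 8
    //        w += 8
    //    }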

verySlowForwardCopy:
    // verySlowForwardCopy is a simple implementation of forward copy. In C
    // parlance, this is a do/while loop instead of a while loop, since we know
    // that length > 0. In Go syntax:
    //
    // for {
    //   dst[d] = dst[d - offset]
    //   d++
    //   length--
    //   if length == 0 {
    //     break
    //   }
    // }
    MOVB (R_TMP3), R_TMP1
    MOVB R_TMP1, (R_DST)
    ADD $1, R_TMP3, R_TMP3
    ADD $1, R_DST, R_DST
    SUB $1, R_LEN, R_LEN
    CBNZ R_LEN, verySlowForwardCopy
    B loop

    // The code above handles copy tags.
    // ----------------------------------------

end:
    // This is the end of the "for s < len(src)".
    //
    // if d != len(dst) { etc }
    CMP R_DEND, R_DST
    BNE errCorrupt

    // return 0
    MOVD $0, ret+48(FP)
    RET

errCorrupt:
    // return decodeErrCodeCorrupt
    MOVD $1, R_TMP0
    MOVD R_TMP0, ret+48(FP)
    RET
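//
// For context on the return value: both this asm routine and the pure-Go
// s2Decode return 0 on success and decodeErrCodeCorrupt (1) on failure. A
// caller-side sketch (simplified; "headerLen" is a hypothetical name for the
// size of the decoded-length header that the exported Decode wrapper strips
// before calling in):
//
//    if s2Decode(dst, src[headerLen:]) != 0 {
//        return nil, ErrCorrupt
//    }
//    return dst, nil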