1 files changed, 0 insertions, 574 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
deleted file mode 100644
index 4b63d50..0000000
--- a/vendor/github.com/klauspost/compress/s2/decode_arm64.s
+++ /dev/null
@@ -1,574 +0,0 @@
-// Copyright 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// +build !appengine
-// +build gc
-// +build !noasm
-#include "textflag.h"
-#define R_TMP0 R2
-#define R_TMP1 R3
-#define R_LEN R4
-#define R_OFF R5
-#define R_SRC R6
-#define R_DST R7
-#define R_DBASE R8
-#define R_DLEN R9
-#define R_DEND R10
-#define R_SBASE R11
-#define R_SLEN R12
-#define R_SEND R13
-#define R_TMP2 R14
-#define R_TMP3 R15
-// TEST_SRC branches to errCorrupt when R_SRC > R_SEND, i.e. when s > len(src).
-#define TEST_SRC() \
-        CMP R_SEND, R_SRC \
-        BGT errCorrupt
-// MOVD R_SRC, R_TMP1
-// SUB  R_SBASE, R_TMP1, R_TMP1
-// CMP  R_SLEN, R_TMP1
-// BGT  errCorrupt
-// The asm code generally follows the pure Go code in decode_other.go, except
-// where marked with a "!!!".
-// func s2Decode(dst, src []byte) int  // returns 0 on success, 1 (decodeErrCodeCorrupt) on corrupt input
-//
-// All local variables fit into registers. The non-zero stack size is only to
-// spill registers and push args when issuing a CALL. The register allocation:
-//      - R_TMP0        scratch
-//      - R_TMP1        scratch (R1 is additionally used as scratch for constants)
-//      - R_LEN length or x
-//      - R_OFF offset
-//      - R_SRC &src[s]
-//      - R_DST &dst[d]
-//      + R_DBASE       dst_base
-//      + R_DLEN        dst_len
-//      + R_DEND        dst_base + dst_len
-//      + R_SBASE       src_base
-//      + R_SLEN        src_len
-//      + R_SEND        src_base + src_len
-//      - R_TMP2        used by doCopy
-//      - R_TMP3        used by doCopy
-//
-// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
-// function, and after a CALL returns, and are not otherwise modified.
-//
-// The d variable is implicitly R_DST - R_DBASE,  and len(dst)-d is R_DEND - R_DST.
-// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
-TEXT ·s2Decode(SB), NOSPLIT, $56-56
-        // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
-        MOVD dst_base+0(FP), R_DBASE
-        MOVD dst_len+8(FP), R_DLEN
-        MOVD R_DBASE, R_DST
-        MOVD R_DBASE, R_DEND
-        ADD  R_DLEN, R_DEND, R_DEND
-        MOVD src_base+24(FP), R_SBASE
-        MOVD src_len+32(FP), R_SLEN
-        MOVD R_SBASE, R_SRC
-        MOVD R_SBASE, R_SEND
-        ADD  R_SLEN, R_SEND, R_SEND
-        MOVD $0, R_OFF
-loop:
-        // for s < len(src)
-        CMP R_SEND, R_SRC
-        BEQ end
-        // R_LEN = uint32(src[s])
-        //
-        // switch src[s] & 0x03 (0 is a literal tag; 1, 2 and 3 are copy tags)
-        MOVBU (R_SRC), R_LEN
-        MOVW  R_LEN, R_TMP1
-        ANDW  $3, R_TMP1
-        MOVW  $1, R1
-        CMPW  R1, R_TMP1
-        BGE   tagCopy
-        // ----------------------------------------
-        // The code below handles literal tags.
-        // case tagLiteral:
-        // x := uint32(src[s] >> 2)
-        // switch
-        MOVW $60, R1
-        LSRW $2, R_LEN, R_LEN
-        CMPW R_LEN, R1
-        BLS  tagLit60Plus
-        // case x < 60:
-        // s++
-        ADD $1, R_SRC, R_SRC
-doLit:
-        // This is the end of the inner "switch", when we have a literal tag.
-        //
-        // We assume that R_LEN == x and x fits in a uint32, where x is the variable
-        // used in the pure Go decode_other.go code.
-        // length = int(x) + 1
-        //
-        // Unlike the pure Go code, we don't need to check if length <= 0 because
-        // R_LEN can hold 64 bits, so the increment cannot overflow.
-        ADD $1, R_LEN, R_LEN
-        // Prepare to check if copying length bytes will run past the end of dst or
-        // src.
-        //
-        // R_TMP0 = len(dst) - d
-        // R_TMP1 = len(src) - s
-        MOVD R_DEND, R_TMP0
-        SUB  R_DST, R_TMP0, R_TMP0
-        MOVD R_SEND, R_TMP1
-        SUB  R_SRC, R_TMP1, R_TMP1
-        // !!! Try a faster technique for short (16 or fewer bytes) copies.
-        //
-        // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
-        //   goto callMemmove // Fall back on calling runtime·memmove.
-        // }
-        //
-        // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
-        // against 21 instead of 16, because it cannot assume that all of its input
-        // is contiguous in memory and so it needs to leave enough source bytes to
-        // read the next tag without refilling buffers, but Go's Decode assumes
-        // contiguousness (the src argument is a []byte).
-        CMP $16, R_LEN
-        BGT callMemmove
-        CMP $16, R_TMP0
-        BLT callMemmove
-        CMP $16, R_TMP1
-        BLT callMemmove
-        // !!! Implement the copy from src to dst as a 16-byte load and store.
-        // (Decode's documentation says that dst and src must not overlap.)
-        //
-        // This always copies 16 bytes, instead of only length bytes, but that's
-        // OK. If the input is a valid Snappy encoding then subsequent iterations
-        // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
-        // non-nil error), so the overrun will be ignored.
-        //
-        // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
-        // 16-byte loads and stores. This technique probably wouldn't be as
-        // effective on architectures that are fussier about alignment.
-        LDP 0(R_SRC), (R_TMP2, R_TMP3)
-        STP (R_TMP2, R_TMP3), 0(R_DST)
-        // d += length
-        // s += length
-        ADD R_LEN, R_DST, R_DST
-        ADD R_LEN, R_SRC, R_SRC
-        B   loop
-callMemmove:
-        // if length > len(dst)-d || length > len(src)-s { etc }
-        CMP R_TMP0, R_LEN
-        BGT errCorrupt
-        CMP R_TMP1, R_LEN
-        BGT errCorrupt
-        // copy(dst[d:], src[s:s+length])
-        //
-        // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
-        // R_DST, R_SRC and R_LEN as arguments. We also spill those three registers
-        // and R_OFF to the stack, to save local variables across the CALL.
-        MOVD R_DST, 8(RSP)
-        MOVD R_SRC, 16(RSP)
-        MOVD R_LEN, 24(RSP)
-        MOVD R_DST, 32(RSP)
-        MOVD R_SRC, 40(RSP)
-        MOVD R_LEN, 48(RSP)
-        MOVD R_OFF, 56(RSP)
-        CALL runtime·memmove(SB)
-        // Restore local variables: unspill registers from the stack and
-        // re-calculate R_DBASE-R_SEND.
-        MOVD 32(RSP), R_DST
-        MOVD 40(RSP), R_SRC
-        MOVD 48(RSP), R_LEN
-        MOVD 56(RSP), R_OFF
-        MOVD dst_base+0(FP), R_DBASE
-        MOVD dst_len+8(FP), R_DLEN
-        MOVD R_DBASE, R_DEND
-        ADD  R_DLEN, R_DEND, R_DEND
-        MOVD src_base+24(FP), R_SBASE
-        MOVD src_len+32(FP), R_SLEN
-        MOVD R_SBASE, R_SEND
-        ADD  R_SLEN, R_SEND, R_SEND
-        // d += length
-        // s += length
-        ADD R_LEN, R_DST, R_DST
-        ADD R_LEN, R_SRC, R_SRC
-        B   loop
-tagLit60Plus:
-        // !!! This fragment does the
-        //
-        // s += x - 58; if uint(s) > uint(len(src)) { etc }
-        //
-        // checks. In the asm version, we code it once instead of once per switch case.
-        ADD R_LEN, R_SRC, R_SRC
-        SUB $58, R_SRC, R_SRC
-        TEST_SRC()
-        // case x == 60:
-        MOVW $61, R1
-        CMPW R1, R_LEN
-        BEQ  tagLit61
-        BGT  tagLit62Plus
-        // x = uint32(src[s-1])
-        MOVBU -1(R_SRC), R_LEN
-        B     doLit
-tagLit61:
-        // case x == 61:
-        // x = uint32(src[s-2]) | uint32(src[s-1])<<8
-        MOVHU -2(R_SRC), R_LEN
-        B     doLit
-tagLit62Plus:
-        CMPW $62, R_LEN
-        BHI  tagLit63
-        // case x == 62:
-        // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-        MOVHU -3(R_SRC), R_LEN
-        MOVBU -1(R_SRC), R_TMP1
-        ORR   R_TMP1<<16, R_LEN
-        B     doLit
-tagLit63:
-        // case x == 63:
-        // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-        MOVWU -4(R_SRC), R_LEN
-        B     doLit
-        // The code above handles literal tags.
-        // ----------------------------------------
-        // The code below handles copy tags.
-tagCopy4:
-        // case tagCopy4:
-        // s += 5
-        ADD $5, R_SRC, R_SRC
-        // if uint(s) > uint(len(src)) { etc } -- same check as TEST_SRC, done on lengths
-        MOVD R_SRC, R_TMP1
-        SUB  R_SBASE, R_TMP1, R_TMP1
-        CMP  R_SLEN, R_TMP1
-        BGT  errCorrupt
-        // length = 1 + int(src[s-5])>>2
-        MOVD $1, R1
-        ADD  R_LEN>>2, R1, R_LEN
-        // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
-        MOVWU -4(R_SRC), R_OFF
-        B     doCopy
-tagCopy2:
-        // case tagCopy2:
-        // s += 3
-        ADD $3, R_SRC, R_SRC
-        // if uint(s) > uint(len(src)) { etc }
-        TEST_SRC()
-        // length = 1 + int(src[s-3])>>2
-        MOVD $1, R1
-        ADD  R_LEN>>2, R1, R_LEN
-        // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
-        MOVHU -2(R_SRC), R_OFF
-        B     doCopy
-tagCopy:
-        // We have a copy tag. We assume that:
-        //      - R_TMP1 == src[s] & 0x03
-        //      - R_LEN == src[s]
-        CMP $2, R_TMP1
-        BEQ tagCopy2
-        BGT tagCopy4
-        // case tagCopy1:
-        // s += 2
-        ADD $2, R_SRC, R_SRC
-        // if uint(s) > uint(len(src)) { etc }
-        TEST_SRC()
-        // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
-        // Calculate offset in R_TMP0 in case it is a repeat.
-        MOVD  R_LEN, R_TMP0
-        AND   $0xe0, R_TMP0
-        MOVBU -1(R_SRC), R_TMP1
-        ORR   R_TMP0<<3, R_TMP1, R_TMP0
-        // length = 4 + int(src[s-2])>>2&0x7
-        MOVD $7, R1
-        AND  R_LEN>>2, R1, R_LEN
-        ADD  $4, R_LEN, R_LEN
-        // check if repeat code with offset 0.
-        CMP $0, R_TMP0
-        BEQ repeatCode
-        // This is a regular copy, transfer our temporary value to R_OFF (offset)
-        MOVD R_TMP0, R_OFF
-        B    doCopy
-        // This is a repeat code.
-repeatCode:
-        // If length < 9, reuse last offset, with the length already calculated.
-        CMP $9, R_LEN
-        BLT doCopyRepeat
-        BEQ repeatLen1
-        CMP $10, R_LEN
-        BEQ repeatLen2
-repeatLen3:
-        // s +=3
-        ADD $3, R_SRC, R_SRC
-        // if uint(s) > uint(len(src)) { etc }
-        TEST_SRC()
-        // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
-        MOVBU -1(R_SRC), R_TMP0
-        MOVHU -3(R_SRC), R_LEN
-        ORR   R_TMP0<<16, R_LEN, R_LEN
-        ADD   $65540, R_LEN, R_LEN
-        B     doCopyRepeat
-repeatLen2:
-        // s +=2
-        ADD $2, R_SRC, R_SRC
-        // if uint(s) > uint(len(src)) { etc }
-        TEST_SRC()
-        // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
-        MOVHU -2(R_SRC), R_LEN
-        ADD   $260, R_LEN, R_LEN
-        B     doCopyRepeat
-repeatLen1:
-        // s +=1
-        ADD $1, R_SRC, R_SRC
-        // if uint(s) > uint(len(src)) { etc }
-        TEST_SRC()
-        // length = src[s-1] + 8
-        MOVBU -1(R_SRC), R_LEN
-        ADD   $8, R_LEN, R_LEN
-        B     doCopyRepeat
-doCopy:
-        // This is the end of the outer "switch", when we have a copy tag.
-        //
-        // We assume that:
-        //      - R_LEN == length && R_LEN > 0
-        //      - R_OFF == offset
-        // if d < offset { etc }
-        MOVD R_DST, R_TMP1
-        SUB  R_DBASE, R_TMP1, R_TMP1
-        CMP  R_OFF, R_TMP1
-        BLT  errCorrupt
-        // Repeat values can skip the test above, since any offset > 0 will be in dst.
-doCopyRepeat:
-        // if offset <= 0 { etc }
-        CMP $0, R_OFF
-        BLE errCorrupt
-        // if length > len(dst)-d { etc }
-        MOVD R_DEND, R_TMP1
-        SUB  R_DST, R_TMP1, R_TMP1
-        CMP  R_TMP1, R_LEN
-        BGT  errCorrupt
-        // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
-        //
-        // Set:
-        //      - R_TMP2 = len(dst)-d
-        //      - R_TMP3 = &dst[d-offset]
-        MOVD R_DEND, R_TMP2
-        SUB  R_DST, R_TMP2, R_TMP2
-        MOVD R_DST, R_TMP3
-        SUB  R_OFF, R_TMP3, R_TMP3
-        // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
-        //
-        // First, try using two 8-byte load/stores, similar to the doLit technique
-        // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
-        // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
-        // and not one 16-byte load/store, and the first store has to be before the
-        // second load, due to the overlap if offset is in the range [8, 16).
-        //
-        // if length > 16 || offset < 8 || len(dst)-d < 16 {
-        //   goto slowForwardCopy
-        // }
-        // copy 16 bytes
-        // d += length
-        CMP  $16, R_LEN
-        BGT  slowForwardCopy
-        CMP  $8, R_OFF
-        BLT  slowForwardCopy
-        CMP  $16, R_TMP2
-        BLT  slowForwardCopy
-        MOVD 0(R_TMP3), R_TMP0
-        MOVD R_TMP0, 0(R_DST)
-        MOVD 8(R_TMP3), R_TMP1
-        MOVD R_TMP1, 8(R_DST)
-        ADD  R_LEN, R_DST, R_DST
-        B    loop
-slowForwardCopy:
-        // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
-        // can still try 8-byte load stores, provided we can overrun up to 10 extra
-        // bytes. As above, the overrun will be fixed up by subsequent iterations
-        // of the outermost loop.
-        //
-        // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
-        // commentary says:
-        //
-        // ----
-        //
-        // The main part of this loop is a simple copy of eight bytes at a time
-        // until we've copied (at least) the requested amount of bytes.  However,
-        // if d and d-offset are less than eight bytes apart (indicating a
-        // repeating pattern of length < 8), we first need to expand the pattern in
-        // order to get the correct results. For instance, if the buffer looks like
-        // this, with the eight-byte <d-offset> and <d> patterns marked as
-        // intervals:
-        //
-        //    abxxxxxxxxxxxx
-        //    [------]           d-offset
-        //      [------]         d
-        //
-        // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
-        // once, after which we can move <d> two bytes without moving <d-offset>:
-        //
-        //    ababxxxxxxxxxx
-        //    [------]           d-offset
-        //        [------]       d
-        //
-        // and repeat the exercise until the two no longer overlap.
-        //
-        // This allows us to do very well in the special case of one single byte
-        // repeated many times, without taking a big hit for more general cases.
-        //
-        // The worst case of extra writing past the end of the match occurs when
-        // offset == 1 and length == 1; the last copy will read from byte positions
-        // [0..7] and write to [4..11], whereas it was only supposed to write to
-        // position 1. Thus, ten excess bytes.
-        //
-        // ----
-        //
-        // That "10 byte overrun" worst case is confirmed by Go's
-        // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
-        // and finishSlowForwardCopy algorithm.
-        //
-        // if length > len(dst)-d-10 {
-        //   goto verySlowForwardCopy
-        // }
-        SUB $10, R_TMP2, R_TMP2
-        CMP R_TMP2, R_LEN
-        BGT verySlowForwardCopy
-        // We want to keep the offset in R_OFF, so we use a copy in R_TMP2 from here.
-        MOVD R_OFF, R_TMP2
-makeOffsetAtLeast8:
-        // !!! As above, expand the pattern so that offset >= 8 and we can use
-        // 8-byte load/stores.
-        //
-        // for offset < 8 {
-        //   copy 8 bytes from dst[d-offset:] to dst[d:]
-        //   length -= offset
-        //   d      += offset
-        //   offset += offset
-        //   // The two previous lines together means that d-offset, and therefore
-        //   // R_TMP3, is unchanged.
-        // }
-        CMP  $8, R_TMP2
-        BGE  fixUpSlowForwardCopy
-        MOVD (R_TMP3), R_TMP1
-        MOVD R_TMP1, (R_DST)
-        SUB  R_TMP2, R_LEN, R_LEN
-        ADD  R_TMP2, R_DST, R_DST
-        ADD  R_TMP2, R_TMP2, R_TMP2
-        B    makeOffsetAtLeast8
-fixUpSlowForwardCopy:
-        // !!! Add length (which might be negative now) to d (implied by R_DST being
-        // &dst[d]) so that d ends up at the right place when we jump back to the
-        // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
-        // length is positive, copying the remaining length bytes will write to the
-        // right place.
-        MOVD R_DST, R_TMP0
-        ADD  R_LEN, R_DST, R_DST
-finishSlowForwardCopy:
-        // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
-        // length means that we overrun, but as above, that will be fixed up by
-        // subsequent iterations of the outermost loop.
-        MOVD $0, R1
-        CMP  R1, R_LEN
-        BLE  loop
-        MOVD (R_TMP3), R_TMP1
-        MOVD R_TMP1, (R_TMP0)
-        ADD  $8, R_TMP3, R_TMP3
-        ADD  $8, R_TMP0, R_TMP0
-        SUB  $8, R_LEN, R_LEN
-        B    finishSlowForwardCopy
-verySlowForwardCopy:
-        // verySlowForwardCopy is a simple implementation of forward copy. In C
-        // parlance, this is a do/while loop instead of a while loop, since we know
-        // that length > 0. In Go syntax:
-        //
-        // for {
-        //   dst[d] = dst[d - offset]
-        //   d++
-        //   length--
-        //   if length == 0 {
-        //     break
-        //   }
-        // }
-        MOVB (R_TMP3), R_TMP1
-        MOVB R_TMP1, (R_DST)
-        ADD  $1, R_TMP3, R_TMP3
-        ADD  $1, R_DST, R_DST
-        SUB  $1, R_LEN, R_LEN
-        CBNZ R_LEN, verySlowForwardCopy
-        B    loop
-        // The code above handles copy tags.
-        // ----------------------------------------
-end:
-        // This is the end of the "for s < len(src)".
-        //
-        // if d != len(dst) { etc }
-        CMP R_DEND, R_DST
-        BNE errCorrupt
-        // return 0
-        MOVD $0, ret+48(FP)
-        RET
-errCorrupt:
-        // return decodeErrCodeCorrupt (1)
-        MOVD $1, R_TMP0
-        MOVD R_TMP0, ret+48(FP)
-        RET

diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s deleted file mode 100644 index 4b63d50..0000000 --- a/vendor/github.com/klauspost/compress/s2/decode_arm64.s +++ /dev/null
@@ -1,574 +0,0 @@
1	// Copyright 2020 The Go Authors. All rights reserved.
2	// Use of this source code is governed by a BSD-style
3	// license that can be found in the LICENSE file.
4
5	// +build !appengine
6	// +build gc
7	// +build !noasm
8
9	#include "textflag.h"
10
11	#define R_TMP0 R2
12	#define R_TMP1 R3
13	#define R_LEN R4
14	#define R_OFF R5
15	#define R_SRC R6
16	#define R_DST R7
17	#define R_DBASE R8
18	#define R_DLEN R9
19	#define R_DEND R10
20	#define R_SBASE R11
21	#define R_SLEN R12
22	#define R_SEND R13
23	#define R_TMP2 R14
24	#define R_TMP3 R15
25
26	// TEST_SRC branches to errCorrupt when R_SRC > R_SEND, i.e. when s > len(src).
27	#define TEST_SRC() \
28	CMP R_SEND, R_SRC \
29	BGT errCorrupt
30
31	// MOVD R_SRC, R_TMP1
32	// SUB R_SBASE, R_TMP1, R_TMP1
33	// CMP R_SLEN, R_TMP1
34	// BGT errCorrupt
35
36	// The asm code generally follows the pure Go code in decode_other.go, except
37	// where marked with a "!!!".
38
39	// func s2Decode(dst, src []byte) int // returns 0 on success, 1 (decodeErrCodeCorrupt) on corrupt input
40	//
41	// All local variables fit into registers. The non-zero stack size is only to
42	// spill registers and push args when issuing a CALL. The register allocation:
43	// - R_TMP0 scratch
44	// - R_TMP1 scratch (R1 is additionally used as scratch for constants)
45	// - R_LEN length or x
46	// - R_OFF offset
47	// - R_SRC &src[s]
48	// - R_DST &dst[d]
49	// + R_DBASE dst_base
50	// + R_DLEN dst_len
51	// + R_DEND dst_base + dst_len
52	// + R_SBASE src_base
53	// + R_SLEN src_len
54	// + R_SEND src_base + src_len
55	// - R_TMP2 used by doCopy
56	// - R_TMP3 used by doCopy
57	//
58	// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
59	// function, and after a CALL returns, and are not otherwise modified.
60	//
61	// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
62	// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
63	TEXT ·s2Decode(SB), NOSPLIT, $56-56
64	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
65	MOVD dst_base+0(FP), R_DBASE
66	MOVD dst_len+8(FP), R_DLEN
67	MOVD R_DBASE, R_DST
68	MOVD R_DBASE, R_DEND
69	ADD R_DLEN, R_DEND, R_DEND
70	MOVD src_base+24(FP), R_SBASE
71	MOVD src_len+32(FP), R_SLEN
72	MOVD R_SBASE, R_SRC
73	MOVD R_SBASE, R_SEND
74	ADD R_SLEN, R_SEND, R_SEND
75	MOVD $0, R_OFF
76
77	loop:
78	// for s < len(src)
79	CMP R_SEND, R_SRC
80	BEQ end
81
82	// R_LEN = uint32(src[s])
83	//
84	// switch src[s] & 0x03 (0 is a literal tag; 1, 2 and 3 are copy tags)
85	MOVBU (R_SRC), R_LEN
86	MOVW R_LEN, R_TMP1
87	ANDW $3, R_TMP1
88	MOVW $1, R1
89	CMPW R1, R_TMP1
90	BGE tagCopy
91
92	// ----------------------------------------
93	// The code below handles literal tags.
94
95	// case tagLiteral:
96	// x := uint32(src[s] >> 2)
97	// switch
98	MOVW $60, R1
99	LSRW $2, R_LEN, R_LEN
100	CMPW R_LEN, R1
101	BLS tagLit60Plus
102
103	// case x < 60:
104	// s++
105	ADD $1, R_SRC, R_SRC
106
107	doLit:
108	// This is the end of the inner "switch", when we have a literal tag.
109	//
110	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
111	// used in the pure Go decode_other.go code.
112
113	// length = int(x) + 1
114	//
115	// Unlike the pure Go code, we don't need to check if length <= 0 because
116	// R_LEN can hold 64 bits, so the increment cannot overflow.
117	ADD $1, R_LEN, R_LEN
118
119	// Prepare to check if copying length bytes will run past the end of dst or
120	// src.
121	//
122	// R_TMP0 = len(dst) - d
123	// R_TMP1 = len(src) - s
124	MOVD R_DEND, R_TMP0
125	SUB R_DST, R_TMP0, R_TMP0
126	MOVD R_SEND, R_TMP1
127	SUB R_SRC, R_TMP1, R_TMP1
128
129	// !!! Try a faster technique for short (16 or fewer bytes) copies.
130	//
131	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
132	// goto callMemmove // Fall back on calling runtime·memmove.
133	// }
134	//
135	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
136	// against 21 instead of 16, because it cannot assume that all of its input
137	// is contiguous in memory and so it needs to leave enough source bytes to
138	// read the next tag without refilling buffers, but Go's Decode assumes
139	// contiguousness (the src argument is a []byte).
140	CMP $16, R_LEN
141	BGT callMemmove
142	CMP $16, R_TMP0
143	BLT callMemmove
144	CMP $16, R_TMP1
145	BLT callMemmove
146
147	// !!! Implement the copy from src to dst as a 16-byte load and store.
148	// (Decode's documentation says that dst and src must not overlap.)
149	//
150	// This always copies 16 bytes, instead of only length bytes, but that's
151	// OK. If the input is a valid Snappy encoding then subsequent iterations
152	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
153	// non-nil error), so the overrun will be ignored.
154	//
155	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
156	// 16-byte loads and stores. This technique probably wouldn't be as
157	// effective on architectures that are fussier about alignment.
158	LDP 0(R_SRC), (R_TMP2, R_TMP3)
159	STP (R_TMP2, R_TMP3), 0(R_DST)
160
161	// d += length
162	// s += length
163	ADD R_LEN, R_DST, R_DST
164	ADD R_LEN, R_SRC, R_SRC
165	B loop
166
167	callMemmove:
168	// if length > len(dst)-d || length > len(src)-s { etc }
169	CMP R_TMP0, R_LEN
170	BGT errCorrupt
171	CMP R_TMP1, R_LEN
172	BGT errCorrupt
173
174	// copy(dst[d:], src[s:s+length])
175	//
176	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
177	// R_DST, R_SRC and R_LEN as arguments. We also spill those three registers
178	// and R_OFF to the stack, to save local variables across the CALL.
179	MOVD R_DST, 8(RSP)
180	MOVD R_SRC, 16(RSP)
181	MOVD R_LEN, 24(RSP)
182	MOVD R_DST, 32(RSP)
183	MOVD R_SRC, 40(RSP)
184	MOVD R_LEN, 48(RSP)
185	MOVD R_OFF, 56(RSP)
186	CALL runtime·memmove(SB)
187
188	// Restore local variables: unspill registers from the stack and
189	// re-calculate R_DBASE-R_SEND.
190	MOVD 32(RSP), R_DST
191	MOVD 40(RSP), R_SRC
192	MOVD 48(RSP), R_LEN
193	MOVD 56(RSP), R_OFF
194	MOVD dst_base+0(FP), R_DBASE
195	MOVD dst_len+8(FP), R_DLEN
196	MOVD R_DBASE, R_DEND
197	ADD R_DLEN, R_DEND, R_DEND
198	MOVD src_base+24(FP), R_SBASE
199	MOVD src_len+32(FP), R_SLEN
200	MOVD R_SBASE, R_SEND
201	ADD R_SLEN, R_SEND, R_SEND
202
203	// d += length
204	// s += length
205	ADD R_LEN, R_DST, R_DST
206	ADD R_LEN, R_SRC, R_SRC
207	B loop
208
209	tagLit60Plus:
210	// !!! This fragment does the
211	//
212	// s += x - 58; if uint(s) > uint(len(src)) { etc }
213	//
214	// checks. In the asm version, we code it once instead of once per switch case.
215	ADD R_LEN, R_SRC, R_SRC
216	SUB $58, R_SRC, R_SRC
217	TEST_SRC()
218
219	// case x == 60:
220	MOVW $61, R1
221	CMPW R1, R_LEN
222	BEQ tagLit61
223	BGT tagLit62Plus
224
225	// x = uint32(src[s-1])
226	MOVBU -1(R_SRC), R_LEN
227	B doLit
228
229	tagLit61:
230	// case x == 61:
231	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
232	MOVHU -2(R_SRC), R_LEN
233	B doLit
234
235	tagLit62Plus:
236	CMPW $62, R_LEN
237	BHI tagLit63
238
239	// case x == 62:
240	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
241	MOVHU -3(R_SRC), R_LEN
242	MOVBU -1(R_SRC), R_TMP1
243	ORR R_TMP1<<16, R_LEN
244	B doLit
245
246	tagLit63:
247	// case x == 63:
248	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
249	MOVWU -4(R_SRC), R_LEN
250	B doLit
251
252	// The code above handles literal tags.
253	// ----------------------------------------
254	// The code below handles copy tags.
255
256	tagCopy4:
257	// case tagCopy4:
258	// s += 5
259	ADD $5, R_SRC, R_SRC
260
261	// if uint(s) > uint(len(src)) { etc } -- same check as TEST_SRC, done on lengths
262	MOVD R_SRC, R_TMP1
263	SUB R_SBASE, R_TMP1, R_TMP1
264	CMP R_SLEN, R_TMP1
265	BGT errCorrupt
266
267	// length = 1 + int(src[s-5])>>2
268	MOVD $1, R1
269	ADD R_LEN>>2, R1, R_LEN
270
271	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
272	MOVWU -4(R_SRC), R_OFF
273	B doCopy
274
275	tagCopy2:
276	// case tagCopy2:
277	// s += 3
278	ADD $3, R_SRC, R_SRC
279
280	// if uint(s) > uint(len(src)) { etc }
281	TEST_SRC()
282
283	// length = 1 + int(src[s-3])>>2
284	MOVD $1, R1
285	ADD R_LEN>>2, R1, R_LEN
286
287	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
288	MOVHU -2(R_SRC), R_OFF
289	B doCopy
290
291	tagCopy:
292	// We have a copy tag. We assume that:
293	// - R_TMP1 == src[s] & 0x03
294	// - R_LEN == src[s]
295	CMP $2, R_TMP1
296	BEQ tagCopy2
297	BGT tagCopy4
298
299	// case tagCopy1:
300	// s += 2
301	ADD $2, R_SRC, R_SRC
302
303	// if uint(s) > uint(len(src)) { etc }
304	TEST_SRC()
305
306	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
307	// Calculate offset in R_TMP0 in case it is a repeat.
308	MOVD R_LEN, R_TMP0
309	AND $0xe0, R_TMP0
310	MOVBU -1(R_SRC), R_TMP1
311	ORR R_TMP0<<3, R_TMP1, R_TMP0
312
313	// length = 4 + int(src[s-2])>>2&0x7
314	MOVD $7, R1
315	AND R_LEN>>2, R1, R_LEN
316	ADD $4, R_LEN, R_LEN
317
318	// check if repeat code with offset 0.
319	CMP $0, R_TMP0
320	BEQ repeatCode
321
322	// This is a regular copy, transfer our temporary value to R_OFF (offset)
323	MOVD R_TMP0, R_OFF
324	B doCopy
325
326	// This is a repeat code.
327	repeatCode:
328	// If length < 9, reuse last offset, with the length already calculated.
329	CMP $9, R_LEN
330	BLT doCopyRepeat
331	BEQ repeatLen1
332	CMP $10, R_LEN
333	BEQ repeatLen2
334
335	repeatLen3:
336	// s +=3
337	ADD $3, R_SRC, R_SRC
338
339	// if uint(s) > uint(len(src)) { etc }
340	TEST_SRC()
341
342	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
343	MOVBU -1(R_SRC), R_TMP0
344	MOVHU -3(R_SRC), R_LEN
345	ORR R_TMP0<<16, R_LEN, R_LEN
346	ADD $65540, R_LEN, R_LEN
347	B doCopyRepeat
348
349	repeatLen2:
350	// s +=2
351	ADD $2, R_SRC, R_SRC
352
353	// if uint(s) > uint(len(src)) { etc }
354	TEST_SRC()
355
356	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
357	MOVHU -2(R_SRC), R_LEN
358	ADD $260, R_LEN, R_LEN
359	B doCopyRepeat
360
361	repeatLen1:
362	// s +=1
363	ADD $1, R_SRC, R_SRC
364
365	// if uint(s) > uint(len(src)) { etc }
366	TEST_SRC()
367
368	// length = src[s-1] + 8
369	MOVBU -1(R_SRC), R_LEN
370	ADD $8, R_LEN, R_LEN
371	B doCopyRepeat
372
373	doCopy:
374	// This is the end of the outer "switch", when we have a copy tag.
375	//
376	// We assume that:
377	// - R_LEN == length && R_LEN > 0
378	// - R_OFF == offset
379
380	// if d < offset { etc }
381	MOVD R_DST, R_TMP1
382	SUB R_DBASE, R_TMP1, R_TMP1
383	CMP R_OFF, R_TMP1
384	BLT errCorrupt
385
386	// Repeat values can skip the test above, since any offset > 0 will be in dst.
387	doCopyRepeat:
388
389	// if offset <= 0 { etc }
390	CMP $0, R_OFF
391	BLE errCorrupt
392
393	// if length > len(dst)-d { etc }
394	MOVD R_DEND, R_TMP1
395	SUB R_DST, R_TMP1, R_TMP1
396	CMP R_TMP1, R_LEN
397	BGT errCorrupt
398
399	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
400	//
401	// Set:
402	// - R_TMP2 = len(dst)-d
403	// - R_TMP3 = &dst[d-offset]
404	MOVD R_DEND, R_TMP2
405	SUB R_DST, R_TMP2, R_TMP2
406	MOVD R_DST, R_TMP3
407	SUB R_OFF, R_TMP3, R_TMP3
408
409	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
410	//
411	// First, try using two 8-byte load/stores, similar to the doLit technique
412	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
413	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
414	// and not one 16-byte load/store, and the first store has to be before the
415	// second load, due to the overlap if offset is in the range [8, 16).
416	//
417	// if length > 16 || offset < 8 || len(dst)-d < 16 {
418	// goto slowForwardCopy
419	// }
420	// copy 16 bytes
421	// d += length
422	CMP $16, R_LEN
423	BGT slowForwardCopy
424	CMP $8, R_OFF
425	BLT slowForwardCopy
426	CMP $16, R_TMP2
427	BLT slowForwardCopy
428	MOVD 0(R_TMP3), R_TMP0
429	MOVD R_TMP0, 0(R_DST)
430	MOVD 8(R_TMP3), R_TMP1
431	MOVD R_TMP1, 8(R_DST)
432	ADD R_LEN, R_DST, R_DST
433	B loop
434
435	slowForwardCopy:
436	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
437	// can still try 8-byte load stores, provided we can overrun up to 10 extra
438	// bytes. As above, the overrun will be fixed up by subsequent iterations
439	// of the outermost loop.
440	//
441	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
442	// commentary says:
443	//
444	// ----
445	//
446	// The main part of this loop is a simple copy of eight bytes at a time
447	// until we've copied (at least) the requested amount of bytes. However,
448	// if d and d-offset are less than eight bytes apart (indicating a
449	// repeating pattern of length < 8), we first need to expand the pattern in
450	// order to get the correct results. For instance, if the buffer looks like
451	// this, with the eight-byte <d-offset> and <d> patterns marked as
452	// intervals:
453	//
454	// abxxxxxxxxxxxx
455	// [------] d-offset
456	// [------] d
457	//
458	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
459	// once, after which we can move <d> two bytes without moving <d-offset>:
460	//
461	// ababxxxxxxxxxx
462	// [------] d-offset
463	// [------] d
464	//
465	// and repeat the exercise until the two no longer overlap.
466	//
467	// This allows us to do very well in the special case of one single byte
468	// repeated many times, without taking a big hit for more general cases.
469	//
470	// The worst case of extra writing past the end of the match occurs when
471	// offset == 1 and length == 1; the last copy will read from byte positions
472	// [0..7] and write to [4..11], whereas it was only supposed to write to
473	// position 1. Thus, ten excess bytes.
474	//
475	// ----
476	//
477	// That "10 byte overrun" worst case is confirmed by Go's
478	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
479	// and finishSlowForwardCopy algorithm.
480	//
481	// if length > len(dst)-d-10 {
482	// goto verySlowForwardCopy
483	// }
484	SUB $10, R_TMP2, R_TMP2
485	CMP R_TMP2, R_LEN
486	BGT verySlowForwardCopy
487
488	// We want to keep the offset in R_OFF, so we use a copy in R_TMP2 from here.
489	MOVD R_OFF, R_TMP2
490
491	makeOffsetAtLeast8:
492	// !!! As above, expand the pattern so that offset >= 8 and we can use
493	// 8-byte load/stores.
494	//
495	// for offset < 8 {
496	// copy 8 bytes from dst[d-offset:] to dst[d:]
497	// length -= offset
498	// d += offset
499	// offset += offset
500	// // The two previous lines together means that d-offset, and therefore
501	// // R_TMP3, is unchanged.
502	// }
503	CMP $8, R_TMP2
504	BGE fixUpSlowForwardCopy
505	MOVD (R_TMP3), R_TMP1
506	MOVD R_TMP1, (R_DST)
507	SUB R_TMP2, R_LEN, R_LEN
508	ADD R_TMP2, R_DST, R_DST
509	ADD R_TMP2, R_TMP2, R_TMP2
510	B makeOffsetAtLeast8
511
512	fixUpSlowForwardCopy:
513	// !!! Add length (which might be negative now) to d (implied by R_DST being
514	// &dst[d]) so that d ends up at the right place when we jump back to the
515	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
516	// length is positive, copying the remaining length bytes will write to the
517	// right place.
518	MOVD R_DST, R_TMP0
519	ADD R_LEN, R_DST, R_DST
520
521	finishSlowForwardCopy:
522	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
523	// length means that we overrun, but as above, that will be fixed up by
524	// subsequent iterations of the outermost loop.
525	MOVD $0, R1
526	CMP R1, R_LEN
527	BLE loop
528	MOVD (R_TMP3), R_TMP1
529	MOVD R_TMP1, (R_TMP0)
530	ADD $8, R_TMP3, R_TMP3
531	ADD $8, R_TMP0, R_TMP0
532	SUB $8, R_LEN, R_LEN
533	B finishSlowForwardCopy
534
535	verySlowForwardCopy:
536	// verySlowForwardCopy is a simple implementation of forward copy. In C
537	// parlance, this is a do/while loop instead of a while loop, since we know
538	// that length > 0. In Go syntax:
539	//
540	// for {
541	// dst[d] = dst[d - offset]
542	// d++
543	// length--
544	// if length == 0 {
545	// break
546	// }
547	// }
548	MOVB (R_TMP3), R_TMP1
549	MOVB R_TMP1, (R_DST)
550	ADD $1, R_TMP3, R_TMP3
551	ADD $1, R_DST, R_DST
552	SUB $1, R_LEN, R_LEN
553	CBNZ R_LEN, verySlowForwardCopy
554	B loop
555
556	// The code above handles copy tags.
557	// ----------------------------------------
558
559	end:
560	// This is the end of the "for s < len(src)".
561	//
562	// if d != len(dst) { etc }
563	CMP R_DEND, R_DST
564	BNE errCorrupt
565
566	// return 0
567	MOVD $0, ret+48(FP)
568	RET
569
570	errCorrupt:
571	// return decodeErrCodeCorrupt (1)
572	MOVD $1, R_TMP0
573	MOVD R_TMP0, ret+48(FP)
574	RET