diff options
| author | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
|---|---|---|
| committer | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
| commit | 404aeae4545d2426c089a5f8d5e82dae56f5212b (patch) | |
| tree | 2d84e00af272b39fc04f3795ae06bc48970e57b5 /vendor/github.com/klauspost/compress/s2/decode_arm64.s | |
| parent | 209d8b0187ed025dec9ac149ebcced3462877bff (diff) | |
| download | gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.tar.gz gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.zip | |
Make Nix builds work
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_arm64.s')
| -rw-r--r-- | vendor/github.com/klauspost/compress/s2/decode_arm64.s | 574 |
1 files changed, 574 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s new file mode 100644 index 0000000..4b63d50 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s | |||
| @@ -0,0 +1,574 @@ | |||
| 1 | // Copyright 2020 The Go Authors. All rights reserved. | ||
| 2 | // Use of this source code is governed by a BSD-style | ||
| 3 | // license that can be found in the LICENSE file. | ||
| 4 | |||
| 5 | // +build !appengine | ||
| 6 | // +build gc | ||
| 7 | // +build !noasm | ||
| 8 | |||
| 9 | #include "textflag.h" | ||
| 10 | |||
| 11 | #define R_TMP0 R2 | ||
| 12 | #define R_TMP1 R3 | ||
| 13 | #define R_LEN R4 | ||
| 14 | #define R_OFF R5 | ||
| 15 | #define R_SRC R6 | ||
| 16 | #define R_DST R7 | ||
| 17 | #define R_DBASE R8 | ||
| 18 | #define R_DLEN R9 | ||
| 19 | #define R_DEND R10 | ||
| 20 | #define R_SBASE R11 | ||
| 21 | #define R_SLEN R12 | ||
| 22 | #define R_SEND R13 | ||
| 23 | #define R_TMP2 R14 | ||
| 24 | #define R_TMP3 R15 | ||
| 25 | |||
| 26 | // TEST_SRC checks that R_SRC <= R_SEND, branching to errCorrupt otherwise | ||
| 27 | #define TEST_SRC() \ | ||
| 28 | CMP R_SEND, R_SRC \ | ||
| 29 | BGT errCorrupt | ||
| 30 | |||
| 31 | // MOVD R_SRC, R_TMP1 | ||
| 32 | // SUB R_SBASE, R_TMP1, R_TMP1 | ||
| 33 | // CMP R_SLEN, R_TMP1 | ||
| 34 | // BGT errCorrupt | ||
| 35 | |||
| 36 | // The asm code generally follows the pure Go code in decode_other.go, except | ||
| 37 | // where marked with a "!!!". | ||
| 38 | |||
| 39 | // func decode(dst, src []byte) int | ||
| 40 | // | ||
| 41 | // All local variables fit into registers. The non-zero stack size is only to | ||
| 42 | // spill registers and push args when issuing a CALL. The register allocation: | ||
| 43 | // - R_TMP0 scratch | ||
| 44 | // - R_TMP1 scratch | ||
| 45 | // - R_LEN length or x | ||
| 46 | // - R_OFF offset | ||
| 47 | // - R_SRC &src[s] | ||
| 48 | // - R_DST &dst[d] | ||
| 49 | // + R_DBASE dst_base | ||
| 50 | // + R_DLEN dst_len | ||
| 51 | // + R_DEND dst_base + dst_len | ||
| 52 | // + R_SBASE src_base | ||
| 53 | // + R_SLEN src_len | ||
| 54 | // + R_SEND src_base + src_len | ||
| 55 | // - R_TMP2 used by doCopy | ||
| 56 | // - R_TMP3 used by doCopy | ||
| 57 | // | ||
| 58 | // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the | ||
| 59 | // function, and after a CALL returns, and are not otherwise modified. | ||
| 60 | // | ||
| 61 | // The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. | ||
| 62 | // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. | ||
| 63 | TEXT ·s2Decode(SB), NOSPLIT, $56-64 | ||
| 64 | // Initialize R_SRC, R_DST and R_DBASE-R_SEND. | ||
| 65 | MOVD dst_base+0(FP), R_DBASE | ||
| 66 | MOVD dst_len+8(FP), R_DLEN | ||
| 67 | MOVD R_DBASE, R_DST | ||
| 68 | MOVD R_DBASE, R_DEND | ||
| 69 | ADD R_DLEN, R_DEND, R_DEND | ||
| 70 | MOVD src_base+24(FP), R_SBASE | ||
| 71 | MOVD src_len+32(FP), R_SLEN | ||
| 72 | MOVD R_SBASE, R_SRC | ||
| 73 | MOVD R_SBASE, R_SEND | ||
| 74 | ADD R_SLEN, R_SEND, R_SEND | ||
| 75 | MOVD $0, R_OFF | ||
| 76 | |||
| 77 | loop: | ||
| 78 | // for s < len(src) | ||
| 79 | CMP R_SEND, R_SRC | ||
| 80 | BEQ end | ||
| 81 | |||
| 82 | // R_LEN = uint32(src[s]) | ||
| 83 | // | ||
| 84 | // switch src[s] & 0x03 | ||
| 85 | MOVBU (R_SRC), R_LEN | ||
| 86 | MOVW R_LEN, R_TMP1 | ||
| 87 | ANDW $3, R_TMP1 | ||
| 88 | MOVW $1, R1 | ||
| 89 | CMPW R1, R_TMP1 | ||
| 90 | BGE tagCopy | ||
| 91 | |||
| 92 | // ---------------------------------------- | ||
| 93 | // The code below handles literal tags. | ||
| 94 | |||
| 95 | // case tagLiteral: | ||
| 96 | // x := uint32(src[s] >> 2) | ||
| 97 | // switch | ||
| 98 | MOVW $60, R1 | ||
| 99 | LSRW $2, R_LEN, R_LEN | ||
| 100 | CMPW R_LEN, R1 | ||
| 101 | BLS tagLit60Plus | ||
| 102 | |||
| 103 | // case x < 60: | ||
| 104 | // s++ | ||
| 105 | ADD $1, R_SRC, R_SRC | ||
| 106 | |||
| 107 | doLit: | ||
| 108 | // This is the end of the inner "switch", when we have a literal tag. | ||
| 109 | // | ||
| 110 | // We assume that R_LEN == x and x fits in a uint32, where x is the variable | ||
| 111 | // used in the pure Go decode_other.go code. | ||
| 112 | |||
| 113 | // length = int(x) + 1 | ||
| 114 | // | ||
| 115 | // Unlike the pure Go code, we don't need to check if length <= 0 because | ||
| 116 | // R_LEN can hold 64 bits, so the increment cannot overflow. | ||
| 117 | ADD $1, R_LEN, R_LEN | ||
| 118 | |||
| 119 | // Prepare to check if copying length bytes will run past the end of dst or | ||
| 120 | // src. | ||
| 121 | // | ||
| 122 | // R_TMP0 = len(dst) - d | ||
| 123 | // R_TMP1 = len(src) - s | ||
| 124 | MOVD R_DEND, R_TMP0 | ||
| 125 | SUB R_DST, R_TMP0, R_TMP0 | ||
| 126 | MOVD R_SEND, R_TMP1 | ||
| 127 | SUB R_SRC, R_TMP1, R_TMP1 | ||
| 128 | |||
| 129 | // !!! Try a faster technique for short (16 or fewer bytes) copies. | ||
| 130 | // | ||
| 131 | // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { | ||
| 132 | // goto callMemmove // Fall back on calling runtime·memmove. | ||
| 133 | // } | ||
| 134 | // | ||
| 135 | // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s | ||
| 136 | // against 21 instead of 16, because it cannot assume that all of its input | ||
| 137 | // is contiguous in memory and so it needs to leave enough source bytes to | ||
| 138 | // read the next tag without refilling buffers, but Go's Decode assumes | ||
| 139 | // contiguousness (the src argument is a []byte). | ||
| 140 | CMP $16, R_LEN | ||
| 141 | BGT callMemmove | ||
| 142 | CMP $16, R_TMP0 | ||
| 143 | BLT callMemmove | ||
| 144 | CMP $16, R_TMP1 | ||
| 145 | BLT callMemmove | ||
| 146 | |||
| 147 | // !!! Implement the copy from src to dst as a 16-byte load and store. | ||
| 148 | // (Decode's documentation says that dst and src must not overlap.) | ||
| 149 | // | ||
| 150 | // This always copies 16 bytes, instead of only length bytes, but that's | ||
| 151 | // OK. If the input is a valid Snappy encoding then subsequent iterations | ||
| 152 | // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a | ||
| 153 | // non-nil error), so the overrun will be ignored. | ||
| 154 | // | ||
| 155 | // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or | ||
| 156 | // 16-byte loads and stores. This technique probably wouldn't be as | ||
| 157 | // effective on architectures that are fussier about alignment. | ||
| 158 | LDP 0(R_SRC), (R_TMP2, R_TMP3) | ||
| 159 | STP (R_TMP2, R_TMP3), 0(R_DST) | ||
| 160 | |||
| 161 | // d += length | ||
| 162 | // s += length | ||
| 163 | ADD R_LEN, R_DST, R_DST | ||
| 164 | ADD R_LEN, R_SRC, R_SRC | ||
| 165 | B loop | ||
| 166 | |||
| 167 | callMemmove: | ||
| 168 | // if length > len(dst)-d || length > len(src)-s { etc } | ||
| 169 | CMP R_TMP0, R_LEN | ||
| 170 | BGT errCorrupt | ||
| 171 | CMP R_TMP1, R_LEN | ||
| 172 | BGT errCorrupt | ||
| 173 | |||
| 174 | // copy(dst[d:], src[s:s+length]) | ||
| 175 | // | ||
| 176 | // This means calling runtime·memmove(&dst[d], &src[s], length), so we push | ||
| 177 | // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those | ||
| 178 | // three registers to the stack, to save local variables across the CALL. | ||
| 179 | MOVD R_DST, 8(RSP) | ||
| 180 | MOVD R_SRC, 16(RSP) | ||
| 181 | MOVD R_LEN, 24(RSP) | ||
| 182 | MOVD R_DST, 32(RSP) | ||
| 183 | MOVD R_SRC, 40(RSP) | ||
| 184 | MOVD R_LEN, 48(RSP) | ||
| 185 | MOVD R_OFF, 56(RSP) | ||
| 186 | CALL runtime·memmove(SB) | ||
| 187 | |||
| 188 | // Restore local variables: unspill registers from the stack and | ||
| 189 | // re-calculate R_DBASE-R_SEND. | ||
| 190 | MOVD 32(RSP), R_DST | ||
| 191 | MOVD 40(RSP), R_SRC | ||
| 192 | MOVD 48(RSP), R_LEN | ||
| 193 | MOVD 56(RSP), R_OFF | ||
| 194 | MOVD dst_base+0(FP), R_DBASE | ||
| 195 | MOVD dst_len+8(FP), R_DLEN | ||
| 196 | MOVD R_DBASE, R_DEND | ||
| 197 | ADD R_DLEN, R_DEND, R_DEND | ||
| 198 | MOVD src_base+24(FP), R_SBASE | ||
| 199 | MOVD src_len+32(FP), R_SLEN | ||
| 200 | MOVD R_SBASE, R_SEND | ||
| 201 | ADD R_SLEN, R_SEND, R_SEND | ||
| 202 | |||
| 203 | // d += length | ||
| 204 | // s += length | ||
| 205 | ADD R_LEN, R_DST, R_DST | ||
| 206 | ADD R_LEN, R_SRC, R_SRC | ||
| 207 | B loop | ||
| 208 | |||
| 209 | tagLit60Plus: | ||
| 210 | // !!! This fragment does the | ||
| 211 | // | ||
| 212 | // s += x - 58; if uint(s) > uint(len(src)) { etc } | ||
| 213 | // | ||
| 214 | // checks. In the asm version, we code it once instead of once per switch case. | ||
| 215 | ADD R_LEN, R_SRC, R_SRC | ||
| 216 | SUB $58, R_SRC, R_SRC | ||
| 217 | TEST_SRC() | ||
| 218 | |||
| 219 | // case x == 60: | ||
| 220 | MOVW $61, R1 | ||
| 221 | CMPW R1, R_LEN | ||
| 222 | BEQ tagLit61 | ||
| 223 | BGT tagLit62Plus | ||
| 224 | |||
| 225 | // x = uint32(src[s-1]) | ||
| 226 | MOVBU -1(R_SRC), R_LEN | ||
| 227 | B doLit | ||
| 228 | |||
| 229 | tagLit61: | ||
| 230 | // case x == 61: | ||
| 231 | // x = uint32(src[s-2]) | uint32(src[s-1])<<8 | ||
| 232 | MOVHU -2(R_SRC), R_LEN | ||
| 233 | B doLit | ||
| 234 | |||
| 235 | tagLit62Plus: | ||
| 236 | CMPW $62, R_LEN | ||
| 237 | BHI tagLit63 | ||
| 238 | |||
| 239 | // case x == 62: | ||
| 240 | // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 | ||
| 241 | MOVHU -3(R_SRC), R_LEN | ||
| 242 | MOVBU -1(R_SRC), R_TMP1 | ||
| 243 | ORR R_TMP1<<16, R_LEN | ||
| 244 | B doLit | ||
| 245 | |||
| 246 | tagLit63: | ||
| 247 | // case x == 63: | ||
| 248 | // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 | ||
| 249 | MOVWU -4(R_SRC), R_LEN | ||
| 250 | B doLit | ||
| 251 | |||
| 252 | // The code above handles literal tags. | ||
| 253 | // ---------------------------------------- | ||
| 254 | // The code below handles copy tags. | ||
| 255 | |||
| 256 | tagCopy4: | ||
| 257 | // case tagCopy4: | ||
| 258 | // s += 5 | ||
| 259 | ADD $5, R_SRC, R_SRC | ||
| 260 | |||
| 261 | // if uint(s) > uint(len(src)) { etc } | ||
| 262 | MOVD R_SRC, R_TMP1 | ||
| 263 | SUB R_SBASE, R_TMP1, R_TMP1 | ||
| 264 | CMP R_SLEN, R_TMP1 | ||
| 265 | BGT errCorrupt | ||
| 266 | |||
| 267 | // length = 1 + int(src[s-5])>>2 | ||
| 268 | MOVD $1, R1 | ||
| 269 | ADD R_LEN>>2, R1, R_LEN | ||
| 270 | |||
| 271 | // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) | ||
| 272 | MOVWU -4(R_SRC), R_OFF | ||
| 273 | B doCopy | ||
| 274 | |||
| 275 | tagCopy2: | ||
| 276 | // case tagCopy2: | ||
| 277 | // s += 3 | ||
| 278 | ADD $3, R_SRC, R_SRC | ||
| 279 | |||
| 280 | // if uint(s) > uint(len(src)) { etc } | ||
| 281 | TEST_SRC() | ||
| 282 | |||
| 283 | // length = 1 + int(src[s-3])>>2 | ||
| 284 | MOVD $1, R1 | ||
| 285 | ADD R_LEN>>2, R1, R_LEN | ||
| 286 | |||
| 287 | // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) | ||
| 288 | MOVHU -2(R_SRC), R_OFF | ||
| 289 | B doCopy | ||
| 290 | |||
| 291 | tagCopy: | ||
| 292 | // We have a copy tag. We assume that: | ||
| 293 | // - R_TMP1 == src[s] & 0x03 | ||
| 294 | // - R_LEN == src[s] | ||
| 295 | CMP $2, R_TMP1 | ||
| 296 | BEQ tagCopy2 | ||
| 297 | BGT tagCopy4 | ||
| 298 | |||
| 299 | // case tagCopy1: | ||
| 300 | // s += 2 | ||
| 301 | ADD $2, R_SRC, R_SRC | ||
| 302 | |||
| 303 | // if uint(s) > uint(len(src)) { etc } | ||
| 304 | TEST_SRC() | ||
| 305 | |||
| 306 | // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) | ||
| 307 | // Calculate offset in R_TMP0 in case it is a repeat. | ||
| 308 | MOVD R_LEN, R_TMP0 | ||
| 309 | AND $0xe0, R_TMP0 | ||
| 310 | MOVBU -1(R_SRC), R_TMP1 | ||
| 311 | ORR R_TMP0<<3, R_TMP1, R_TMP0 | ||
| 312 | |||
| 313 | // length = 4 + int(src[s-2])>>2&0x7 | ||
| 314 | MOVD $7, R1 | ||
| 315 | AND R_LEN>>2, R1, R_LEN | ||
| 316 | ADD $4, R_LEN, R_LEN | ||
| 317 | |||
| 318 | // check if repeat code with offset 0. | ||
| 319 | CMP $0, R_TMP0 | ||
| 320 | BEQ repeatCode | ||
| 321 | |||
| 322 | // This is a regular copy, transfer our temporary value to R_OFF (offset) | ||
| 323 | MOVD R_TMP0, R_OFF | ||
| 324 | B doCopy | ||
| 325 | |||
| 326 | // This is a repeat code. | ||
| 327 | repeatCode: | ||
| 328 | // If length < 9, reuse last offset, with the length already calculated. | ||
| 329 | CMP $9, R_LEN | ||
| 330 | BLT doCopyRepeat | ||
| 331 | BEQ repeatLen1 | ||
| 332 | CMP $10, R_LEN | ||
| 333 | BEQ repeatLen2 | ||
| 334 | |||
| 335 | repeatLen3: | ||
| 336 | // s += 3 | ||
| 337 | ADD $3, R_SRC, R_SRC | ||
| 338 | |||
| 339 | // if uint(s) > uint(len(src)) { etc } | ||
| 340 | TEST_SRC() | ||
| 341 | |||
| 342 | // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 | ||
| 343 | MOVBU -1(R_SRC), R_TMP0 | ||
| 344 | MOVHU -3(R_SRC), R_LEN | ||
| 345 | ORR R_TMP0<<16, R_LEN, R_LEN | ||
| 346 | ADD $65540, R_LEN, R_LEN | ||
| 347 | B doCopyRepeat | ||
| 348 | |||
| 349 | repeatLen2: | ||
| 350 | // s += 2 | ||
| 351 | ADD $2, R_SRC, R_SRC | ||
| 352 | |||
| 353 | // if uint(s) > uint(len(src)) { etc } | ||
| 354 | TEST_SRC() | ||
| 355 | |||
| 356 | // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 | ||
| 357 | MOVHU -2(R_SRC), R_LEN | ||
| 358 | ADD $260, R_LEN, R_LEN | ||
| 359 | B doCopyRepeat | ||
| 360 | |||
| 361 | repeatLen1: | ||
| 362 | // s += 1 | ||
| 363 | ADD $1, R_SRC, R_SRC | ||
| 364 | |||
| 365 | // if uint(s) > uint(len(src)) { etc } | ||
| 366 | TEST_SRC() | ||
| 367 | |||
| 368 | // length = src[s-1] + 8 | ||
| 369 | MOVBU -1(R_SRC), R_LEN | ||
| 370 | ADD $8, R_LEN, R_LEN | ||
| 371 | B doCopyRepeat | ||
| 372 | |||
| 373 | doCopy: | ||
| 374 | // This is the end of the outer "switch", when we have a copy tag. | ||
| 375 | // | ||
| 376 | // We assume that: | ||
| 377 | // - R_LEN == length && R_LEN > 0 | ||
| 378 | // - R_OFF == offset | ||
| 379 | |||
| 380 | // if d < offset { etc } | ||
| 381 | MOVD R_DST, R_TMP1 | ||
| 382 | SUB R_DBASE, R_TMP1, R_TMP1 | ||
| 383 | CMP R_OFF, R_TMP1 | ||
| 384 | BLT errCorrupt | ||
| 385 | |||
| 386 | // Repeat values can skip the test above, since any offset > 0 will be in dst. | ||
| 387 | doCopyRepeat: | ||
| 388 | |||
| 389 | // if offset <= 0 { etc } | ||
| 390 | CMP $0, R_OFF | ||
| 391 | BLE errCorrupt | ||
| 392 | |||
| 393 | // if length > len(dst)-d { etc } | ||
| 394 | MOVD R_DEND, R_TMP1 | ||
| 395 | SUB R_DST, R_TMP1, R_TMP1 | ||
| 396 | CMP R_TMP1, R_LEN | ||
| 397 | BGT errCorrupt | ||
| 398 | |||
| 399 | // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length | ||
| 400 | // | ||
| 401 | // Set: | ||
| 402 | // - R_TMP2 = len(dst)-d | ||
| 403 | // - R_TMP3 = &dst[d-offset] | ||
| 404 | MOVD R_DEND, R_TMP2 | ||
| 405 | SUB R_DST, R_TMP2, R_TMP2 | ||
| 406 | MOVD R_DST, R_TMP3 | ||
| 407 | SUB R_OFF, R_TMP3, R_TMP3 | ||
| 408 | |||
| 409 | // !!! Try a faster technique for short (16 or fewer bytes) forward copies. | ||
| 410 | // | ||
| 411 | // First, try using two 8-byte load/stores, similar to the doLit technique | ||
| 412 | // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is | ||
| 413 | // still OK if offset >= 8. Note that this has to be two 8-byte load/stores | ||
| 414 | // and not one 16-byte load/store, and the first store has to be before the | ||
| 415 | // second load, due to the overlap if offset is in the range [8, 16). | ||
| 416 | // | ||
| 417 | // if length > 16 || offset < 8 || len(dst)-d < 16 { | ||
| 418 | // goto slowForwardCopy | ||
| 419 | // } | ||
| 420 | // copy 16 bytes | ||
| 421 | // d += length | ||
| 422 | CMP $16, R_LEN | ||
| 423 | BGT slowForwardCopy | ||
| 424 | CMP $8, R_OFF | ||
| 425 | BLT slowForwardCopy | ||
| 426 | CMP $16, R_TMP2 | ||
| 427 | BLT slowForwardCopy | ||
| 428 | MOVD 0(R_TMP3), R_TMP0 | ||
| 429 | MOVD R_TMP0, 0(R_DST) | ||
| 430 | MOVD 8(R_TMP3), R_TMP1 | ||
| 431 | MOVD R_TMP1, 8(R_DST) | ||
| 432 | ADD R_LEN, R_DST, R_DST | ||
| 433 | B loop | ||
| 434 | |||
| 435 | slowForwardCopy: | ||
| 436 | // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we | ||
| 437 | // can still try 8-byte load stores, provided we can overrun up to 10 extra | ||
| 438 | // bytes. As above, the overrun will be fixed up by subsequent iterations | ||
| 439 | // of the outermost loop. | ||
| 440 | // | ||
| 441 | // The C++ snappy code calls this technique IncrementalCopyFastPath. Its | ||
| 442 | // commentary says: | ||
| 443 | // | ||
| 444 | // ---- | ||
| 445 | // | ||
| 446 | // The main part of this loop is a simple copy of eight bytes at a time | ||
| 447 | // until we've copied (at least) the requested amount of bytes. However, | ||
| 448 | // if d and d-offset are less than eight bytes apart (indicating a | ||
| 449 | // repeating pattern of length < 8), we first need to expand the pattern in | ||
| 450 | // order to get the correct results. For instance, if the buffer looks like | ||
| 451 | // this, with the eight-byte <d-offset> and <d> patterns marked as | ||
| 452 | // intervals: | ||
| 453 | // | ||
| 454 | // abxxxxxxxxxxxx | ||
| 455 | // [------] d-offset | ||
| 456 | // [------] d | ||
| 457 | // | ||
| 458 | // a single eight-byte copy from <d-offset> to <d> will repeat the pattern | ||
| 459 | // once, after which we can move <d> two bytes without moving <d-offset>: | ||
| 460 | // | ||
| 461 | // ababxxxxxxxxxx | ||
| 462 | // [------] d-offset | ||
| 463 | // [------] d | ||
| 464 | // | ||
| 465 | // and repeat the exercise until the two no longer overlap. | ||
| 466 | // | ||
| 467 | // This allows us to do very well in the special case of one single byte | ||
| 468 | // repeated many times, without taking a big hit for more general cases. | ||
| 469 | // | ||
| 470 | // The worst case of extra writing past the end of the match occurs when | ||
| 471 | // offset == 1 and length == 1; the last copy will read from byte positions | ||
| 472 | // [0..7] and write to [4..11], whereas it was only supposed to write to | ||
| 473 | // position 1. Thus, ten excess bytes. | ||
| 474 | // | ||
| 475 | // ---- | ||
| 476 | // | ||
| 477 | // That "10 byte overrun" worst case is confirmed by Go's | ||
| 478 | // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy | ||
| 479 | // and finishSlowForwardCopy algorithm. | ||
| 480 | // | ||
| 481 | // if length > len(dst)-d-10 { | ||
| 482 | // goto verySlowForwardCopy | ||
| 483 | // } | ||
| 484 | SUB $10, R_TMP2, R_TMP2 | ||
| 485 | CMP R_TMP2, R_LEN | ||
| 486 | BGT verySlowForwardCopy | ||
| 487 | |||
| 488 | // We want to keep the offset, so we use R_TMP2 from here. | ||
| 489 | MOVD R_OFF, R_TMP2 | ||
| 490 | |||
| 491 | makeOffsetAtLeast8: | ||
| 492 | // !!! As above, expand the pattern so that offset >= 8 and we can use | ||
| 493 | // 8-byte load/stores. | ||
| 494 | // | ||
| 495 | // for offset < 8 { | ||
| 496 | // copy 8 bytes from dst[d-offset:] to dst[d:] | ||
| 497 | // length -= offset | ||
| 498 | // d += offset | ||
| 499 | // offset += offset | ||
| 500 | // // The two previous lines together means that d-offset, and therefore | ||
| 501 | // // R_TMP3, is unchanged. | ||
| 502 | // } | ||
| 503 | CMP $8, R_TMP2 | ||
| 504 | BGE fixUpSlowForwardCopy | ||
| 505 | MOVD (R_TMP3), R_TMP1 | ||
| 506 | MOVD R_TMP1, (R_DST) | ||
| 507 | SUB R_TMP2, R_LEN, R_LEN | ||
| 508 | ADD R_TMP2, R_DST, R_DST | ||
| 509 | ADD R_TMP2, R_TMP2, R_TMP2 | ||
| 510 | B makeOffsetAtLeast8 | ||
| 511 | |||
| 512 | fixUpSlowForwardCopy: | ||
| 513 | // !!! Add length (which might be negative now) to d (implied by R_DST being | ||
| 514 | // &dst[d]) so that d ends up at the right place when we jump back to the | ||
| 515 | // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if | ||
| 516 | // length is positive, copying the remaining length bytes will write to the | ||
| 517 | // right place. | ||
| 518 | MOVD R_DST, R_TMP0 | ||
| 519 | ADD R_LEN, R_DST, R_DST | ||
| 520 | |||
| 521 | finishSlowForwardCopy: | ||
| 522 | // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative | ||
| 523 | // length means that we overrun, but as above, that will be fixed up by | ||
| 524 | // subsequent iterations of the outermost loop. | ||
| 525 | MOVD $0, R1 | ||
| 526 | CMP R1, R_LEN | ||
| 527 | BLE loop | ||
| 528 | MOVD (R_TMP3), R_TMP1 | ||
| 529 | MOVD R_TMP1, (R_TMP0) | ||
| 530 | ADD $8, R_TMP3, R_TMP3 | ||
| 531 | ADD $8, R_TMP0, R_TMP0 | ||
| 532 | SUB $8, R_LEN, R_LEN | ||
| 533 | B finishSlowForwardCopy | ||
| 534 | |||
| 535 | verySlowForwardCopy: | ||
| 536 | // verySlowForwardCopy is a simple implementation of forward copy. In C | ||
| 537 | // parlance, this is a do/while loop instead of a while loop, since we know | ||
| 538 | // that length > 0. In Go syntax: | ||
| 539 | // | ||
| 540 | // for { | ||
| 541 | // dst[d] = dst[d - offset] | ||
| 542 | // d++ | ||
| 543 | // length-- | ||
| 544 | // if length == 0 { | ||
| 545 | // break | ||
| 546 | // } | ||
| 547 | // } | ||
| 548 | MOVB (R_TMP3), R_TMP1 | ||
| 549 | MOVB R_TMP1, (R_DST) | ||
| 550 | ADD $1, R_TMP3, R_TMP3 | ||
| 551 | ADD $1, R_DST, R_DST | ||
| 552 | SUB $1, R_LEN, R_LEN | ||
| 553 | CBNZ R_LEN, verySlowForwardCopy | ||
| 554 | B loop | ||
| 555 | |||
| 556 | // The code above handles copy tags. | ||
| 557 | // ---------------------------------------- | ||
| 558 | |||
| 559 | end: | ||
| 560 | // This is the end of the "for s < len(src)". | ||
| 561 | // | ||
| 562 | // if d != len(dst) { etc } | ||
| 563 | CMP R_DEND, R_DST | ||
| 564 | BNE errCorrupt | ||
| 565 | |||
| 566 | // return 0 | ||
| 567 | MOVD $0, ret+48(FP) | ||
| 568 | RET | ||
| 569 | |||
| 570 | errCorrupt: | ||
| 571 | // return decodeErrCodeCorrupt | ||
| 572 | MOVD $1, R_TMP0 | ||
| 573 | MOVD R_TMP0, ret+48(FP) | ||
| 574 | RET | ||