diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_amd64.s')
| -rw-r--r-- | vendor/github.com/klauspost/compress/s2/decode_amd64.s | 568 |
1 files changed, 568 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s new file mode 100644 index 0000000..9b105e0 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s | |||
| @@ -0,0 +1,568 @@ | |||
// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// Returns 0 on success, or decodeErrCodeCorrupt (1) if the input is
// malformed (see the stores to ret+48(FP) at "end" and "errCorrupt" below).
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- R_TMP0	scratch
//	- R_TMP1	scratch
//	- R_LEN	length or x (shared)
//	- R_OFF	offset
//	- R_SRC	&src[s]
//	- R_DST	&dst[d]
//	+ R_DBASE	dst_base
//	+ R_DLEN	dst_len
//	+ R_DEND	dst_base + dst_len
//	+ R_SBASE	src_base
//	+ R_SLEN	src_len
//	+ R_SEND	src_base + src_len
//	- R_TMP2	used by doCopy
//	- R_TMP3	used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $48-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ  end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL    R_LEN, R_TMP1
	ANDL    $3, R_TMP1
	CMPL    R_TMP1, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT  callMemmove
	CMPQ R_TMP0, $16
	JLT  callMemmove
	CMPQ R_TMP1, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT  errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP     doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	// We read one byte, safe to read one back, since we are just reading tag.
	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP  doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP  doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN == src[s]
	CMPQ R_TMP1, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ    R_LEN, R_TMP0
	SHRQ    $2, R_LEN
	ANDQ    $0xe0, R_TMP0
	ANDQ    $7, R_LEN
	SHLQ    $3, R_TMP0
	ADDQ    $4, R_LEN
	ORQ     R_TMP1, R_TMP0

	// check if repeat code, ZF set by ORQ.
	// An all-zero offset in a tagCopy1 is the S2 repeat marker: reuse the
	// previous offset, which is still sitting untouched in R_OFF.
	JZ repeatCode

	// This is a regular copy, transfer our temporary value to R_OFF (offset)
	MOVQ R_TMP0, R_OFF
	JMP  doCopy

	// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL   doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE   repeatLen2
	JMP  repeatLen3

	// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = src[s-1] + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL    $8, R_LEN
	JMP     doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
	MOVWQZX -2(R_SRC), R_LEN
	ADDL    $260, R_LEN
	JMP     doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
	// Read one byte further back (just part of the tag, shifted out)
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP  doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT  errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT  slowForwardCopy
	CMPQ R_OFF, $8
	JLT  slowForwardCopy
	CMPQ R_TMP2, $16
	JLT  slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	// abxxxxxxxxxxxx
	// [------] d-offset
	//  [------] d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	// ababxxxxxxxxxx
	// [------] d-offset
	//   [------] d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT  verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE  loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ  verySlowForwardCopy
	JMP  loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET