From 8db41da676ac8368ef7c2549d56239a5ff5eedde Mon Sep 17 00:00:00 2001
From: Rutger Broekhoff
Date: Tue, 2 Jan 2024 18:56:31 +0100
Subject: Delete vendor directory

---
 .../klauspost/compress/s2/decode_amd64.s | 568 ---------------------
 1 file changed, 568 deletions(-)
 delete mode 100644 vendor/github.com/klauspost/compress/s2/decode_amd64.s

(limited to 'vendor/github.com/klauspost/compress/s2/decode_amd64.s')

diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
deleted file mode 100644
index 9b105e0..0000000
--- a/vendor/github.com/klauspost/compress/s2/decode_amd64.s
+++ /dev/null
@@ -1,568 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Copyright (c) 2019 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-
-#define R_TMP0 AX
-#define R_TMP1 BX
-#define R_LEN CX
-#define R_OFF DX
-#define R_SRC SI
-#define R_DST DI
-#define R_DBASE R8
-#define R_DLEN R9
-#define R_DEND R10
-#define R_SBASE R11
-#define R_SLEN R12
-#define R_SEND R13
-#define R_TMP2 R14
-#define R_TMP3 R15
-
-// The asm code generally follows the pure Go code in decode_other.go, except
-// where marked with a "!!!".
-
-// func decode(dst, src []byte) int
-//
-// All local variables fit into registers. The non-zero stack size is only to
-// spill registers and push args when issuing a CALL. The register allocation:
-//	- R_TMP0	scratch
-//	- R_TMP1	scratch
-//	- R_LEN	length or x (shared)
-//	- R_OFF	offset
-//	- R_SRC	&src[s]
-//	- R_DST	&dst[d]
-//	+ R_DBASE	dst_base
-//	+ R_DLEN	dst_len
-//	+ R_DEND	dst_base + dst_len
-//	+ R_SBASE	src_base
-//	+ R_SLEN	src_len
-//	+ R_SEND	src_base + src_len
-//	- R_TMP2	used by doCopy
-//	- R_TMP3	used by doCopy
-//
-// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
-// function, and after a CALL returns, and are not otherwise modified.
-//
-// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
-// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
-TEXT ·s2Decode(SB), NOSPLIT, $48-56
-	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
-	MOVQ dst_base+0(FP), R_DBASE
-	MOVQ dst_len+8(FP), R_DLEN
-	MOVQ R_DBASE, R_DST
-	MOVQ R_DBASE, R_DEND
-	ADDQ R_DLEN, R_DEND
-	MOVQ src_base+24(FP), R_SBASE
-	MOVQ src_len+32(FP), R_SLEN
-	MOVQ R_SBASE, R_SRC
-	MOVQ R_SBASE, R_SEND
-	ADDQ R_SLEN, R_SEND
-	XORQ R_OFF, R_OFF
-
-loop:
-	// for s < len(src)
-	CMPQ R_SRC, R_SEND
-	JEQ end
-
-	// R_LEN = uint32(src[s])
-	//
-	// switch src[s] & 0x03
-	MOVBLZX (R_SRC), R_LEN
-	MOVL R_LEN, R_TMP1
-	ANDL $3, R_TMP1
-	CMPL R_TMP1, $1
-	JAE tagCopy
-
-	// ----------------------------------------
-	// The code below handles literal tags.
-
-	// case tagLiteral:
-	// x := uint32(src[s] >> 2)
-	// switch
-	SHRL $2, R_LEN
-	CMPL R_LEN, $60
-	JAE tagLit60Plus
-
-	// case x < 60:
-	// s++
-	INCQ R_SRC
-
-doLit:
-	// This is the end of the inner "switch", when we have a literal tag.
-	//
-	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
-	// used in the pure Go decode_other.go code.
-
-	// length = int(x) + 1
-	//
-	// Unlike the pure Go code, we don't need to check if length <= 0 because
-	// R_LEN can hold 64 bits, so the increment cannot overflow.
-	INCQ R_LEN
-
-	// Prepare to check if copying length bytes will run past the end of dst or
-	// src.
-	//
-	// R_TMP0 = len(dst) - d
-	// R_TMP1 = len(src) - s
-	MOVQ R_DEND, R_TMP0
-	SUBQ R_DST, R_TMP0
-	MOVQ R_SEND, R_TMP1
-	SUBQ R_SRC, R_TMP1
-
-	// !!! Try a faster technique for short (16 or fewer bytes) copies.
-	//
-	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
-	//   goto callMemmove // Fall back on calling runtime·memmove.
-	// }
-	//
-	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
-	// against 21 instead of 16, because it cannot assume that all of its input
-	// is contiguous in memory and so it needs to leave enough source bytes to
-	// read the next tag without refilling buffers, but Go's Decode assumes
-	// contiguousness (the src argument is a []byte).
-	CMPQ R_LEN, $16
-	JGT callMemmove
-	CMPQ R_TMP0, $16
-	JLT callMemmove
-	CMPQ R_TMP1, $16
-	JLT callMemmove
-
-	// !!! Implement the copy from src to dst as a 16-byte load and store.
-	// (Decode's documentation says that dst and src must not overlap.)
-	//
-	// This always copies 16 bytes, instead of only length bytes, but that's
-	// OK. If the input is a valid Snappy encoding then subsequent iterations
-	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
-	// non-nil error), so the overrun will be ignored.
-	//
-	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
-	// 16-byte loads and stores. This technique probably wouldn't be as
-	// effective on architectures that are fussier about alignment.
-	MOVOU 0(R_SRC), X0
-	MOVOU X0, 0(R_DST)
-
-	// d += length
-	// s += length
-	ADDQ R_LEN, R_DST
-	ADDQ R_LEN, R_SRC
-	JMP loop
-
-callMemmove:
-	// if length > len(dst)-d || length > len(src)-s { etc }
-	CMPQ R_LEN, R_TMP0
-	JGT errCorrupt
-	CMPQ R_LEN, R_TMP1
-	JGT errCorrupt
-
-	// copy(dst[d:], src[s:s+length])
-	//
-	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
-	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
-	// three registers to the stack, to save local variables across the CALL.
-	MOVQ R_DST, 0(SP)
-	MOVQ R_SRC, 8(SP)
-	MOVQ R_LEN, 16(SP)
-	MOVQ R_DST, 24(SP)
-	MOVQ R_SRC, 32(SP)
-	MOVQ R_LEN, 40(SP)
-	MOVQ R_OFF, 48(SP)
-	CALL runtime·memmove(SB)
-
-	// Restore local variables: unspill registers from the stack and
-	// re-calculate R_DBASE-R_SEND.
-	MOVQ 24(SP), R_DST
-	MOVQ 32(SP), R_SRC
-	MOVQ 40(SP), R_LEN
-	MOVQ 48(SP), R_OFF
-	MOVQ dst_base+0(FP), R_DBASE
-	MOVQ dst_len+8(FP), R_DLEN
-	MOVQ R_DBASE, R_DEND
-	ADDQ R_DLEN, R_DEND
-	MOVQ src_base+24(FP), R_SBASE
-	MOVQ src_len+32(FP), R_SLEN
-	MOVQ R_SBASE, R_SEND
-	ADDQ R_SLEN, R_SEND
-
-	// d += length
-	// s += length
-	ADDQ R_LEN, R_DST
-	ADDQ R_LEN, R_SRC
-	JMP loop
-
-tagLit60Plus:
-	// !!! This fragment does the
-	//
-	// s += x - 58; if uint(s) > uint(len(src)) { etc }
-	//
-	// checks. In the asm version, we code it once instead of once per switch case.
-	ADDQ R_LEN, R_SRC
-	SUBQ $58, R_SRC
-	CMPQ R_SRC, R_SEND
-	JA errCorrupt
-
-	// case x == 60:
-	CMPL R_LEN, $61
-	JEQ tagLit61
-	JA tagLit62Plus
-
-	// x = uint32(src[s-1])
-	MOVBLZX -1(R_SRC), R_LEN
-	JMP doLit
-
-tagLit61:
-	// case x == 61:
-	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
-	MOVWLZX -2(R_SRC), R_LEN
-	JMP doLit
-
-tagLit62Plus:
-	CMPL R_LEN, $62
-	JA tagLit63
-
-	// case x == 62:
-	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-	// We read one byte, safe to read one back, since we are just reading tag.
-	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
-	MOVL -4(R_SRC), R_LEN
-	SHRL $8, R_LEN
-	JMP doLit
-
-tagLit63:
-	// case x == 63:
-	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-	MOVL -4(R_SRC), R_LEN
-	JMP doLit
-
-// The code above handles literal tags.
-// ----------------------------------------
-// The code below handles copy tags.
-
-tagCopy4:
-	// case tagCopy4:
-	// s += 5
-	ADDQ $5, R_SRC
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ R_SRC, R_SEND
-	JA errCorrupt
-
-	// length = 1 + int(src[s-5])>>2
-	SHRQ $2, R_LEN
-	INCQ R_LEN
-
-	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
-	MOVLQZX -4(R_SRC), R_OFF
-	JMP doCopy
-
-tagCopy2:
-	// case tagCopy2:
-	// s += 3
-	ADDQ $3, R_SRC
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ R_SRC, R_SEND
-	JA errCorrupt
-
-	// length = 1 + int(src[s-3])>>2
-	SHRQ $2, R_LEN
-	INCQ R_LEN
-
-	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
-	MOVWQZX -2(R_SRC), R_OFF
-	JMP doCopy
-
-tagCopy:
-	// We have a copy tag. We assume that:
-	//	- R_TMP1 == src[s] & 0x03
-	//	- R_LEN == src[s]
-	CMPQ R_TMP1, $2
-	JEQ tagCopy2
-	JA tagCopy4
-
-	// case tagCopy1:
-	// s += 2
-	ADDQ $2, R_SRC
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ R_SRC, R_SEND
-	JA errCorrupt
-
-	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
-	// length = 4 + int(src[s-2])>>2&0x7
-	MOVBQZX -1(R_SRC), R_TMP1
-	MOVQ R_LEN, R_TMP0
-	SHRQ $2, R_LEN
-	ANDQ $0xe0, R_TMP0
-	ANDQ $7, R_LEN
-	SHLQ $3, R_TMP0
-	ADDQ $4, R_LEN
-	ORQ R_TMP1, R_TMP0
-
-	// check if repeat code, ZF set by ORQ.
-	JZ repeatCode
-
-	// This is a regular copy, transfer our temporary value to R_OFF (length)
-	MOVQ R_TMP0, R_OFF
-	JMP doCopy
-
-// This is a repeat code.
-repeatCode:
-	// If length < 9, reuse last offset, with the length already calculated.
-	CMPQ R_LEN, $9
-	JL doCopyRepeat
-
-	// Read additional bytes for length.
-	JE repeatLen1
-
-	// Rare, so the extra branch shouldn't hurt too much.
-	CMPQ R_LEN, $10
-	JE repeatLen2
-	JMP repeatLen3
-
-// Read repeat lengths.
-repeatLen1:
-	// s ++
-	ADDQ $1, R_SRC
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ R_SRC, R_SEND
-	JA errCorrupt
-
-	// length = src[s-1] + 8
-	MOVBQZX -1(R_SRC), R_LEN
-	ADDL $8, R_LEN
-	JMP doCopyRepeat
-
-repeatLen2:
-	// s +=2
-	ADDQ $2, R_SRC
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ R_SRC, R_SEND
-	JA errCorrupt
-
-	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
-	MOVWQZX -2(R_SRC), R_LEN
-	ADDL $260, R_LEN
-	JMP doCopyRepeat
-
-repeatLen3:
-	// s +=3
-	ADDQ $3, R_SRC
-
-	// if uint(s) > uint(len(src)) { etc }
-	CMPQ R_SRC, R_SEND
-	JA errCorrupt
-
-	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
-	// Read one byte further back (just part of the tag, shifted out)
-	MOVL -4(R_SRC), R_LEN
-	SHRL $8, R_LEN
-	ADDL $65540, R_LEN
-	JMP doCopyRepeat
-
-doCopy:
-	// This is the end of the outer "switch", when we have a copy tag.
-	//
-	// We assume that:
-	//	- R_LEN == length && R_LEN > 0
-	//	- R_OFF == offset
-
-	// if d < offset { etc }
-	MOVQ R_DST, R_TMP1
-	SUBQ R_DBASE, R_TMP1
-	CMPQ R_TMP1, R_OFF
-	JLT errCorrupt
-
-	// Repeat values can skip the test above, since any offset > 0 will be in dst.
-doCopyRepeat:
-	// if offset <= 0 { etc }
-	CMPQ R_OFF, $0
-	JLE errCorrupt
-
-	// if length > len(dst)-d { etc }
-	MOVQ R_DEND, R_TMP1
-	SUBQ R_DST, R_TMP1
-	CMPQ R_LEN, R_TMP1
-	JGT errCorrupt
-
-	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
-	//
-	// Set:
-	//	- R_TMP2 = len(dst)-d
-	//	- R_TMP3 = &dst[d-offset]
-	MOVQ R_DEND, R_TMP2
-	SUBQ R_DST, R_TMP2
-	MOVQ R_DST, R_TMP3
-	SUBQ R_OFF, R_TMP3
-
-	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
-	//
-	// First, try using two 8-byte load/stores, similar to the doLit technique
-	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
-	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
-	// and not one 16-byte load/store, and the first store has to be before the
-	// second load, due to the overlap if offset is in the range [8, 16).
-	//
-	// if length > 16 || offset < 8 || len(dst)-d < 16 {
-	//   goto slowForwardCopy
-	// }
-	// copy 16 bytes
-	// d += length
-	CMPQ R_LEN, $16
-	JGT slowForwardCopy
-	CMPQ R_OFF, $8
-	JLT slowForwardCopy
-	CMPQ R_TMP2, $16
-	JLT slowForwardCopy
-	MOVQ 0(R_TMP3), R_TMP0
-	MOVQ R_TMP0, 0(R_DST)
-	MOVQ 8(R_TMP3), R_TMP1
-	MOVQ R_TMP1, 8(R_DST)
-	ADDQ R_LEN, R_DST
-	JMP loop
-
-slowForwardCopy:
-	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
-	// can still try 8-byte load stores, provided we can overrun up to 10 extra
-	// bytes. As above, the overrun will be fixed up by subsequent iterations
-	// of the outermost loop.
-	//
-	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
-	// commentary says:
-	//
-	// ----
-	//
-	// The main part of this loop is a simple copy of eight bytes at a time
-	// until we've copied (at least) the requested amount of bytes. However,
-	// if d and d-offset are less than eight bytes apart (indicating a
-	// repeating pattern of length < 8), we first need to expand the pattern in
-	// order to get the correct results. For instance, if the buffer looks like
-	// this, with the eight-byte <d-offset> and <d> patterns marked as
-	// intervals:
-	//
-	//    abxxxxxxxxxxxx
-	//    [------]           d-offset
-	//      [------]         d
-	//
-	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
-	// once, after which we can move <d> two bytes without moving <d-offset>:
-	//
-	//    ababxxxxxxxxxx
-	//    [------]           d-offset
-	//        [------]       d
-	//
-	// and repeat the exercise until the two no longer overlap.
-	//
-	// This allows us to do very well in the special case of one single byte
-	// repeated many times, without taking a big hit for more general cases.
-	//
-	// The worst case of extra writing past the end of the match occurs when
-	// offset == 1 and length == 1; the last copy will read from byte positions
-	// [0..7] and write to [4..11], whereas it was only supposed to write to
-	// position 1. Thus, ten excess bytes.
-	//
-	// ----
-	//
-	// That "10 byte overrun" worst case is confirmed by Go's
-	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
-	// and finishSlowForwardCopy algorithm.
-	//
-	// if length > len(dst)-d-10 {
-	//   goto verySlowForwardCopy
-	// }
-	SUBQ $10, R_TMP2
-	CMPQ R_LEN, R_TMP2
-	JGT verySlowForwardCopy
-
-	// We want to keep the offset, so we use R_TMP2 from here.
-	MOVQ R_OFF, R_TMP2
-
-makeOffsetAtLeast8:
-	// !!! As above, expand the pattern so that offset >= 8 and we can use
-	// 8-byte load/stores.
-	//
-	// for offset < 8 {
-	//   copy 8 bytes from dst[d-offset:] to dst[d:]
-	//   length -= offset
-	//   d += offset
-	//   offset += offset
-	//   // The two previous lines together means that d-offset, and therefore
-	//   // R_TMP3, is unchanged.
-	// }
-	CMPQ R_TMP2, $8
-	JGE fixUpSlowForwardCopy
-	MOVQ (R_TMP3), R_TMP1
-	MOVQ R_TMP1, (R_DST)
-	SUBQ R_TMP2, R_LEN
-	ADDQ R_TMP2, R_DST
-	ADDQ R_TMP2, R_TMP2
-	JMP makeOffsetAtLeast8
-
-fixUpSlowForwardCopy:
-	// !!! Add length (which might be negative now) to d (implied by R_DST being
-	// &dst[d]) so that d ends up at the right place when we jump back to the
-	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
-	// length is positive, copying the remaining length bytes will write to the
-	// right place.
-	MOVQ R_DST, R_TMP0
-	ADDQ R_LEN, R_DST
-
-finishSlowForwardCopy:
-	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
-	// length means that we overrun, but as above, that will be fixed up by
-	// subsequent iterations of the outermost loop.
-	CMPQ R_LEN, $0
-	JLE loop
-	MOVQ (R_TMP3), R_TMP1
-	MOVQ R_TMP1, (R_TMP0)
-	ADDQ $8, R_TMP3
-	ADDQ $8, R_TMP0
-	SUBQ $8, R_LEN
-	JMP finishSlowForwardCopy
-
-verySlowForwardCopy:
-	// verySlowForwardCopy is a simple implementation of forward copy. In C
-	// parlance, this is a do/while loop instead of a while loop, since we know
-	// that length > 0. In Go syntax:
-	//
-	// for {
-	//   dst[d] = dst[d - offset]
-	//   d++
-	//   length--
-	//   if length == 0 {
-	//     break
-	//   }
-	// }
-	MOVB (R_TMP3), R_TMP1
-	MOVB R_TMP1, (R_DST)
-	INCQ R_TMP3
-	INCQ R_DST
-	DECQ R_LEN
-	JNZ verySlowForwardCopy
-	JMP loop
-
-// The code above handles copy tags.
-// ----------------------------------------
-
-end:
-	// This is the end of the "for s < len(src)".
-	//
-	// if d != len(dst) { etc }
-	CMPQ R_DST, R_DEND
-	JNE errCorrupt
-
-	// return 0
-	MOVQ $0, ret+48(FP)
-	RET
-
-errCorrupt:
-	// return decodeErrCodeCorrupt
-	MOVQ $1, ret+48(FP)
-	RET
--
cgit v1.2.3
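
The literal-tag handling in the deleted file (the doLit and tagLit60Plus labels) follows the switch its comments spell out: when the two low bits of the tag byte are 00, the upper six bits either hold the length directly (x < 60) or say how many extra little-endian length bytes follow (x == 60..63), and the literal length is x+1. A minimal pure-Go sketch of that computation follows. The helper name decodeLiteralTag, its signature, and the example program are illustrative assumptions, not code from the package (the real pure-Go counterpart is decode_other.go, which the file's own header comment references), and the bounds checks the assembly performs are omitted here.

package main

import (
	"encoding/binary"
	"fmt"
)

// decodeLiteralTag sketches what the doLit / tagLit60Plus paths compute for a
// literal tag at src[s]: the literal length and the index of the first
// literal byte. Hypothetical helper; bounds checks and error handling omitted.
func decodeLiteralTag(src []byte, s int) (length, lit int) {
	x := uint32(src[s] >> 2) // the low two bits were 0b00 (tagLiteral)
	switch {
	case x < 60: // length is encoded in the tag byte itself
		s++
	case x == 60: // one extra length byte
		s += 2
		x = uint32(src[s-1])
	case x == 61: // two extra length bytes
		s += 3
		x = uint32(binary.LittleEndian.Uint16(src[s-2:]))
	case x == 62: // three extra length bytes (read four, shift out the tag)
		s += 4
		x = binary.LittleEndian.Uint32(src[s-4:]) >> 8
	default: // x == 63: four extra length bytes
		s += 5
		x = binary.LittleEndian.Uint32(src[s-4:])
	}
	return int(x) + 1, s
}

func main() {
	// 0x10>>2 == 4, so this tag announces a 5-byte literal: "hello".
	src := []byte{0x10, 'h', 'e', 'l', 'l', 'o'}
	n, s := decodeLiteralTag(src, 0)
	fmt.Println(n, string(src[s:s+n])) // prints: 5 hello
}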
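The copy tags (tagCopy, tagCopy2 and tagCopy4) pack the match length and backwards offset as the comments describe: a 2-byte tag carries a 3-bit length (4..11) and an 11-bit offset, a 3-byte tag a 6-bit length and a 16-bit offset, and a 5-byte tag a 6-bit length and a 32-bit offset. A sketch under the same assumptions as above (hypothetical helper, no bounds checks):

package main

import (
	"encoding/binary"
	"fmt"
)

// decodeCopyTag sketches what the tagCopy / tagCopy2 / tagCopy4 paths compute:
// the match length, the backwards offset, and the index just past the tag.
// In S2, an offset of 0 from a 2-byte tag is the "repeat" marker that the
// repeatCode label handles; this sketch simply returns it unchanged.
func decodeCopyTag(src []byte, s int) (length, offset, next int) {
	tag := src[s]
	switch tag & 0x03 {
	case 1: // tagCopy1: 2-byte tag
		length = 4 + int(tag>>2)&0x7
		offset = int(uint32(tag)&0xe0<<3 | uint32(src[s+1]))
		next = s + 2
	case 2: // tagCopy2: 3-byte tag, 16-bit little-endian offset
		length = 1 + int(tag>>2)
		offset = int(binary.LittleEndian.Uint16(src[s+1:]))
		next = s + 3
	default: // tagCopy4: 5-byte tag, 32-bit little-endian offset
		length = 1 + int(tag>>2)
		offset = int(binary.LittleEndian.Uint32(src[s+1:]))
		next = s + 5
	}
	return length, offset, next
}

func main() {
	// 0x09 = 0b000_010_01: tagCopy1 with length 4+2 = 6 and offset 5.
	length, offset, _ := decodeCopyTag([]byte{0x09, 0x05}, 0)
	fmt.Println(length, offset) // prints: 6 5
}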
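S2 extends Snappy with repeat codes: when a 2-byte copy tag encodes offset 0, the previous offset is reused and the 3-bit length field selects how the real length is stored, as the repeatCode and repeatLen1..repeatLen3 labels show (the bias constants 8, 260 and 65540 below are taken straight from that code). Again a hypothetical sketch with the same caveats; decodeRepeatLength is not a real function in the package:

package main

import (
	"encoding/binary"
	"fmt"
)

// decodeRepeatLength sketches the repeatCode / repeatLen1..3 logic. base is
// the value 4 + (tag>>2)&7 from a 2-byte copy tag whose offset bits were all
// zero, and s points just past that tag. Bounds checks omitted.
func decodeRepeatLength(src []byte, s, base int) (length, next int) {
	switch {
	case base < 9: // length stored directly in the tag
		return base, s
	case base == 9: // one extra byte
		return int(src[s]) + 8, s + 1
	case base == 10: // two extra little-endian bytes
		return int(binary.LittleEndian.Uint16(src[s:])) + 260, s + 2
	default: // base == 11: three extra little-endian bytes
		b := int(src[s]) | int(src[s+1])<<8 | int(src[s+2])<<16
		return b + 65540, s + 3
	}
}

func main() {
	// base 9 means "read one more byte": length = 0x20 + 8 = 40.
	length, _ := decodeRepeatLength([]byte{0x20}, 0, 9)
	fmt.Println(length) // prints: 40
}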
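Finally, the copy itself has to run front to back because the source window dst[d-offset:] may overlap the destination dst[d:d+length]. The assembly's fast paths (two 8-byte stores, the pattern doubling in makeOffsetAtLeast8, the deliberate 10-byte overrun) are all ways of doing that quickly; verySlowForwardCopy's comment gives the plain loop in Go syntax. That loop, wrapped in a small hypothetical program for illustration:

package main

import "fmt"

// forwardCopy is the byte-by-byte, overlap-safe copy that the
// verySlowForwardCopy comment spells out in Go syntax. Copying front to back
// is what makes small offsets repeat a pattern, e.g. offset 1 repeats one byte.
func forwardCopy(dst []byte, d, offset, length int) int {
	for i := 0; i < length; i++ {
		dst[d+i] = dst[d+i-offset]
	}
	return d + length
}

func main() {
	dst := make([]byte, 8)
	dst[0] = 'a'
	d := forwardCopy(dst, 1, 1, 7) // copy 7 bytes with offset 1
	fmt.Println(string(dst[:d]))   // prints: aaaaaaaa
}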