From 404aeae4545d2426c089a5f8d5e82dae56f5212b Mon Sep 17 00:00:00 2001 From: Rutger Broekhoff Date: Fri, 29 Dec 2023 21:31:53 +0100 Subject: Make Nix builds work --- .../klauspost/compress/s2/decode_amd64.s | 568 +++++++++++++++++++++ 1 file changed, 568 insertions(+) create mode 100644 vendor/github.com/klauspost/compress/s2/decode_amd64.s (limited to 'vendor/github.com/klauspost/compress/s2/decode_amd64.s') diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s new file mode 100644 index 0000000..9b105e0 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s @@ -0,0 +1,568 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Copyright (c) 2019 Klaus Post. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !appengine +// +build gc +// +build !noasm + +#include "textflag.h" + +#define R_TMP0 AX +#define R_TMP1 BX +#define R_LEN CX +#define R_OFF DX +#define R_SRC SI +#define R_DST DI +#define R_DBASE R8 +#define R_DLEN R9 +#define R_DEND R10 +#define R_SBASE R11 +#define R_SLEN R12 +#define R_SEND R13 +#define R_TMP2 R14 +#define R_TMP3 R15 + +// The asm code generally follows the pure Go code in decode_other.go, except +// where marked with a "!!!". + +// func decode(dst, src []byte) int +// +// All local variables fit into registers. The non-zero stack size is only to +// spill registers and push args when issuing a CALL. The register allocation: +// - R_TMP0 scratch +// - R_TMP1 scratch +// - R_LEN length or x (shared) +// - R_OFF offset +// - R_SRC &src[s] +// - R_DST &dst[d] +// + R_DBASE dst_base +// + R_DLEN dst_len +// + R_DEND dst_base + dst_len +// + R_SBASE src_base +// + R_SLEN src_len +// + R_SEND src_base + src_len +// - R_TMP2 used by doCopy +// - R_TMP3 used by doCopy +// +// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the +// function, and after a CALL returns, and are not otherwise modified. +// +// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. +// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. +TEXT ·s2Decode(SB), NOSPLIT, $48-56 + // Initialize R_SRC, R_DST and R_DBASE-R_SEND. + MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DST + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SRC + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND + XORQ R_OFF, R_OFF + +loop: + // for s < len(src) + CMPQ R_SRC, R_SEND + JEQ end + + // R_LEN = uint32(src[s]) + // + // switch src[s] & 0x03 + MOVBLZX (R_SRC), R_LEN + MOVL R_LEN, R_TMP1 + ANDL $3, R_TMP1 + CMPL R_TMP1, $1 + JAE tagCopy + + // ---------------------------------------- + // The code below handles literal tags. + + // case tagLiteral: + // x := uint32(src[s] >> 2) + // switch + SHRL $2, R_LEN + CMPL R_LEN, $60 + JAE tagLit60Plus + + // case x < 60: + // s++ + INCQ R_SRC + +doLit: + // This is the end of the inner "switch", when we have a literal tag. + // + // We assume that R_LEN == x and x fits in a uint32, where x is the variable + // used in the pure Go decode_other.go code. + + // length = int(x) + 1 + // + // Unlike the pure Go code, we don't need to check if length <= 0 because + // R_LEN can hold 64 bits, so the increment cannot overflow. + INCQ R_LEN + + // Prepare to check if copying length bytes will run past the end of dst or + // src. + // + // R_TMP0 = len(dst) - d + // R_TMP1 = len(src) - s + MOVQ R_DEND, R_TMP0 + SUBQ R_DST, R_TMP0 + MOVQ R_SEND, R_TMP1 + SUBQ R_SRC, R_TMP1 + + // !!! Try a faster technique for short (16 or fewer bytes) copies. + // + // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { + // goto callMemmove // Fall back on calling runtime·memmove. + // } + // + // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s + // against 21 instead of 16, because it cannot assume that all of its input + // is contiguous in memory and so it needs to leave enough source bytes to + // read the next tag without refilling buffers, but Go's Decode assumes + // contiguousness (the src argument is a []byte). + CMPQ R_LEN, $16 + JGT callMemmove + CMPQ R_TMP0, $16 + JLT callMemmove + CMPQ R_TMP1, $16 + JLT callMemmove + + // !!! Implement the copy from src to dst as a 16-byte load and store. + // (Decode's documentation says that dst and src must not overlap.) + // + // This always copies 16 bytes, instead of only length bytes, but that's + // OK. If the input is a valid Snappy encoding then subsequent iterations + // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a + // non-nil error), so the overrun will be ignored. + // + // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or + // 16-byte loads and stores. This technique probably wouldn't be as + // effective on architectures that are fussier about alignment. + MOVOU 0(R_SRC), X0 + MOVOU X0, 0(R_DST) + + // d += length + // s += length + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC + JMP loop + +callMemmove: + // if length > len(dst)-d || length > len(src)-s { etc } + CMPQ R_LEN, R_TMP0 + JGT errCorrupt + CMPQ R_LEN, R_TMP1 + JGT errCorrupt + + // copy(dst[d:], src[s:s+length]) + // + // This means calling runtime·memmove(&dst[d], &src[s], length), so we push + // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those + // three registers to the stack, to save local variables across the CALL. + MOVQ R_DST, 0(SP) + MOVQ R_SRC, 8(SP) + MOVQ R_LEN, 16(SP) + MOVQ R_DST, 24(SP) + MOVQ R_SRC, 32(SP) + MOVQ R_LEN, 40(SP) + MOVQ R_OFF, 48(SP) + CALL runtime·memmove(SB) + + // Restore local variables: unspill registers from the stack and + // re-calculate R_DBASE-R_SEND. + MOVQ 24(SP), R_DST + MOVQ 32(SP), R_SRC + MOVQ 40(SP), R_LEN + MOVQ 48(SP), R_OFF + MOVQ dst_base+0(FP), R_DBASE + MOVQ dst_len+8(FP), R_DLEN + MOVQ R_DBASE, R_DEND + ADDQ R_DLEN, R_DEND + MOVQ src_base+24(FP), R_SBASE + MOVQ src_len+32(FP), R_SLEN + MOVQ R_SBASE, R_SEND + ADDQ R_SLEN, R_SEND + + // d += length + // s += length + ADDQ R_LEN, R_DST + ADDQ R_LEN, R_SRC + JMP loop + +tagLit60Plus: + // !!! This fragment does the + // + // s += x - 58; if uint(s) > uint(len(src)) { etc } + // + // checks. In the asm version, we code it once instead of once per switch case. + ADDQ R_LEN, R_SRC + SUBQ $58, R_SRC + CMPQ R_SRC, R_SEND + JA errCorrupt + + // case x == 60: + CMPL R_LEN, $61 + JEQ tagLit61 + JA tagLit62Plus + + // x = uint32(src[s-1]) + MOVBLZX -1(R_SRC), R_LEN + JMP doLit + +tagLit61: + // case x == 61: + // x = uint32(src[s-2]) | uint32(src[s-1])<<8 + MOVWLZX -2(R_SRC), R_LEN + JMP doLit + +tagLit62Plus: + CMPL R_LEN, $62 + JA tagLit63 + + // case x == 62: + // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 + // We read one byte, safe to read one back, since we are just reading tag. + // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8 + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + JMP doLit + +tagLit63: + // case x == 63: + // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 + MOVL -4(R_SRC), R_LEN + JMP doLit + +// The code above handles literal tags. +// ---------------------------------------- +// The code below handles copy tags. + +tagCopy4: + // case tagCopy4: + // s += 5 + ADDQ $5, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = 1 + int(src[s-5])>>2 + SHRQ $2, R_LEN + INCQ R_LEN + + // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) + MOVLQZX -4(R_SRC), R_OFF + JMP doCopy + +tagCopy2: + // case tagCopy2: + // s += 3 + ADDQ $3, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = 1 + int(src[s-3])>>2 + SHRQ $2, R_LEN + INCQ R_LEN + + // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) + MOVWQZX -2(R_SRC), R_OFF + JMP doCopy + +tagCopy: + // We have a copy tag. We assume that: + // - R_TMP1 == src[s] & 0x03 + // - R_LEN == src[s] + CMPQ R_TMP1, $2 + JEQ tagCopy2 + JA tagCopy4 + + // case tagCopy1: + // s += 2 + ADDQ $2, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) + // length = 4 + int(src[s-2])>>2&0x7 + MOVBQZX -1(R_SRC), R_TMP1 + MOVQ R_LEN, R_TMP0 + SHRQ $2, R_LEN + ANDQ $0xe0, R_TMP0 + ANDQ $7, R_LEN + SHLQ $3, R_TMP0 + ADDQ $4, R_LEN + ORQ R_TMP1, R_TMP0 + + // check if repeat code, ZF set by ORQ. + JZ repeatCode + + // This is a regular copy, transfer our temporary value to R_OFF (length) + MOVQ R_TMP0, R_OFF + JMP doCopy + +// This is a repeat code. +repeatCode: + // If length < 9, reuse last offset, with the length already calculated. + CMPQ R_LEN, $9 + JL doCopyRepeat + + // Read additional bytes for length. + JE repeatLen1 + + // Rare, so the extra branch shouldn't hurt too much. + CMPQ R_LEN, $10 + JE repeatLen2 + JMP repeatLen3 + +// Read repeat lengths. +repeatLen1: + // s ++ + ADDQ $1, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = src[s-1] + 8 + MOVBQZX -1(R_SRC), R_LEN + ADDL $8, R_LEN + JMP doCopyRepeat + +repeatLen2: + // s +=2 + ADDQ $2, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8) + MOVWQZX -2(R_SRC), R_LEN + ADDL $260, R_LEN + JMP doCopyRepeat + +repeatLen3: + // s +=3 + ADDQ $3, R_SRC + + // if uint(s) > uint(len(src)) { etc } + CMPQ R_SRC, R_SEND + JA errCorrupt + + // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16) + // Read one byte further back (just part of the tag, shifted out) + MOVL -4(R_SRC), R_LEN + SHRL $8, R_LEN + ADDL $65540, R_LEN + JMP doCopyRepeat + +doCopy: + // This is the end of the outer "switch", when we have a copy tag. + // + // We assume that: + // - R_LEN == length && R_LEN > 0 + // - R_OFF == offset + + // if d < offset { etc } + MOVQ R_DST, R_TMP1 + SUBQ R_DBASE, R_TMP1 + CMPQ R_TMP1, R_OFF + JLT errCorrupt + + // Repeat values can skip the test above, since any offset > 0 will be in dst. +doCopyRepeat: + // if offset <= 0 { etc } + CMPQ R_OFF, $0 + JLE errCorrupt + + // if length > len(dst)-d { etc } + MOVQ R_DEND, R_TMP1 + SUBQ R_DST, R_TMP1 + CMPQ R_LEN, R_TMP1 + JGT errCorrupt + + // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length + // + // Set: + // - R_TMP2 = len(dst)-d + // - R_TMP3 = &dst[d-offset] + MOVQ R_DEND, R_TMP2 + SUBQ R_DST, R_TMP2 + MOVQ R_DST, R_TMP3 + SUBQ R_OFF, R_TMP3 + + // !!! Try a faster technique for short (16 or fewer bytes) forward copies. + // + // First, try using two 8-byte load/stores, similar to the doLit technique + // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is + // still OK if offset >= 8. Note that this has to be two 8-byte load/stores + // and not one 16-byte load/store, and the first store has to be before the + // second load, due to the overlap if offset is in the range [8, 16). + // + // if length > 16 || offset < 8 || len(dst)-d < 16 { + // goto slowForwardCopy + // } + // copy 16 bytes + // d += length + CMPQ R_LEN, $16 + JGT slowForwardCopy + CMPQ R_OFF, $8 + JLT slowForwardCopy + CMPQ R_TMP2, $16 + JLT slowForwardCopy + MOVQ 0(R_TMP3), R_TMP0 + MOVQ R_TMP0, 0(R_DST) + MOVQ 8(R_TMP3), R_TMP1 + MOVQ R_TMP1, 8(R_DST) + ADDQ R_LEN, R_DST + JMP loop + +slowForwardCopy: + // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we + // can still try 8-byte load stores, provided we can overrun up to 10 extra + // bytes. As above, the overrun will be fixed up by subsequent iterations + // of the outermost loop. + // + // The C++ snappy code calls this technique IncrementalCopyFastPath. Its + // commentary says: + // + // ---- + // + // The main part of this loop is a simple copy of eight bytes at a time + // until we've copied (at least) the requested amount of bytes. However, + // if d and d-offset are less than eight bytes apart (indicating a + // repeating pattern of length < 8), we first need to expand the pattern in + // order to get the correct results. For instance, if the buffer looks like + // this, with the eight-byte and patterns marked as + // intervals: + // + // abxxxxxxxxxxxx + // [------] d-offset + // [------] d + // + // a single eight-byte copy from to will repeat the pattern + // once, after which we can move two bytes without moving : + // + // ababxxxxxxxxxx + // [------] d-offset + // [------] d + // + // and repeat the exercise until the two no longer overlap. + // + // This allows us to do very well in the special case of one single byte + // repeated many times, without taking a big hit for more general cases. + // + // The worst case of extra writing past the end of the match occurs when + // offset == 1 and length == 1; the last copy will read from byte positions + // [0..7] and write to [4..11], whereas it was only supposed to write to + // position 1. Thus, ten excess bytes. + // + // ---- + // + // That "10 byte overrun" worst case is confirmed by Go's + // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy + // and finishSlowForwardCopy algorithm. + // + // if length > len(dst)-d-10 { + // goto verySlowForwardCopy + // } + SUBQ $10, R_TMP2 + CMPQ R_LEN, R_TMP2 + JGT verySlowForwardCopy + + // We want to keep the offset, so we use R_TMP2 from here. + MOVQ R_OFF, R_TMP2 + +makeOffsetAtLeast8: + // !!! As above, expand the pattern so that offset >= 8 and we can use + // 8-byte load/stores. + // + // for offset < 8 { + // copy 8 bytes from dst[d-offset:] to dst[d:] + // length -= offset + // d += offset + // offset += offset + // // The two previous lines together means that d-offset, and therefore + // // R_TMP3, is unchanged. + // } + CMPQ R_TMP2, $8 + JGE fixUpSlowForwardCopy + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_DST) + SUBQ R_TMP2, R_LEN + ADDQ R_TMP2, R_DST + ADDQ R_TMP2, R_TMP2 + JMP makeOffsetAtLeast8 + +fixUpSlowForwardCopy: + // !!! Add length (which might be negative now) to d (implied by R_DST being + // &dst[d]) so that d ends up at the right place when we jump back to the + // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if + // length is positive, copying the remaining length bytes will write to the + // right place. + MOVQ R_DST, R_TMP0 + ADDQ R_LEN, R_DST + +finishSlowForwardCopy: + // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative + // length means that we overrun, but as above, that will be fixed up by + // subsequent iterations of the outermost loop. + CMPQ R_LEN, $0 + JLE loop + MOVQ (R_TMP3), R_TMP1 + MOVQ R_TMP1, (R_TMP0) + ADDQ $8, R_TMP3 + ADDQ $8, R_TMP0 + SUBQ $8, R_LEN + JMP finishSlowForwardCopy + +verySlowForwardCopy: + // verySlowForwardCopy is a simple implementation of forward copy. In C + // parlance, this is a do/while loop instead of a while loop, since we know + // that length > 0. In Go syntax: + // + // for { + // dst[d] = dst[d - offset] + // d++ + // length-- + // if length == 0 { + // break + // } + // } + MOVB (R_TMP3), R_TMP1 + MOVB R_TMP1, (R_DST) + INCQ R_TMP3 + INCQ R_DST + DECQ R_LEN + JNZ verySlowForwardCopy + JMP loop + +// The code above handles copy tags. +// ---------------------------------------- + +end: + // This is the end of the "for s < len(src)". + // + // if d != len(dst) { etc } + CMPQ R_DST, R_DEND + JNE errCorrupt + + // return 0 + MOVQ $0, ret+48(FP) + RET + +errCorrupt: + // return decodeErrCodeCorrupt + MOVQ $1, ret+48(FP) + RET -- cgit v1.2.3