diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_amd64.s')
| -rw-r--r-- | vendor/github.com/klauspost/compress/s2/decode_amd64.s | 568 |
1 files changed, 568 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s new file mode 100644 index 0000000..9b105e0 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s | |||
| @@ -0,0 +1,568 @@ | |||
// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func decode(dst, src []byte) int
//
// Returns 0 on success, or decodeErrCodeCorrupt (1) if the input is
// malformed (see the stores to ret+48(FP) at "end" and "errCorrupt" below).
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- R_TMP0	scratch
//	- R_TMP1	scratch
//	- R_LEN	length or x (shared)
//	- R_OFF	offset
//	- R_SRC	&src[s]
//	- R_DST	&dst[d]
//	+ R_DBASE	dst_base
//	+ R_DLEN	dst_len
//	+ R_DEND	dst_base + dst_len
//	+ R_SBASE	src_base
//	+ R_SLEN	src_len
//	+ R_SEND	src_base + src_len
//	- R_TMP2	used by doCopy
//	- R_TMP3	used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $48-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ  end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL    R_LEN, R_TMP1
	ANDL    $3, R_TMP1
	CMPL    R_TMP1, $1
	JAE     tagCopy

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE  tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//   goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT  callMemmove
	CMPQ R_TMP0, $16
	JLT  callMemmove
	CMPQ R_TMP1, $16
	JLT  callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT  errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
	// three registers to the stack, to save local variables across the CALL.
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP  loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ  tagLit61
	JA   tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP     doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP     doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA   tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	// We read one byte, safe to read one back, since we are just reading tag.
	// x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP  doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP  doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP     doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP     doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN == src[s]
	CMPQ R_TMP1, $2
	JEQ  tagCopy2
	JA   tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ    R_LEN, R_TMP0
	SHRQ    $2, R_LEN
	ANDQ    $0xe0, R_TMP0
	ANDQ    $7, R_LEN
	SHLQ    $3, R_TMP0
	ADDQ    $4, R_LEN
	ORQ     R_TMP1, R_TMP0

	// check if repeat code, ZF set by ORQ.
	// An all-zero offset in a tagCopy1 is the S2 repeat marker: reuse the
	// previous offset, which is still sitting untouched in R_OFF.
	JZ repeatCode

	// This is a regular copy, transfer our temporary value to R_OFF (offset)
	MOVQ R_TMP0, R_OFF
	JMP  doCopy

	// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL   doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE   repeatLen2
	JMP  repeatLen3

	// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = src[s-1] + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL    $8, R_LEN
	JMP     doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
	MOVWQZX -2(R_SRC), R_LEN
	ADDL    $260, R_LEN
	JMP     doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA   errCorrupt

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
	// Read one byte further back (just part of the tag, shifted out)
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP  doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT  errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE  errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT  errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//   goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT  slowForwardCopy
	CMPQ R_OFF, $8
	JLT  slowForwardCopy
	CMPQ R_TMP2, $16
	JLT  slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP  loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	// abxxxxxxxxxxxx
	// [------] d-offset
	//  [------] d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	// ababxxxxxxxxxx
	// [------] d-offset
	//   [------] d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//   goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT  verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//   copy 8 bytes from dst[d-offset:] to dst[d:]
	//   length -= offset
	//   d      += offset
	//   offset += offset
	//   // The two previous lines together means that d-offset, and therefore
	//   // R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE  fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP  makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
	// length is positive, copying the remaining length bytes will write to the
	// right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
	CMPQ R_LEN, $0
	JLE  loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP  finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//   dst[d] = dst[d - offset]
	//   d++
	//   length--
	//   if length == 0 {
	//     break
	//   }
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ  verySlowForwardCopy
	JMP  loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE  errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET