diff options
| author | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
|---|---|---|
| committer | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
| commit | 404aeae4545d2426c089a5f8d5e82dae56f5212b (patch) | |
| tree | 2d84e00af272b39fc04f3795ae06bc48970e57b5 /vendor/github.com/klauspost/compress/s2/decode_arm64.s | |
| parent | 209d8b0187ed025dec9ac149ebcced3462877bff (diff) | |
| download | gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.tar.gz gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.zip | |
Make Nix builds work
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_arm64.s')
| -rw-r--r-- | vendor/github.com/klauspost/compress/s2/decode_arm64.s | 574 |
1 files changed, 574 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s new file mode 100644 index 0000000..4b63d50 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s | |||
| @@ -0,0 +1,574 @@ | |||
| 1 | // Copyright 2020 The Go Authors. All rights reserved. | ||
| 2 | // Use of this source code is governed by a BSD-style | ||
| 3 | // license that can be found in the LICENSE file. | ||
| 4 | |||
| 5 | // +build !appengine | ||
| 6 | // +build gc | ||
| 7 | // +build !noasm | ||
| 8 | |||
| 9 | #include "textflag.h" | ||
| 10 | |||
| 11 | #define R_TMP0 R2 | ||
| 12 | #define R_TMP1 R3 | ||
| 13 | #define R_LEN R4 | ||
| 14 | #define R_OFF R5 | ||
| 15 | #define R_SRC R6 | ||
| 16 | #define R_DST R7 | ||
| 17 | #define R_DBASE R8 | ||
| 18 | #define R_DLEN R9 | ||
| 19 | #define R_DEND R10 | ||
| 20 | #define R_SBASE R11 | ||
| 21 | #define R_SLEN R12 | ||
| 22 | #define R_SEND R13 | ||
| 23 | #define R_TMP2 R14 | ||
| 24 | #define R_TMP3 R15 | ||
| 25 | |||
| 26 | // TEST_SRC checks that R_SRC <= R_SEND, branching to errCorrupt otherwise | ||
| 27 | #define TEST_SRC() \ | ||
| 28 | CMP R_SEND, R_SRC \ | ||
| 29 | BGT errCorrupt | ||
| 30 | |||
| 31 | // MOVD R_SRC, R_TMP1 | ||
| 32 | // SUB R_SBASE, R_TMP1, R_TMP1 | ||
| 33 | // CMP R_SLEN, R_TMP1 | ||
| 34 | // BGT errCorrupt | ||
| 35 | |||
| 36 | // The asm code generally follows the pure Go code in decode_other.go, except | ||
| 37 | // where marked with a "!!!". | ||
| 38 | |||
| 39 | // func decode(dst, src []byte) int | ||
| 40 | // | ||
| 41 | // All local variables fit into registers. The non-zero stack size is only to | ||
| 42 | // spill registers and push args when issuing a CALL. The register allocation: | ||
| 43 | // - R_TMP0 scratch | ||
| 44 | // - R_TMP1 scratch | ||
| 45 | // - R_LEN length or x | ||
| 46 | // - R_OFF offset | ||
| 47 | // - R_SRC &src[s] | ||
| 48 | // - R_DST &dst[d] | ||
| 49 | // + R_DBASE dst_base | ||
| 50 | // + R_DLEN dst_len | ||
| 51 | // + R_DEND dst_base + dst_len | ||
| 52 | // + R_SBASE src_base | ||
| 53 | // + R_SLEN src_len | ||
| 54 | // + R_SEND src_base + src_len | ||
| 55 | // - R_TMP2 used by doCopy | ||
| 56 | // - R_TMP3 used by doCopy | ||
| 57 | // | ||
| 58 | // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the | ||
| 59 | // function, and after a CALL returns, and are not otherwise modified. | ||
| 60 | // | ||
| 61 | // The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. | ||
| 62 | // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. | ||
| 63 | TEXT ·s2Decode(SB), NOSPLIT, $56-64 | ||
| 64 | // Initialize R_SRC, R_DST and R_DBASE-R_SEND. | ||
| 65 | MOVD dst_base+0(FP), R_DBASE | ||
| 66 | MOVD dst_len+8(FP), R_DLEN | ||
| 67 | MOVD R_DBASE, R_DST | ||
| 68 | MOVD R_DBASE, R_DEND | ||
| 69 | ADD R_DLEN, R_DEND, R_DEND | ||
| 70 | MOVD src_base+24(FP), R_SBASE | ||
| 71 | MOVD src_len+32(FP), R_SLEN | ||
| 72 | MOVD R_SBASE, R_SRC | ||
| 73 | MOVD R_SBASE, R_SEND | ||
| 74 | ADD R_SLEN, R_SEND, R_SEND | ||
| 75 | MOVD $0, R_OFF | ||
| 76 | |||
| 77 | loop: | ||
| 78 | // for s < len(src) | ||
| 79 | CMP R_SEND, R_SRC | ||
| 80 | BEQ end | ||
| 81 | |||
| 82 | // R_LEN = uint32(src[s]) | ||
| 83 | // | ||
| 84 | // switch src[s] & 0x03 | ||
| 85 | MOVBU (R_SRC), R_LEN | ||
| 86 | MOVW R_LEN, R_TMP1 | ||
| 87 | ANDW $3, R_TMP1 | ||
| 88 | MOVW $1, R1 | ||
| 89 | CMPW R1, R_TMP1 | ||
| 90 | BGE tagCopy | ||
| 91 | |||
| 92 | // ---------------------------------------- | ||
| 93 | // The code below handles literal tags. | ||
| 94 | |||
| 95 | // case tagLiteral: | ||
| 96 | // x := uint32(src[s] >> 2) | ||
| 97 | // switch | ||
| 98 | MOVW $60, R1 | ||
| 99 | LSRW $2, R_LEN, R_LEN | ||
| 100 | CMPW R_LEN, R1 | ||
| 101 | BLS tagLit60Plus | ||
| 102 | |||
| 103 | // case x < 60: | ||
| 104 | // s++ | ||
| 105 | ADD $1, R_SRC, R_SRC | ||
| 106 | |||
| 107 | doLit: | ||
| 108 | // This is the end of the inner "switch", when we have a literal tag. | ||
| 109 | // | ||
| 110 | // We assume that R_LEN == x and x fits in a uint32, where x is the variable | ||
| 111 | // used in the pure Go decode_other.go code. | ||
| 112 | |||
| 113 | // length = int(x) + 1 | ||
| 114 | // | ||
| 115 | // Unlike the pure Go code, we don't need to check if length <= 0 because | ||
| 116 | // R_LEN can hold 64 bits, so the increment cannot overflow. | ||
| 117 | ADD $1, R_LEN, R_LEN | ||
| 118 | |||
| 119 | // Prepare to check if copying length bytes will run past the end of dst or | ||
| 120 | // src. | ||
| 121 | // | ||
| 122 | // R_TMP0 = len(dst) - d | ||
| 123 | // R_TMP1 = len(src) - s | ||
| 124 | MOVD R_DEND, R_TMP0 | ||
| 125 | SUB R_DST, R_TMP0, R_TMP0 | ||
| 126 | MOVD R_SEND, R_TMP1 | ||
| 127 | SUB R_SRC, R_TMP1, R_TMP1 | ||
| 128 | |||
| 129 | // !!! Try a faster technique for short (16 or fewer bytes) copies. | ||
| 130 | // | ||
| 131 | // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { | ||
| 132 | // goto callMemmove // Fall back on calling runtime·memmove. | ||
| 133 | // } | ||
| 134 | // | ||
| 135 | // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s | ||
| 136 | // against 21 instead of 16, because it cannot assume that all of its input | ||
| 137 | // is contiguous in memory and so it needs to leave enough source bytes to | ||
| 138 | // read the next tag without refilling buffers, but Go's Decode assumes | ||
| 139 | // contiguousness (the src argument is a []byte). | ||
| 140 | CMP $16, R_LEN | ||
| 141 | BGT callMemmove | ||
| 142 | CMP $16, R_TMP0 | ||
| 143 | BLT callMemmove | ||
| 144 | CMP $16, R_TMP1 | ||
| 145 | BLT callMemmove | ||
| 146 | |||
| 147 | // !!! Implement the copy from src to dst as a 16-byte load and store. | ||
| 148 | // (Decode's documentation says that dst and src must not overlap.) | ||
| 149 | // | ||
| 150 | // This always copies 16 bytes, instead of only length bytes, but that's | ||
| 151 | // OK. If the input is a valid Snappy encoding then subsequent iterations | ||
| 152 | // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a | ||
| 153 | // non-nil error), so the overrun will be ignored. | ||
| 154 | // | ||
| 155 | // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or | ||
| 156 | // 16-byte loads and stores. This technique probably wouldn't be as | ||
| 157 | // effective on architectures that are fussier about alignment. | ||
| 158 | LDP 0(R_SRC), (R_TMP2, R_TMP3) | ||
| 159 | STP (R_TMP2, R_TMP3), 0(R_DST) | ||
| 160 | |||
| 161 | // d += length | ||
| 162 | // s += length | ||
| 163 | ADD R_LEN, R_DST, R_DST | ||
| 164 | ADD R_LEN, R_SRC, R_SRC | ||
| 165 | B loop | ||
| 166 | |||
| 167 | callMemmove: | ||
| 168 | // if length > len(dst)-d || length > len(src)-s { etc } | ||
| 169 | CMP R_TMP0, R_LEN | ||
| 170 | BGT errCorrupt | ||
| 171 | CMP R_TMP1, R_LEN | ||
| 172 | BGT errCorrupt | ||
| 173 | |||
| 174 | // copy(dst[d:], src[s:s+length]) | ||
| 175 | // | ||
| 176 | // This means calling runtime·memmove(&dst[d], &src[s], length), so we push | ||
| 177 | // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those | ||
| 178 | // three registers to the stack, to save local variables across the CALL. | ||
| 179 | MOVD R_DST, 8(RSP) | ||
| 180 | MOVD R_SRC, 16(RSP) | ||
| 181 | MOVD R_LEN, 24(RSP) | ||
| 182 | MOVD R_DST, 32(RSP) | ||
| 183 | MOVD R_SRC, 40(RSP) | ||
| 184 | MOVD R_LEN, 48(RSP) | ||
| 185 | MOVD R_OFF, 56(RSP) | ||
| 186 | CALL runtime·memmove(SB) | ||
| 187 | |||
| 188 | // Restore local variables: unspill registers from the stack and | ||
| 189 | // re-calculate R_DBASE-R_SEND. | ||
| 190 | MOVD 32(RSP), R_DST | ||
| 191 | MOVD 40(RSP), R_SRC | ||
| 192 | MOVD 48(RSP), R_LEN | ||
| 193 | MOVD 56(RSP), R_OFF | ||
| 194 | MOVD dst_base+0(FP), R_DBASE | ||
| 195 | MOVD dst_len+8(FP), R_DLEN | ||
| 196 | MOVD R_DBASE, R_DEND | ||
| 197 | ADD R_DLEN, R_DEND, R_DEND | ||
| 198 | MOVD src_base+24(FP), R_SBASE | ||
| 199 | MOVD src_len+32(FP), R_SLEN | ||
| 200 | MOVD R_SBASE, R_SEND | ||
| 201 | ADD R_SLEN, R_SEND, R_SEND | ||
| 202 | |||
| 203 | // d += length | ||
| 204 | // s += length | ||
| 205 | ADD R_LEN, R_DST, R_DST | ||
| 206 | ADD R_LEN, R_SRC, R_SRC | ||
| 207 | B loop | ||
| 208 | |||
| 209 | tagLit60Plus: | ||
| 210 | // !!! This fragment does the | ||
| 211 | // | ||
| 212 | // s += x - 58; if uint(s) > uint(len(src)) { etc } | ||
| 213 | // | ||
| 214 | // checks. In the asm version, we code it once instead of once per switch case. | ||
| 215 | ADD R_LEN, R_SRC, R_SRC | ||
| 216 | SUB $58, R_SRC, R_SRC | ||
| 217 | TEST_SRC() | ||
| 218 | |||
| 219 | // case x == 60: | ||
| 220 | MOVW $61, R1 | ||
| 221 | CMPW R1, R_LEN | ||
| 222 | BEQ tagLit61 | ||
| 223 | BGT tagLit62Plus | ||
| 224 | |||
| 225 | // x = uint32(src[s-1]) | ||
| 226 | MOVBU -1(R_SRC), R_LEN | ||
| 227 | B doLit | ||
| 228 | |||
| 229 | tagLit61: | ||
| 230 | // case x == 61: | ||
| 231 | // x = uint32(src[s-2]) | uint32(src[s-1])<<8 | ||
| 232 | MOVHU -2(R_SRC), R_LEN | ||
| 233 | B doLit | ||
| 234 | |||
| 235 | tagLit62Plus: | ||
| 236 | CMPW $62, R_LEN | ||
| 237 | BHI tagLit63 | ||
| 238 | |||
| 239 | // case x == 62: | ||
| 240 | // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 | ||
| 241 | MOVHU -3(R_SRC), R_LEN | ||
| 242 | MOVBU -1(R_SRC), R_TMP1 | ||
| 243 | ORR R_TMP1<<16, R_LEN | ||
| 244 | B doLit | ||
| 245 | |||
| 246 | tagLit63: | ||
| 247 | // case x == 63: | ||
| 248 | // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 | ||
| 249 | MOVWU -4(R_SRC), R_LEN | ||
| 250 | B doLit | ||
| 251 | |||
| 252 | // The code above handles literal tags. | ||
| 253 | // ---------------------------------------- | ||
| 254 | // The code below handles copy tags. | ||
| 255 | |||
| 256 | tagCopy4: | ||
| 257 | // case tagCopy4: | ||
| 258 | // s += 5 | ||
| 259 | ADD $5, R_SRC, R_SRC | ||
| 260 | |||
| 261 | // if uint(s) > uint(len(src)) { etc } | ||
| 262 | MOVD R_SRC, R_TMP1 | ||
| 263 | SUB R_SBASE, R_TMP1, R_TMP1 | ||
| 264 | CMP R_SLEN, R_TMP1 | ||
| 265 | BGT errCorrupt | ||
| 266 | |||
| 267 | // length = 1 + int(src[s-5])>>2 | ||
| 268 | MOVD $1, R1 | ||
| 269 | ADD R_LEN>>2, R1, R_LEN | ||
| 270 | |||
| 271 | // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) | ||
| 272 | MOVWU -4(R_SRC), R_OFF | ||
| 273 | B doCopy | ||
| 274 | |||
| 275 | tagCopy2: | ||
| 276 | // case tagCopy2: | ||
| 277 | // s += 3 | ||
| 278 | ADD $3, R_SRC, R_SRC | ||
| 279 | |||
| 280 | // if uint(s) > uint(len(src)) { etc } | ||
| 281 | TEST_SRC() | ||
| 282 | |||
| 283 | // length = 1 + int(src[s-3])>>2 | ||
| 284 | MOVD $1, R1 | ||
| 285 | ADD R_LEN>>2, R1, R_LEN | ||
| 286 | |||
| 287 | // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) | ||
| 288 | MOVHU -2(R_SRC), R_OFF | ||
| 289 | B doCopy | ||
| 290 | |||
| 291 | tagCopy: | ||
| 292 | // We have a copy tag. We assume that: | ||
| 293 | // - R_TMP1 == src[s] & 0x03 | ||
| 294 | // - R_LEN == src[s] | ||
| 295 | CMP $2, R_TMP1 | ||
| 296 | BEQ tagCopy2 | ||
| 297 | BGT tagCopy4 | ||
| 298 | |||
| 299 | // case tagCopy1: | ||
| 300 | // s += 2 | ||
| 301 | ADD $2, R_SRC, R_SRC | ||
| 302 | |||
| 303 | // if uint(s) > uint(len(src)) { etc } | ||
| 304 | TEST_SRC() | ||
| 305 | |||
| 306 | // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) | ||
| 307 | // Calculate offset in R_TMP0 in case it is a repeat. | ||
| 308 | MOVD R_LEN, R_TMP0 | ||
| 309 | AND $0xe0, R_TMP0 | ||
| 310 | MOVBU -1(R_SRC), R_TMP1 | ||
| 311 | ORR R_TMP0<<3, R_TMP1, R_TMP0 | ||
| 312 | |||
| 313 | // length = 4 + int(src[s-2])>>2&0x7 | ||
| 314 | MOVD $7, R1 | ||
| 315 | AND R_LEN>>2, R1, R_LEN | ||
| 316 | ADD $4, R_LEN, R_LEN | ||
| 317 | |||
| 318 | // check if repeat code with offset 0. | ||
| 319 | CMP $0, R_TMP0 | ||
| 320 | BEQ repeatCode | ||
| 321 | |||
| 322 | // This is a regular copy, transfer our temporary value to R_OFF (offset) | ||
| 323 | MOVD R_TMP0, R_OFF | ||
| 324 | B doCopy | ||
| 325 | |||
| 326 | // This is a repeat code. | ||
| 327 | repeatCode: | ||
| 328 | // If length < 9, reuse last offset, with the length already calculated. | ||
| 329 | CMP $9, R_LEN | ||
| 330 | BLT doCopyRepeat | ||
| 331 | BEQ repeatLen1 | ||
| 332 | CMP $10, R_LEN | ||
| 333 | BEQ repeatLen2 | ||
| 334 | |||
| 335 | repeatLen3: | ||
| 336 | // s += 3 | ||
| 337 | ADD $3, R_SRC, R_SRC | ||
| 338 | |||
| 339 | // if uint(s) > uint(len(src)) { etc } | ||
| 340 | TEST_SRC() | ||
| 341 | |||
| 342 | // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 | ||
| 343 | MOVBU -1(R_SRC), R_TMP0 | ||
| 344 | MOVHU -3(R_SRC), R_LEN | ||
| 345 | ORR R_TMP0<<16, R_LEN, R_LEN | ||
| 346 | ADD $65540, R_LEN, R_LEN | ||
| 347 | B doCopyRepeat | ||
| 348 | |||
| 349 | repeatLen2: | ||
| 350 | // s += 2 | ||
| 351 | ADD $2, R_SRC, R_SRC | ||
| 352 | |||
| 353 | // if uint(s) > uint(len(src)) { etc } | ||
| 354 | TEST_SRC() | ||
| 355 | |||
| 356 | // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 | ||
| 357 | MOVHU -2(R_SRC), R_LEN | ||
| 358 | ADD $260, R_LEN, R_LEN | ||
| 359 | B doCopyRepeat | ||
| 360 | |||
| 361 | repeatLen1: | ||
| 362 | // s += 1 | ||
| 363 | ADD $1, R_SRC, R_SRC | ||
| 364 | |||
| 365 | // if uint(s) > uint(len(src)) { etc } | ||
| 366 | TEST_SRC() | ||
| 367 | |||
| 368 | // length = src[s-1] + 8 | ||
| 369 | MOVBU -1(R_SRC), R_LEN | ||
| 370 | ADD $8, R_LEN, R_LEN | ||
| 371 | B doCopyRepeat | ||
| 372 | |||
| 373 | doCopy: | ||
| 374 | // This is the end of the outer "switch", when we have a copy tag. | ||
| 375 | // | ||
| 376 | // We assume that: | ||
| 377 | // - R_LEN == length && R_LEN > 0 | ||
| 378 | // - R_OFF == offset | ||
| 379 | |||
| 380 | // if d < offset { etc } | ||
| 381 | MOVD R_DST, R_TMP1 | ||
| 382 | SUB R_DBASE, R_TMP1, R_TMP1 | ||
| 383 | CMP R_OFF, R_TMP1 | ||
| 384 | BLT errCorrupt | ||
| 385 | |||
| 386 | // Repeat values can skip the test above, since any offset > 0 will be in dst. | ||
| 387 | doCopyRepeat: | ||
| 388 | |||
| 389 | // if offset <= 0 { etc } | ||
| 390 | CMP $0, R_OFF | ||
| 391 | BLE errCorrupt | ||
| 392 | |||
| 393 | // if length > len(dst)-d { etc } | ||
| 394 | MOVD R_DEND, R_TMP1 | ||
| 395 | SUB R_DST, R_TMP1, R_TMP1 | ||
| 396 | CMP R_TMP1, R_LEN | ||
| 397 | BGT errCorrupt | ||
| 398 | |||
| 399 | // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length | ||
| 400 | // | ||
| 401 | // Set: | ||
| 402 | // - R_TMP2 = len(dst)-d | ||
| 403 | // - R_TMP3 = &dst[d-offset] | ||
| 404 | MOVD R_DEND, R_TMP2 | ||
| 405 | SUB R_DST, R_TMP2, R_TMP2 | ||
| 406 | MOVD R_DST, R_TMP3 | ||
| 407 | SUB R_OFF, R_TMP3, R_TMP3 | ||
| 408 | |||
| 409 | // !!! Try a faster technique for short (16 or fewer bytes) forward copies. | ||
| 410 | // | ||
| 411 | // First, try using two 8-byte load/stores, similar to the doLit technique | ||
| 412 | // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is | ||
| 413 | // still OK if offset >= 8. Note that this has to be two 8-byte load/stores | ||
| 414 | // and not one 16-byte load/store, and the first store has to be before the | ||
| 415 | // second load, due to the overlap if offset is in the range [8, 16). | ||
| 416 | // | ||
| 417 | // if length > 16 || offset < 8 || len(dst)-d < 16 { | ||
| 418 | // goto slowForwardCopy | ||
| 419 | // } | ||
| 420 | // copy 16 bytes | ||
| 421 | // d += length | ||
| 422 | CMP $16, R_LEN | ||
| 423 | BGT slowForwardCopy | ||
| 424 | CMP $8, R_OFF | ||
| 425 | BLT slowForwardCopy | ||
| 426 | CMP $16, R_TMP2 | ||
| 427 | BLT slowForwardCopy | ||
| 428 | MOVD 0(R_TMP3), R_TMP0 | ||
| 429 | MOVD R_TMP0, 0(R_DST) | ||
| 430 | MOVD 8(R_TMP3), R_TMP1 | ||
| 431 | MOVD R_TMP1, 8(R_DST) | ||
| 432 | ADD R_LEN, R_DST, R_DST | ||
| 433 | B loop | ||
| 434 | |||
| 435 | slowForwardCopy: | ||
| 436 | // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we | ||
| 437 | // can still try 8-byte load stores, provided we can overrun up to 10 extra | ||
| 438 | // bytes. As above, the overrun will be fixed up by subsequent iterations | ||
| 439 | // of the outermost loop. | ||
| 440 | // | ||
| 441 | // The C++ snappy code calls this technique IncrementalCopyFastPath. Its | ||
| 442 | // commentary says: | ||
| 443 | // | ||
| 444 | // ---- | ||
| 445 | // | ||
| 446 | // The main part of this loop is a simple copy of eight bytes at a time | ||
| 447 | // until we've copied (at least) the requested amount of bytes. However, | ||
| 448 | // if d and d-offset are less than eight bytes apart (indicating a | ||
| 449 | // repeating pattern of length < 8), we first need to expand the pattern in | ||
| 450 | // order to get the correct results. For instance, if the buffer looks like | ||
| 451 | // this, with the eight-byte <d-offset> and <d> patterns marked as | ||
| 452 | // intervals: | ||
| 453 | // | ||
| 454 | // abxxxxxxxxxxxx | ||
| 455 | // [------] d-offset | ||
| 456 | // [------] d | ||
| 457 | // | ||
| 458 | // a single eight-byte copy from <d-offset> to <d> will repeat the pattern | ||
| 459 | // once, after which we can move <d> two bytes without moving <d-offset>: | ||
| 460 | // | ||
| 461 | // ababxxxxxxxxxx | ||
| 462 | // [------] d-offset | ||
| 463 | // [------] d | ||
| 464 | // | ||
| 465 | // and repeat the exercise until the two no longer overlap. | ||
| 466 | // | ||
| 467 | // This allows us to do very well in the special case of one single byte | ||
| 468 | // repeated many times, without taking a big hit for more general cases. | ||
| 469 | // | ||
| 470 | // The worst case of extra writing past the end of the match occurs when | ||
| 471 | // offset == 1 and length == 1; the last copy will read from byte positions | ||
| 472 | // [0..7] and write to [4..11], whereas it was only supposed to write to | ||
| 473 | // position 1. Thus, ten excess bytes. | ||
| 474 | // | ||
| 475 | // ---- | ||
| 476 | // | ||
| 477 | // That "10 byte overrun" worst case is confirmed by Go's | ||
| 478 | // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy | ||
| 479 | // and finishSlowForwardCopy algorithm. | ||
| 480 | // | ||
| 481 | // if length > len(dst)-d-10 { | ||
| 482 | // goto verySlowForwardCopy | ||
| 483 | // } | ||
| 484 | SUB $10, R_TMP2, R_TMP2 | ||
| 485 | CMP R_TMP2, R_LEN | ||
| 486 | BGT verySlowForwardCopy | ||
| 487 | |||
| 488 | // We want to keep the offset, so we use R_TMP2 from here. | ||
| 489 | MOVD R_OFF, R_TMP2 | ||
| 490 | |||
| 491 | makeOffsetAtLeast8: | ||
| 492 | // !!! As above, expand the pattern so that offset >= 8 and we can use | ||
| 493 | // 8-byte load/stores. | ||
| 494 | // | ||
| 495 | // for offset < 8 { | ||
| 496 | // copy 8 bytes from dst[d-offset:] to dst[d:] | ||
| 497 | // length -= offset | ||
| 498 | // d += offset | ||
| 499 | // offset += offset | ||
| 500 | // // The two previous lines together means that d-offset, and therefore | ||
| 501 | // // R_TMP3, is unchanged. | ||
| 502 | // } | ||
| 503 | CMP $8, R_TMP2 | ||
| 504 | BGE fixUpSlowForwardCopy | ||
| 505 | MOVD (R_TMP3), R_TMP1 | ||
| 506 | MOVD R_TMP1, (R_DST) | ||
| 507 | SUB R_TMP2, R_LEN, R_LEN | ||
| 508 | ADD R_TMP2, R_DST, R_DST | ||
| 509 | ADD R_TMP2, R_TMP2, R_TMP2 | ||
| 510 | B makeOffsetAtLeast8 | ||
| 511 | |||
| 512 | fixUpSlowForwardCopy: | ||
| 513 | // !!! Add length (which might be negative now) to d (implied by R_DST being | ||
| 514 | // &dst[d]) so that d ends up at the right place when we jump back to the | ||
| 515 | // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if | ||
| 516 | // length is positive, copying the remaining length bytes will write to the | ||
| 517 | // right place. | ||
| 518 | MOVD R_DST, R_TMP0 | ||
| 519 | ADD R_LEN, R_DST, R_DST | ||
| 520 | |||
| 521 | finishSlowForwardCopy: | ||
| 522 | // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative | ||
| 523 | // length means that we overrun, but as above, that will be fixed up by | ||
| 524 | // subsequent iterations of the outermost loop. | ||
| 525 | MOVD $0, R1 | ||
| 526 | CMP R1, R_LEN | ||
| 527 | BLE loop | ||
| 528 | MOVD (R_TMP3), R_TMP1 | ||
| 529 | MOVD R_TMP1, (R_TMP0) | ||
| 530 | ADD $8, R_TMP3, R_TMP3 | ||
| 531 | ADD $8, R_TMP0, R_TMP0 | ||
| 532 | SUB $8, R_LEN, R_LEN | ||
| 533 | B finishSlowForwardCopy | ||
| 534 | |||
| 535 | verySlowForwardCopy: | ||
| 536 | // verySlowForwardCopy is a simple implementation of forward copy. In C | ||
| 537 | // parlance, this is a do/while loop instead of a while loop, since we know | ||
| 538 | // that length > 0. In Go syntax: | ||
| 539 | // | ||
| 540 | // for { | ||
| 541 | // dst[d] = dst[d - offset] | ||
| 542 | // d++ | ||
| 543 | // length-- | ||
| 544 | // if length == 0 { | ||
| 545 | // break | ||
| 546 | // } | ||
| 547 | // } | ||
| 548 | MOVB (R_TMP3), R_TMP1 | ||
| 549 | MOVB R_TMP1, (R_DST) | ||
| 550 | ADD $1, R_TMP3, R_TMP3 | ||
| 551 | ADD $1, R_DST, R_DST | ||
| 552 | SUB $1, R_LEN, R_LEN | ||
| 553 | CBNZ R_LEN, verySlowForwardCopy | ||
| 554 | B loop | ||
| 555 | |||
| 556 | // The code above handles copy tags. | ||
| 557 | // ---------------------------------------- | ||
| 558 | |||
| 559 | end: | ||
| 560 | // This is the end of the "for s < len(src)". | ||
| 561 | // | ||
| 562 | // if d != len(dst) { etc } | ||
| 563 | CMP R_DEND, R_DST | ||
| 564 | BNE errCorrupt | ||
| 565 | |||
| 566 | // return 0 | ||
| 567 | MOVD $0, ret+48(FP) | ||
| 568 | RET | ||
| 569 | |||
| 570 | errCorrupt: | ||
| 571 | // return decodeErrCodeCorrupt | ||
| 572 | MOVD $1, R_TMP0 | ||
| 573 | MOVD R_TMP0, ret+48(FP) | ||
| 574 | RET | ||