Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_amd64.s')
-rw-r--r--  vendor/github.com/klauspost/compress/s2/decode_amd64.s  568
1 file changed, 568 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
new file mode 100644
index 0000000..9b105e0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
@@ -0,0 +1,568 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Copyright (c) 2019 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"

#define R_TMP0 AX
#define R_TMP1 BX
#define R_LEN CX
#define R_OFF DX
#define R_SRC SI
#define R_DST DI
#define R_DBASE R8
#define R_DLEN R9
#define R_DEND R10
#define R_SBASE R11
#define R_SLEN R12
#define R_SEND R13
#define R_TMP2 R14
#define R_TMP3 R15

// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".

// func s2Decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
//	- R_TMP0	scratch
//	- R_TMP1	scratch
//	- R_LEN	length or x (shared)
//	- R_OFF	offset
//	- R_SRC	&src[s]
//	- R_DST	&dst[d]
//	+ R_DBASE	dst_base
//	+ R_DLEN	dst_len
//	+ R_DEND	dst_base + dst_len
//	+ R_SBASE	src_base
//	+ R_SLEN	src_len
//	+ R_SEND	src_base + src_len
//	- R_TMP2	used by doCopy
//	- R_TMP3	used by doCopy
//
// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
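//
// For orientation, here is a rough sketch of the reference Go loop this file
// mirrors (simplified; the real code lives in decode_other.go):
//
//	for s < len(src) {
//		switch src[s] & 0x03 {
//		case tagLiteral:
//			// decode length and any extra length bytes, then:
//			copy(dst[d:], src[s:s+length])
//		default: // tagCopy1, tagCopy2, tagCopy4
//			// decode offset and length (offset 0 in a tagCopy1 means
//			// "repeat the previous offset"), then:
//			forwardCopy(dst[d:d+length], dst[d-offset:])
//		}
//		d += length
//	}
//	if d != len(dst) {
//		return decodeErrCodeCorrupt
//	}
//	return 0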
TEXT ·s2Decode(SB), NOSPLIT, $48-56
	// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DST
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SRC
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND
	XORQ R_OFF, R_OFF

loop:
	// for s < len(src)
	CMPQ R_SRC, R_SEND
	JEQ end

	// R_LEN = uint32(src[s])
	//
	// switch src[s] & 0x03
	MOVBLZX (R_SRC), R_LEN
	MOVL R_LEN, R_TMP1
	ANDL $3, R_TMP1
	CMPL R_TMP1, $1
	JAE tagCopy
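
	// The two low bits of the tag byte select the case: 00 is a literal,
	// while 01, 10 and 11 are the tagCopy1, tagCopy2 and tagCopy4 forms, which
	// is why the single unsigned compare against 1 above separates literals
	// from copies.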

	// ----------------------------------------
	// The code below handles literal tags.

	// case tagLiteral:
	// x := uint32(src[s] >> 2)
	// switch
	SHRL $2, R_LEN
	CMPL R_LEN, $60
	JAE tagLit60Plus

	// case x < 60:
	// s++
	INCQ R_SRC

doLit:
	// This is the end of the inner "switch", when we have a literal tag.
	//
	// We assume that R_LEN == x and x fits in a uint32, where x is the variable
	// used in the pure Go decode_other.go code.

	// length = int(x) + 1
	//
	// Unlike the pure Go code, we don't need to check if length <= 0 because
	// R_LEN can hold 64 bits, so the increment cannot overflow.
	INCQ R_LEN

	// Prepare to check if copying length bytes will run past the end of dst or
	// src.
	//
	// R_TMP0 = len(dst) - d
	// R_TMP1 = len(src) - s
	MOVQ R_DEND, R_TMP0
	SUBQ R_DST, R_TMP0
	MOVQ R_SEND, R_TMP1
	SUBQ R_SRC, R_TMP1

	// !!! Try a faster technique for short (16 or fewer bytes) copies.
	//
	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
	//	goto callMemmove // Fall back on calling runtime·memmove.
	// }
	//
	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
	// against 21 instead of 16, because it cannot assume that all of its input
	// is contiguous in memory and so it needs to leave enough source bytes to
	// read the next tag without refilling buffers, but Go's Decode assumes
	// contiguousness (the src argument is a []byte).
	CMPQ R_LEN, $16
	JGT callMemmove
	CMPQ R_TMP0, $16
	JLT callMemmove
	CMPQ R_TMP1, $16
	JLT callMemmove

	// !!! Implement the copy from src to dst as a 16-byte load and store.
	// (Decode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only length bytes, but that's
	// OK. If the input is a valid Snappy encoding then subsequent iterations
	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
	// non-nil error), so the overrun will be ignored.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
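	//
	// In Go terms, this fast path is roughly the following sketch (not the
	// actual decode_other.go code, which simply uses copy for literals):
	//
	//	if length <= 16 && len(dst)-d >= 16 && len(src)-s >= 16 {
	//		copy(dst[d:d+16], src[s:s+16]) // may copy more than length bytes
	//		d += length
	//		s += length
	//		continue
	//	}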
	MOVOU 0(R_SRC), X0
	MOVOU X0, 0(R_DST)

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP loop

callMemmove:
	// if length > len(dst)-d || length > len(src)-s { etc }
	CMPQ R_LEN, R_TMP0
	JGT errCorrupt
	CMPQ R_LEN, R_TMP1
	JGT errCorrupt

	// copy(dst[d:], src[s:s+length])
	//
	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
	// R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to
	// spill those three registers (and R_OFF) to the stack, to save our local
	// variables across the CALL.
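	//
	// For reference, runtime·memmove takes (to, from unsafe.Pointer, n uintptr)
	// on the stack here, so 0(SP), 8(SP) and 16(SP) hold its arguments, while
	// 24(SP) through 48(SP) are spill slots for our own locals.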
	MOVQ R_DST, 0(SP)
	MOVQ R_SRC, 8(SP)
	MOVQ R_LEN, 16(SP)
	MOVQ R_DST, 24(SP)
	MOVQ R_SRC, 32(SP)
	MOVQ R_LEN, 40(SP)
	MOVQ R_OFF, 48(SP)
	CALL runtime·memmove(SB)

	// Restore local variables: unspill registers from the stack and
	// re-calculate R_DBASE-R_SEND.
	MOVQ 24(SP), R_DST
	MOVQ 32(SP), R_SRC
	MOVQ 40(SP), R_LEN
	MOVQ 48(SP), R_OFF
	MOVQ dst_base+0(FP), R_DBASE
	MOVQ dst_len+8(FP), R_DLEN
	MOVQ R_DBASE, R_DEND
	ADDQ R_DLEN, R_DEND
	MOVQ src_base+24(FP), R_SBASE
	MOVQ src_len+32(FP), R_SLEN
	MOVQ R_SBASE, R_SEND
	ADDQ R_SLEN, R_SEND

	// d += length
	// s += length
	ADDQ R_LEN, R_DST
	ADDQ R_LEN, R_SRC
	JMP loop

tagLit60Plus:
	// !!! This fragment does the
	//
	// s += x - 58; if uint(s) > uint(len(src)) { etc }
	//
	// checks. In the asm version, we code it once instead of once per switch case.
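	//
	// Why x - 58: for x in 60..63 the literal length occupies x - 59 extra
	// bytes after the tag, and s still points at the tag byte at this point,
	// so advancing by (x - 59) + 1 = x - 58 consumes the tag plus those bytes.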
	ADDQ R_LEN, R_SRC
	SUBQ $58, R_SRC
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// case x == 60:
	CMPL R_LEN, $61
	JEQ tagLit61
	JA tagLit62Plus

	// x = uint32(src[s-1])
	MOVBLZX -1(R_SRC), R_LEN
	JMP doLit

tagLit61:
	// case x == 61:
	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
	MOVWLZX -2(R_SRC), R_LEN
	JMP doLit

tagLit62Plus:
	CMPL R_LEN, $62
	JA tagLit63

	// case x == 62:
	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
	// We only need three bytes, but it is safe to read one byte further back,
	// since that extra byte is just the tag, which the shift below discards.
	// x = binary.LittleEndian.Uint32(src[s-4:]) >> 8
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	JMP doLit

tagLit63:
	// case x == 63:
	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
	MOVL -4(R_SRC), R_LEN
	JMP doLit

	// The code above handles literal tags.
	// ----------------------------------------
	// The code below handles copy tags.

tagCopy4:
	// case tagCopy4:
	// s += 5
	ADDQ $5, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = 1 + int(src[s-5])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
	MOVLQZX -4(R_SRC), R_OFF
	JMP doCopy

tagCopy2:
	// case tagCopy2:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = 1 + int(src[s-3])>>2
	SHRQ $2, R_LEN
	INCQ R_LEN

	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
	MOVWQZX -2(R_SRC), R_OFF
	JMP doCopy

tagCopy:
	// We have a copy tag. We assume that:
	//	- R_TMP1 == src[s] & 0x03
	//	- R_LEN == src[s]
	CMPQ R_TMP1, $2
	JEQ tagCopy2
	JA tagCopy4

	// case tagCopy1:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
	// length = 4 + int(src[s-2])>>2&0x7
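	//
	// tagCopy1 layout (a sketch of the format): the tag byte holds the two tag
	// bits, three bits of (length - 4) and the top three bits of an 11-bit
	// offset; the next byte holds the low 8 offset bits. In S2, unlike Snappy,
	// a decoded offset of zero is not corrupt: it means "repeat the previous
	// offset" and is handled below.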
	MOVBQZX -1(R_SRC), R_TMP1
	MOVQ R_LEN, R_TMP0
	SHRQ $2, R_LEN
	ANDQ $0xe0, R_TMP0
	ANDQ $7, R_LEN
	SHLQ $3, R_TMP0
	ADDQ $4, R_LEN
	ORQ R_TMP1, R_TMP0

	// check if repeat code, ZF set by ORQ.
	JZ repeatCode

	// This is a regular copy: transfer our temporary value to R_OFF (the offset).
	MOVQ R_TMP0, R_OFF
	JMP doCopy

	// This is a repeat code.
repeatCode:
	// If length < 9, reuse last offset, with the length already calculated.
	CMPQ R_LEN, $9
	JL doCopyRepeat

	// Read additional bytes for length.
	JE repeatLen1

	// Rare, so the extra branch shouldn't hurt too much.
	CMPQ R_LEN, $10
	JE repeatLen2
	JMP repeatLen3
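
	// In Go terms, the dispatch above is roughly (a sketch):
	//
	//	switch {
	//	case length < 9: // length fully encoded in the tag; reuse last offset
	//	case length == 9: // one extra length byte follows
	//	case length == 10: // two extra length bytes follow
	//	default: // length == 11: three extra length bytes follow
	//	}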

	// Read repeat lengths.
repeatLen1:
	// s++
	ADDQ $1, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = src[s-1] + 8
	MOVBQZX -1(R_SRC), R_LEN
	ADDL $8, R_LEN
	JMP doCopyRepeat

repeatLen2:
	// s += 2
	ADDQ $2, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + (1 << 8)
	MOVWQZX -2(R_SRC), R_LEN
	ADDL $260, R_LEN
	JMP doCopyRepeat

repeatLen3:
	// s += 3
	ADDQ $3, R_SRC

	// if uint(s) > uint(len(src)) { etc }
	CMPQ R_SRC, R_SEND
	JA errCorrupt

	// length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + (1 << 16)
	// Read one byte further back (just part of the tag, shifted out)
	MOVL -4(R_SRC), R_LEN
	SHRL $8, R_LEN
	ADDL $65540, R_LEN
	JMP doCopyRepeat

doCopy:
	// This is the end of the outer "switch", when we have a copy tag.
	//
	// We assume that:
	//	- R_LEN == length && R_LEN > 0
	//	- R_OFF == offset

	// if d < offset { etc }
	MOVQ R_DST, R_TMP1
	SUBQ R_DBASE, R_TMP1
	CMPQ R_TMP1, R_OFF
	JLT errCorrupt

	// Repeat values can skip the test above, since any offset > 0 will be in dst.
doCopyRepeat:
	// if offset <= 0 { etc }
	CMPQ R_OFF, $0
	JLE errCorrupt

	// if length > len(dst)-d { etc }
	MOVQ R_DEND, R_TMP1
	SUBQ R_DST, R_TMP1
	CMPQ R_LEN, R_TMP1
	JGT errCorrupt

	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
	//
	// Set:
	//	- R_TMP2 = len(dst)-d
	//	- R_TMP3 = &dst[d-offset]
	MOVQ R_DEND, R_TMP2
	SUBQ R_DST, R_TMP2
	MOVQ R_DST, R_TMP3
	SUBQ R_OFF, R_TMP3

	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
	//
	// First, try using two 8-byte load/stores, similar to the doLit technique
	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
	// and not one 16-byte load/store, and the first store has to be before the
	// second load, due to the overlap if offset is in the range [8, 16).
	//
	// if length > 16 || offset < 8 || len(dst)-d < 16 {
	//	goto slowForwardCopy
	// }
	// copy 16 bytes
	// d += length
	CMPQ R_LEN, $16
	JGT slowForwardCopy
	CMPQ R_OFF, $8
	JLT slowForwardCopy
	CMPQ R_TMP2, $16
	JLT slowForwardCopy
	MOVQ 0(R_TMP3), R_TMP0
	MOVQ R_TMP0, 0(R_DST)
	MOVQ 8(R_TMP3), R_TMP1
	MOVQ R_TMP1, 8(R_DST)
	ADDQ R_LEN, R_DST
	JMP loop

slowForwardCopy:
	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
	// can still try 8-byte load/stores, provided we can overrun up to 10 extra
	// bytes. As above, the overrun will be fixed up by subsequent iterations
	// of the outermost loop.
	//
	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
	// commentary says:
	//
	// ----
	//
	// The main part of this loop is a simple copy of eight bytes at a time
	// until we've copied (at least) the requested amount of bytes. However,
	// if d and d-offset are less than eight bytes apart (indicating a
	// repeating pattern of length < 8), we first need to expand the pattern in
	// order to get the correct results. For instance, if the buffer looks like
	// this, with the eight-byte <d-offset> and <d> patterns marked as
	// intervals:
	//
	//	abxxxxxxxxxxxx
	//	[------]           d-offset
	//	  [------]         d
	//
	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
	// once, after which we can move <d> two bytes without moving <d-offset>:
	//
	//	ababxxxxxxxxxx
	//	[------]           d-offset
	//	    [------]       d
	//
	// and repeat the exercise until the two no longer overlap.
	//
	// This allows us to do very well in the special case of one single byte
	// repeated many times, without taking a big hit for more general cases.
	//
	// The worst case of extra writing past the end of the match occurs when
	// offset == 1 and length == 1; the last copy will read from byte positions
	// [0..7] and write to [4..11], whereas it was only supposed to write to
	// position 1. Thus, ten excess bytes.
	//
	// ----
	//
	// That "10 byte overrun" worst case is confirmed by Go's
	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
	// and finishSlowForwardCopy algorithm.
	//
	// if length > len(dst)-d-10 {
	//	goto verySlowForwardCopy
	// }
	SUBQ $10, R_TMP2
	CMPQ R_LEN, R_TMP2
	JGT verySlowForwardCopy

	// We want to keep the offset, so we use R_TMP2 from here.
	MOVQ R_OFF, R_TMP2

makeOffsetAtLeast8:
	// !!! As above, expand the pattern so that offset >= 8 and we can use
	// 8-byte load/stores.
	//
	// for offset < 8 {
	//	copy 8 bytes from dst[d-offset:] to dst[d:]
	//	length -= offset
	//	d      += offset
	//	offset += offset
	//	// The two previous lines together mean that d-offset, and therefore
	//	// R_TMP3, is unchanged.
	// }
	CMPQ R_TMP2, $8
	JGE fixUpSlowForwardCopy
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_DST)
	SUBQ R_TMP2, R_LEN
	ADDQ R_TMP2, R_DST
	ADDQ R_TMP2, R_TMP2
	JMP makeOffsetAtLeast8

fixUpSlowForwardCopy:
	// !!! Add length (which might be negative now) to d (implied by R_DST being
	// &dst[d]) so that d ends up at the right place when we jump back to the
	// top of the loop. Before we do that, though, we save R_DST to R_TMP0 so
	// that, if length is positive, copying the remaining length bytes will
	// write to the right place.
	MOVQ R_DST, R_TMP0
	ADDQ R_LEN, R_DST

finishSlowForwardCopy:
	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
	// length means that we overrun, but as above, that will be fixed up by
	// subsequent iterations of the outermost loop.
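	//
	// Rough Go equivalent of fixUpSlowForwardCopy plus this loop (a sketch;
	// pos and srcPos are illustrative names, not variables from the pure Go
	// decoder):
	//
	//	pos := d
	//	d += length // final value of d; length may already be <= 0 here
	//	for length > 0 {
	//		copy(dst[pos:pos+8], dst[srcPos:srcPos+8]) // may write past d
	//		pos += 8
	//		srcPos += 8
	//		length -= 8
	//	}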
	CMPQ R_LEN, $0
	JLE loop
	MOVQ (R_TMP3), R_TMP1
	MOVQ R_TMP1, (R_TMP0)
	ADDQ $8, R_TMP3
	ADDQ $8, R_TMP0
	SUBQ $8, R_LEN
	JMP finishSlowForwardCopy

verySlowForwardCopy:
	// verySlowForwardCopy is a simple implementation of forward copy. In C
	// parlance, this is a do/while loop instead of a while loop, since we know
	// that length > 0. In Go syntax:
	//
	// for {
	//	dst[d] = dst[d - offset]
	//	d++
	//	length--
	//	if length == 0 {
	//		break
	//	}
	// }
	MOVB (R_TMP3), R_TMP1
	MOVB R_TMP1, (R_DST)
	INCQ R_TMP3
	INCQ R_DST
	DECQ R_LEN
	JNZ verySlowForwardCopy
	JMP loop

	// The code above handles copy tags.
	// ----------------------------------------

end:
	// This is the end of the "for s < len(src)".
	//
	// if d != len(dst) { etc }
	CMPQ R_DST, R_DEND
	JNE errCorrupt

	// return 0
	MOVQ $0, ret+48(FP)
	RET

errCorrupt:
	// return decodeErrCodeCorrupt
	MOVQ $1, ret+48(FP)
	RET
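
// The pure Go wrapper in this package is expected to map any non-zero return
// value (decodeErrCodeCorrupt above) to a corruption error and discard dst;
// a return value of 0 means dst was filled exactly.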