diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/decode_arm64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/s2/decode_arm64.s | 574 |
1 files changed, 574 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s new file mode 100644 index 0000000..4b63d50 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s | |||
@@ -0,0 +1,574 @@ | |||
1 | // Copyright 2020 The Go Authors. All rights reserved. | ||
2 | // Use of this source code is governed by a BSD-style | ||
3 | // license that can be found in the LICENSE file. | ||
4 | |||
5 | // +build !appengine | ||
6 | // +build gc | ||
7 | // +build !noasm | ||
8 | |||
9 | #include "textflag.h" | ||
10 | |||
11 | #define R_TMP0 R2 | ||
12 | #define R_TMP1 R3 | ||
13 | #define R_LEN R4 | ||
14 | #define R_OFF R5 | ||
15 | #define R_SRC R6 | ||
16 | #define R_DST R7 | ||
17 | #define R_DBASE R8 | ||
18 | #define R_DLEN R9 | ||
19 | #define R_DEND R10 | ||
20 | #define R_SBASE R11 | ||
21 | #define R_SLEN R12 | ||
22 | #define R_SEND R13 | ||
23 | #define R_TMP2 R14 | ||
24 | #define R_TMP3 R15 | ||
25 | |||
26 | // TEST_SRC will check if R_SRC is <= SRC_END | ||
27 | #define TEST_SRC() \ | ||
28 | CMP R_SEND, R_SRC \ | ||
29 | BGT errCorrupt | ||
30 | |||
31 | // MOVD R_SRC, R_TMP1 | ||
32 | // SUB R_SBASE, R_TMP1, R_TMP1 | ||
33 | // CMP R_SLEN, R_TMP1 | ||
34 | // BGT errCorrupt | ||
35 | |||
36 | // The asm code generally follows the pure Go code in decode_other.go, except | ||
37 | // where marked with a "!!!". | ||
38 | |||
39 | // func decode(dst, src []byte) int | ||
40 | // | ||
41 | // All local variables fit into registers. The non-zero stack size is only to | ||
42 | // spill registers and push args when issuing a CALL. The register allocation: | ||
43 | // - R_TMP0 scratch | ||
44 | // - R_TMP1 scratch | ||
45 | // - R_LEN length or x | ||
46 | // - R_OFF offset | ||
47 | // - R_SRC &src[s] | ||
48 | // - R_DST &dst[d] | ||
49 | // + R_DBASE dst_base | ||
50 | // + R_DLEN dst_len | ||
51 | // + R_DEND dst_base + dst_len | ||
52 | // + R_SBASE src_base | ||
53 | // + R_SLEN src_len | ||
54 | // + R_SEND src_base + src_len | ||
55 | // - R_TMP2 used by doCopy | ||
56 | // - R_TMP3 used by doCopy | ||
57 | // | ||
58 | // The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the | ||
59 | // function, and after a CALL returns, and are not otherwise modified. | ||
60 | // | ||
61 | // The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST. | ||
62 | // The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC. | ||
63 | TEXT ·s2Decode(SB), NOSPLIT, $56-64 | ||
64 | // Initialize R_SRC, R_DST and R_DBASE-R_SEND. | ||
65 | MOVD dst_base+0(FP), R_DBASE | ||
66 | MOVD dst_len+8(FP), R_DLEN | ||
67 | MOVD R_DBASE, R_DST | ||
68 | MOVD R_DBASE, R_DEND | ||
69 | ADD R_DLEN, R_DEND, R_DEND | ||
70 | MOVD src_base+24(FP), R_SBASE | ||
71 | MOVD src_len+32(FP), R_SLEN | ||
72 | MOVD R_SBASE, R_SRC | ||
73 | MOVD R_SBASE, R_SEND | ||
74 | ADD R_SLEN, R_SEND, R_SEND | ||
75 | MOVD $0, R_OFF | ||
76 | |||
77 | loop: | ||
78 | // for s < len(src) | ||
79 | CMP R_SEND, R_SRC | ||
80 | BEQ end | ||
81 | |||
82 | // R_LEN = uint32(src[s]) | ||
83 | // | ||
84 | // switch src[s] & 0x03 | ||
85 | MOVBU (R_SRC), R_LEN | ||
86 | MOVW R_LEN, R_TMP1 | ||
87 | ANDW $3, R_TMP1 | ||
88 | MOVW $1, R1 | ||
89 | CMPW R1, R_TMP1 | ||
90 | BGE tagCopy | ||
91 | |||
92 | // ---------------------------------------- | ||
93 | // The code below handles literal tags. | ||
94 | |||
95 | // case tagLiteral: | ||
96 | // x := uint32(src[s] >> 2) | ||
97 | // switch | ||
98 | MOVW $60, R1 | ||
99 | LSRW $2, R_LEN, R_LEN | ||
100 | CMPW R_LEN, R1 | ||
101 | BLS tagLit60Plus | ||
102 | |||
103 | // case x < 60: | ||
104 | // s++ | ||
105 | ADD $1, R_SRC, R_SRC | ||
106 | |||
107 | doLit: | ||
108 | // This is the end of the inner "switch", when we have a literal tag. | ||
109 | // | ||
110 | // We assume that R_LEN == x and x fits in a uint32, where x is the variable | ||
111 | // used in the pure Go decode_other.go code. | ||
112 | |||
113 | // length = int(x) + 1 | ||
114 | // | ||
115 | // Unlike the pure Go code, we don't need to check if length <= 0 because | ||
116 | // R_LEN can hold 64 bits, so the increment cannot overflow. | ||
117 | ADD $1, R_LEN, R_LEN | ||
118 | |||
119 | // Prepare to check if copying length bytes will run past the end of dst or | ||
120 | // src. | ||
121 | // | ||
122 | // R_TMP0 = len(dst) - d | ||
123 | // R_TMP1 = len(src) - s | ||
124 | MOVD R_DEND, R_TMP0 | ||
125 | SUB R_DST, R_TMP0, R_TMP0 | ||
126 | MOVD R_SEND, R_TMP1 | ||
127 | SUB R_SRC, R_TMP1, R_TMP1 | ||
128 | |||
129 | // !!! Try a faster technique for short (16 or fewer bytes) copies. | ||
130 | // | ||
131 | // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { | ||
132 | // goto callMemmove // Fall back on calling runtime·memmove. | ||
133 | // } | ||
134 | // | ||
135 | // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s | ||
136 | // against 21 instead of 16, because it cannot assume that all of its input | ||
137 | // is contiguous in memory and so it needs to leave enough source bytes to | ||
138 | // read the next tag without refilling buffers, but Go's Decode assumes | ||
139 | // contiguousness (the src argument is a []byte). | ||
140 | CMP $16, R_LEN | ||
141 | BGT callMemmove | ||
142 | CMP $16, R_TMP0 | ||
143 | BLT callMemmove | ||
144 | CMP $16, R_TMP1 | ||
145 | BLT callMemmove | ||
146 | |||
147 | // !!! Implement the copy from src to dst as a 16-byte load and store. | ||
148 | // (Decode's documentation says that dst and src must not overlap.) | ||
149 | // | ||
150 | // This always copies 16 bytes, instead of only length bytes, but that's | ||
151 | // OK. If the input is a valid Snappy encoding then subsequent iterations | ||
152 | // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a | ||
153 | // non-nil error), so the overrun will be ignored. | ||
154 | // | ||
155 | // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or | ||
156 | // 16-byte loads and stores. This technique probably wouldn't be as | ||
157 | // effective on architectures that are fussier about alignment. | ||
158 | LDP 0(R_SRC), (R_TMP2, R_TMP3) | ||
159 | STP (R_TMP2, R_TMP3), 0(R_DST) | ||
160 | |||
161 | // d += length | ||
162 | // s += length | ||
163 | ADD R_LEN, R_DST, R_DST | ||
164 | ADD R_LEN, R_SRC, R_SRC | ||
165 | B loop | ||
166 | |||
167 | callMemmove: | ||
168 | // if length > len(dst)-d || length > len(src)-s { etc } | ||
169 | CMP R_TMP0, R_LEN | ||
170 | BGT errCorrupt | ||
171 | CMP R_TMP1, R_LEN | ||
172 | BGT errCorrupt | ||
173 | |||
174 | // copy(dst[d:], src[s:s+length]) | ||
175 | // | ||
176 | // This means calling runtime·memmove(&dst[d], &src[s], length), so we push | ||
177 | // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those | ||
178 | // three registers to the stack, to save local variables across the CALL. | ||
179 | MOVD R_DST, 8(RSP) | ||
180 | MOVD R_SRC, 16(RSP) | ||
181 | MOVD R_LEN, 24(RSP) | ||
182 | MOVD R_DST, 32(RSP) | ||
183 | MOVD R_SRC, 40(RSP) | ||
184 | MOVD R_LEN, 48(RSP) | ||
185 | MOVD R_OFF, 56(RSP) | ||
186 | CALL runtime·memmove(SB) | ||
187 | |||
188 | // Restore local variables: unspill registers from the stack and | ||
189 | // re-calculate R_DBASE-R_SEND. | ||
190 | MOVD 32(RSP), R_DST | ||
191 | MOVD 40(RSP), R_SRC | ||
192 | MOVD 48(RSP), R_LEN | ||
193 | MOVD 56(RSP), R_OFF | ||
194 | MOVD dst_base+0(FP), R_DBASE | ||
195 | MOVD dst_len+8(FP), R_DLEN | ||
196 | MOVD R_DBASE, R_DEND | ||
197 | ADD R_DLEN, R_DEND, R_DEND | ||
198 | MOVD src_base+24(FP), R_SBASE | ||
199 | MOVD src_len+32(FP), R_SLEN | ||
200 | MOVD R_SBASE, R_SEND | ||
201 | ADD R_SLEN, R_SEND, R_SEND | ||
202 | |||
203 | // d += length | ||
204 | // s += length | ||
205 | ADD R_LEN, R_DST, R_DST | ||
206 | ADD R_LEN, R_SRC, R_SRC | ||
207 | B loop | ||
208 | |||
209 | tagLit60Plus: | ||
210 | // !!! This fragment does the | ||
211 | // | ||
212 | // s += x - 58; if uint(s) > uint(len(src)) { etc } | ||
213 | // | ||
214 | // checks. In the asm version, we code it once instead of once per switch case. | ||
215 | ADD R_LEN, R_SRC, R_SRC | ||
216 | SUB $58, R_SRC, R_SRC | ||
217 | TEST_SRC() | ||
218 | |||
219 | // case x == 60: | ||
220 | MOVW $61, R1 | ||
221 | CMPW R1, R_LEN | ||
222 | BEQ tagLit61 | ||
223 | BGT tagLit62Plus | ||
224 | |||
225 | // x = uint32(src[s-1]) | ||
226 | MOVBU -1(R_SRC), R_LEN | ||
227 | B doLit | ||
228 | |||
229 | tagLit61: | ||
230 | // case x == 61: | ||
231 | // x = uint32(src[s-2]) | uint32(src[s-1])<<8 | ||
232 | MOVHU -2(R_SRC), R_LEN | ||
233 | B doLit | ||
234 | |||
235 | tagLit62Plus: | ||
236 | CMPW $62, R_LEN | ||
237 | BHI tagLit63 | ||
238 | |||
239 | // case x == 62: | ||
240 | // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 | ||
241 | MOVHU -3(R_SRC), R_LEN | ||
242 | MOVBU -1(R_SRC), R_TMP1 | ||
243 | ORR R_TMP1<<16, R_LEN | ||
244 | B doLit | ||
245 | |||
246 | tagLit63: | ||
247 | // case x == 63: | ||
248 | // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 | ||
249 | MOVWU -4(R_SRC), R_LEN | ||
250 | B doLit | ||
251 | |||
252 | // The code above handles literal tags. | ||
253 | // ---------------------------------------- | ||
254 | // The code below handles copy tags. | ||
255 | |||
256 | tagCopy4: | ||
257 | // case tagCopy4: | ||
258 | // s += 5 | ||
259 | ADD $5, R_SRC, R_SRC | ||
260 | |||
261 | // if uint(s) > uint(len(src)) { etc } | ||
262 | MOVD R_SRC, R_TMP1 | ||
263 | SUB R_SBASE, R_TMP1, R_TMP1 | ||
264 | CMP R_SLEN, R_TMP1 | ||
265 | BGT errCorrupt | ||
266 | |||
267 | // length = 1 + int(src[s-5])>>2 | ||
268 | MOVD $1, R1 | ||
269 | ADD R_LEN>>2, R1, R_LEN | ||
270 | |||
271 | // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) | ||
272 | MOVWU -4(R_SRC), R_OFF | ||
273 | B doCopy | ||
274 | |||
275 | tagCopy2: | ||
276 | // case tagCopy2: | ||
277 | // s += 3 | ||
278 | ADD $3, R_SRC, R_SRC | ||
279 | |||
280 | // if uint(s) > uint(len(src)) { etc } | ||
281 | TEST_SRC() | ||
282 | |||
283 | // length = 1 + int(src[s-3])>>2 | ||
284 | MOVD $1, R1 | ||
285 | ADD R_LEN>>2, R1, R_LEN | ||
286 | |||
287 | // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) | ||
288 | MOVHU -2(R_SRC), R_OFF | ||
289 | B doCopy | ||
290 | |||
291 | tagCopy: | ||
292 | // We have a copy tag. We assume that: | ||
293 | // - R_TMP1 == src[s] & 0x03 | ||
294 | // - R_LEN == src[s] | ||
295 | CMP $2, R_TMP1 | ||
296 | BEQ tagCopy2 | ||
297 | BGT tagCopy4 | ||
298 | |||
299 | // case tagCopy1: | ||
300 | // s += 2 | ||
301 | ADD $2, R_SRC, R_SRC | ||
302 | |||
303 | // if uint(s) > uint(len(src)) { etc } | ||
304 | TEST_SRC() | ||
305 | |||
306 | // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) | ||
307 | // Calculate offset in R_TMP0 in case it is a repeat. | ||
308 | MOVD R_LEN, R_TMP0 | ||
309 | AND $0xe0, R_TMP0 | ||
310 | MOVBU -1(R_SRC), R_TMP1 | ||
311 | ORR R_TMP0<<3, R_TMP1, R_TMP0 | ||
312 | |||
313 | // length = 4 + int(src[s-2])>>2&0x7 | ||
314 | MOVD $7, R1 | ||
315 | AND R_LEN>>2, R1, R_LEN | ||
316 | ADD $4, R_LEN, R_LEN | ||
317 | |||
318 | // check if repeat code with offset 0. | ||
319 | CMP $0, R_TMP0 | ||
320 | BEQ repeatCode | ||
321 | |||
322 | // This is a regular copy, transfer our temporary value to R_OFF (offset) | ||
323 | MOVD R_TMP0, R_OFF | ||
324 | B doCopy | ||
325 | |||
326 | // This is a repeat code. | ||
327 | repeatCode: | ||
328 | // If length < 9, reuse last offset, with the length already calculated. | ||
329 | CMP $9, R_LEN | ||
330 | BLT doCopyRepeat | ||
331 | BEQ repeatLen1 | ||
332 | CMP $10, R_LEN | ||
333 | BEQ repeatLen2 | ||
334 | |||
335 | repeatLen3: | ||
336 | // s +=3 | ||
337 | ADD $3, R_SRC, R_SRC | ||
338 | |||
339 | // if uint(s) > uint(len(src)) { etc } | ||
340 | TEST_SRC() | ||
341 | |||
342 | // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540 | ||
343 | MOVBU -1(R_SRC), R_TMP0 | ||
344 | MOVHU -3(R_SRC), R_LEN | ||
345 | ORR R_TMP0<<16, R_LEN, R_LEN | ||
346 | ADD $65540, R_LEN, R_LEN | ||
347 | B doCopyRepeat | ||
348 | |||
349 | repeatLen2: | ||
350 | // s +=2 | ||
351 | ADD $2, R_SRC, R_SRC | ||
352 | |||
353 | // if uint(s) > uint(len(src)) { etc } | ||
354 | TEST_SRC() | ||
355 | |||
356 | // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260 | ||
357 | MOVHU -2(R_SRC), R_LEN | ||
358 | ADD $260, R_LEN, R_LEN | ||
359 | B doCopyRepeat | ||
360 | |||
361 | repeatLen1: | ||
362 | // s +=1 | ||
363 | ADD $1, R_SRC, R_SRC | ||
364 | |||
365 | // if uint(s) > uint(len(src)) { etc } | ||
366 | TEST_SRC() | ||
367 | |||
368 | // length = src[s-1] + 8 | ||
369 | MOVBU -1(R_SRC), R_LEN | ||
370 | ADD $8, R_LEN, R_LEN | ||
371 | B doCopyRepeat | ||
372 | |||
373 | doCopy: | ||
374 | // This is the end of the outer "switch", when we have a copy tag. | ||
375 | // | ||
376 | // We assume that: | ||
377 | // - R_LEN == length && R_LEN > 0 | ||
378 | // - R_OFF == offset | ||
379 | |||
380 | // if d < offset { etc } | ||
381 | MOVD R_DST, R_TMP1 | ||
382 | SUB R_DBASE, R_TMP1, R_TMP1 | ||
383 | CMP R_OFF, R_TMP1 | ||
384 | BLT errCorrupt | ||
385 | |||
386 | // Repeat values can skip the test above, since any offset > 0 will be in dst. | ||
387 | doCopyRepeat: | ||
388 | |||
389 | // if offset <= 0 { etc } | ||
390 | CMP $0, R_OFF | ||
391 | BLE errCorrupt | ||
392 | |||
393 | // if length > len(dst)-d { etc } | ||
394 | MOVD R_DEND, R_TMP1 | ||
395 | SUB R_DST, R_TMP1, R_TMP1 | ||
396 | CMP R_TMP1, R_LEN | ||
397 | BGT errCorrupt | ||
398 | |||
399 | // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length | ||
400 | // | ||
401 | // Set: | ||
402 | // - R_TMP2 = len(dst)-d | ||
403 | // - R_TMP3 = &dst[d-offset] | ||
404 | MOVD R_DEND, R_TMP2 | ||
405 | SUB R_DST, R_TMP2, R_TMP2 | ||
406 | MOVD R_DST, R_TMP3 | ||
407 | SUB R_OFF, R_TMP3, R_TMP3 | ||
408 | |||
409 | // !!! Try a faster technique for short (16 or fewer bytes) forward copies. | ||
410 | // | ||
411 | // First, try using two 8-byte load/stores, similar to the doLit technique | ||
412 | // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is | ||
413 | // still OK if offset >= 8. Note that this has to be two 8-byte load/stores | ||
414 | // and not one 16-byte load/store, and the first store has to be before the | ||
415 | // second load, due to the overlap if offset is in the range [8, 16). | ||
416 | // | ||
417 | // if length > 16 || offset < 8 || len(dst)-d < 16 { | ||
418 | // goto slowForwardCopy | ||
419 | // } | ||
420 | // copy 16 bytes | ||
421 | // d += length | ||
422 | CMP $16, R_LEN | ||
423 | BGT slowForwardCopy | ||
424 | CMP $8, R_OFF | ||
425 | BLT slowForwardCopy | ||
426 | CMP $16, R_TMP2 | ||
427 | BLT slowForwardCopy | ||
428 | MOVD 0(R_TMP3), R_TMP0 | ||
429 | MOVD R_TMP0, 0(R_DST) | ||
430 | MOVD 8(R_TMP3), R_TMP1 | ||
431 | MOVD R_TMP1, 8(R_DST) | ||
432 | ADD R_LEN, R_DST, R_DST | ||
433 | B loop | ||
434 | |||
435 | slowForwardCopy: | ||
436 | // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we | ||
437 | // can still try 8-byte load stores, provided we can overrun up to 10 extra | ||
438 | // bytes. As above, the overrun will be fixed up by subsequent iterations | ||
439 | // of the outermost loop. | ||
440 | // | ||
441 | // The C++ snappy code calls this technique IncrementalCopyFastPath. Its | ||
442 | // commentary says: | ||
443 | // | ||
444 | // ---- | ||
445 | // | ||
446 | // The main part of this loop is a simple copy of eight bytes at a time | ||
447 | // until we've copied (at least) the requested amount of bytes. However, | ||
448 | // if d and d-offset are less than eight bytes apart (indicating a | ||
449 | // repeating pattern of length < 8), we first need to expand the pattern in | ||
450 | // order to get the correct results. For instance, if the buffer looks like | ||
451 | // this, with the eight-byte <d-offset> and <d> patterns marked as | ||
452 | // intervals: | ||
453 | // | ||
454 | // abxxxxxxxxxxxx | ||
455 | // [------] d-offset | ||
456 | // [------] d | ||
457 | // | ||
458 | // a single eight-byte copy from <d-offset> to <d> will repeat the pattern | ||
459 | // once, after which we can move <d> two bytes without moving <d-offset>: | ||
460 | // | ||
461 | // ababxxxxxxxxxx | ||
462 | // [------] d-offset | ||
463 | // [------] d | ||
464 | // | ||
465 | // and repeat the exercise until the two no longer overlap. | ||
466 | // | ||
467 | // This allows us to do very well in the special case of one single byte | ||
468 | // repeated many times, without taking a big hit for more general cases. | ||
469 | // | ||
470 | // The worst case of extra writing past the end of the match occurs when | ||
471 | // offset == 1 and length == 1; the last copy will read from byte positions | ||
472 | // [0..7] and write to [4..11], whereas it was only supposed to write to | ||
473 | // position 1. Thus, ten excess bytes. | ||
474 | // | ||
475 | // ---- | ||
476 | // | ||
477 | // That "10 byte overrun" worst case is confirmed by Go's | ||
478 | // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy | ||
479 | // and finishSlowForwardCopy algorithm. | ||
480 | // | ||
481 | // if length > len(dst)-d-10 { | ||
482 | // goto verySlowForwardCopy | ||
483 | // } | ||
484 | SUB $10, R_TMP2, R_TMP2 | ||
485 | CMP R_TMP2, R_LEN | ||
486 | BGT verySlowForwardCopy | ||
487 | |||
488 | // We want to keep the offset, so we use R_TMP2 from here. | ||
489 | MOVD R_OFF, R_TMP2 | ||
490 | |||
491 | makeOffsetAtLeast8: | ||
492 | // !!! As above, expand the pattern so that offset >= 8 and we can use | ||
493 | // 8-byte load/stores. | ||
494 | // | ||
495 | // for offset < 8 { | ||
496 | // copy 8 bytes from dst[d-offset:] to dst[d:] | ||
497 | // length -= offset | ||
498 | // d += offset | ||
499 | // offset += offset | ||
500 | // // The two previous lines together means that d-offset, and therefore | ||
501 | // // R_TMP3, is unchanged. | ||
502 | // } | ||
503 | CMP $8, R_TMP2 | ||
504 | BGE fixUpSlowForwardCopy | ||
505 | MOVD (R_TMP3), R_TMP1 | ||
506 | MOVD R_TMP1, (R_DST) | ||
507 | SUB R_TMP2, R_LEN, R_LEN | ||
508 | ADD R_TMP2, R_DST, R_DST | ||
509 | ADD R_TMP2, R_TMP2, R_TMP2 | ||
510 | B makeOffsetAtLeast8 | ||
511 | |||
512 | fixUpSlowForwardCopy: | ||
513 | // !!! Add length (which might be negative now) to d (implied by R_DST being | ||
514 | // &dst[d]) so that d ends up at the right place when we jump back to the | ||
515 | // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if | ||
516 | // length is positive, copying the remaining length bytes will write to the | ||
517 | // right place. | ||
518 | MOVD R_DST, R_TMP0 | ||
519 | ADD R_LEN, R_DST, R_DST | ||
520 | |||
521 | finishSlowForwardCopy: | ||
522 | // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative | ||
523 | // length means that we overrun, but as above, that will be fixed up by | ||
524 | // subsequent iterations of the outermost loop. | ||
525 | MOVD $0, R1 | ||
526 | CMP R1, R_LEN | ||
527 | BLE loop | ||
528 | MOVD (R_TMP3), R_TMP1 | ||
529 | MOVD R_TMP1, (R_TMP0) | ||
530 | ADD $8, R_TMP3, R_TMP3 | ||
531 | ADD $8, R_TMP0, R_TMP0 | ||
532 | SUB $8, R_LEN, R_LEN | ||
533 | B finishSlowForwardCopy | ||
534 | |||
535 | verySlowForwardCopy: | ||
536 | // verySlowForwardCopy is a simple implementation of forward copy. In C | ||
537 | // parlance, this is a do/while loop instead of a while loop, since we know | ||
538 | // that length > 0. In Go syntax: | ||
539 | // | ||
540 | // for { | ||
541 | // dst[d] = dst[d - offset] | ||
542 | // d++ | ||
543 | // length-- | ||
544 | // if length == 0 { | ||
545 | // break | ||
546 | // } | ||
547 | // } | ||
548 | MOVB (R_TMP3), R_TMP1 | ||
549 | MOVB R_TMP1, (R_DST) | ||
550 | ADD $1, R_TMP3, R_TMP3 | ||
551 | ADD $1, R_DST, R_DST | ||
552 | SUB $1, R_LEN, R_LEN | ||
553 | CBNZ R_LEN, verySlowForwardCopy | ||
554 | B loop | ||
555 | |||
556 | // The code above handles copy tags. | ||
557 | // ---------------------------------------- | ||
558 | |||
559 | end: | ||
560 | // This is the end of the "for s < len(src)". | ||
561 | // | ||
562 | // if d != len(dst) { etc } | ||
563 | CMP R_DEND, R_DST | ||
564 | BNE errCorrupt | ||
565 | |||
566 | // return 0 | ||
567 | MOVD $0, ret+48(FP) | ||
568 | RET | ||
569 | |||
570 | errCorrupt: | ||
571 | // return decodeErrCodeCorrupt | ||
572 | MOVD $1, R_TMP0 | ||
573 | MOVD R_TMP0, ret+48(FP) | ||
574 | RET | ||