diff options
Diffstat (limited to 'vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s')
-rw-r--r-- | vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s | 21169 |
1 files changed, 21169 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s new file mode 100644 index 0000000..5f110d1 --- /dev/null +++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s | |||
@@ -0,0 +1,21169 @@ | |||
1 | // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. | ||
2 | |||
3 | //go:build !appengine && !noasm && gc && !noasm | ||
4 | |||
5 | #include "textflag.h" | ||
6 | |||
7 | // func _dummy_() - no-op stub: the body is a bare RET. It hosts the preprocessor block that defines GOAMD64_v3 whenever GOAMD64_v4 is defined (and v3 is not already), so the "#ifdef GOAMD64_v3" checks later in this file (TZCNTQ instead of BSFQ) also take the v3 path when only v4 is defined. | ||
8 | TEXT ·_dummy_(SB), $0 | ||
9 | #ifdef GOAMD64_v4 | ||
10 | #ifndef GOAMD64_v3 | ||
11 | #define GOAMD64_v3 | ||
12 | #endif | ||
13 | #endif | ||
14 | RET | ||
15 | |||
16 | // func encodeBlockAsm(dst []byte, src []byte) int | ||
17 | // Requires: BMI, SSE2 | ||
18 | TEXT ·encodeBlockAsm(SB), $65560-56 | ||
19 | MOVQ dst_base+0(FP), AX | ||
20 | MOVQ $0x00000200, CX | ||
21 | LEAQ 24(SP), DX | ||
22 | PXOR X0, X0 | ||
23 | |||
24 | zero_loop_encodeBlockAsm: | ||
25 | MOVOU X0, (DX) | ||
26 | MOVOU X0, 16(DX) | ||
27 | MOVOU X0, 32(DX) | ||
28 | MOVOU X0, 48(DX) | ||
29 | MOVOU X0, 64(DX) | ||
30 | MOVOU X0, 80(DX) | ||
31 | MOVOU X0, 96(DX) | ||
32 | MOVOU X0, 112(DX) | ||
33 | ADDQ $0x80, DX | ||
34 | DECQ CX | ||
35 | JNZ zero_loop_encodeBlockAsm | ||
36 | MOVL $0x00000000, 12(SP) | ||
37 | MOVQ src_len+32(FP), CX | ||
38 | LEAQ -9(CX), DX | ||
39 | LEAQ -8(CX), BX | ||
40 | MOVL BX, 8(SP) | ||
41 | SHRQ $0x05, CX | ||
42 | SUBL CX, DX | ||
43 | LEAQ (AX)(DX*1), DX | ||
44 | MOVQ DX, (SP) | ||
45 | MOVL $0x00000001, CX | ||
46 | MOVL CX, 16(SP) | ||
47 | MOVQ src_base+24(FP), DX | ||
48 | |||
49 | search_loop_encodeBlockAsm: | ||
50 | MOVL CX, BX | ||
51 | SUBL 12(SP), BX | ||
52 | SHRL $0x06, BX | ||
53 | LEAL 4(CX)(BX*1), BX | ||
54 | CMPL BX, 8(SP) | ||
55 | JAE emit_remainder_encodeBlockAsm | ||
56 | MOVQ (DX)(CX*1), SI | ||
57 | MOVL BX, 20(SP) | ||
58 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
59 | MOVQ SI, R9 | ||
60 | MOVQ SI, R10 | ||
61 | SHRQ $0x08, R10 | ||
62 | SHLQ $0x10, R9 | ||
63 | IMULQ R8, R9 | ||
64 | SHRQ $0x32, R9 | ||
65 | SHLQ $0x10, R10 | ||
66 | IMULQ R8, R10 | ||
67 | SHRQ $0x32, R10 | ||
68 | MOVL 24(SP)(R9*4), BX | ||
69 | MOVL 24(SP)(R10*4), DI | ||
70 | MOVL CX, 24(SP)(R9*4) | ||
71 | LEAL 1(CX), R9 | ||
72 | MOVL R9, 24(SP)(R10*4) | ||
73 | MOVQ SI, R9 | ||
74 | SHRQ $0x10, R9 | ||
75 | SHLQ $0x10, R9 | ||
76 | IMULQ R8, R9 | ||
77 | SHRQ $0x32, R9 | ||
78 | MOVL CX, R8 | ||
79 | SUBL 16(SP), R8 | ||
80 | MOVL 1(DX)(R8*1), R10 | ||
81 | MOVQ SI, R8 | ||
82 | SHRQ $0x08, R8 | ||
83 | CMPL R8, R10 | ||
84 | JNE no_repeat_found_encodeBlockAsm | ||
85 | LEAL 1(CX), SI | ||
86 | MOVL 12(SP), DI | ||
87 | MOVL SI, BX | ||
88 | SUBL 16(SP), BX | ||
89 | JZ repeat_extend_back_end_encodeBlockAsm | ||
90 | |||
91 | repeat_extend_back_loop_encodeBlockAsm: | ||
92 | CMPL SI, DI | ||
93 | JBE repeat_extend_back_end_encodeBlockAsm | ||
94 | MOVB -1(DX)(BX*1), R8 | ||
95 | MOVB -1(DX)(SI*1), R9 | ||
96 | CMPB R8, R9 | ||
97 | JNE repeat_extend_back_end_encodeBlockAsm | ||
98 | LEAL -1(SI), SI | ||
99 | DECL BX | ||
100 | JNZ repeat_extend_back_loop_encodeBlockAsm | ||
101 | |||
102 | repeat_extend_back_end_encodeBlockAsm: | ||
103 | MOVL 12(SP), BX | ||
104 | CMPL BX, SI | ||
105 | JEQ emit_literal_done_repeat_emit_encodeBlockAsm | ||
106 | MOVL SI, R8 | ||
107 | MOVL SI, 12(SP) | ||
108 | LEAQ (DX)(BX*1), R9 | ||
109 | SUBL BX, R8 | ||
110 | LEAL -1(R8), BX | ||
111 | CMPL BX, $0x3c | ||
112 | JB one_byte_repeat_emit_encodeBlockAsm | ||
113 | CMPL BX, $0x00000100 | ||
114 | JB two_bytes_repeat_emit_encodeBlockAsm | ||
115 | CMPL BX, $0x00010000 | ||
116 | JB three_bytes_repeat_emit_encodeBlockAsm | ||
117 | CMPL BX, $0x01000000 | ||
118 | JB four_bytes_repeat_emit_encodeBlockAsm | ||
119 | MOVB $0xfc, (AX) | ||
120 | MOVL BX, 1(AX) | ||
121 | ADDQ $0x05, AX | ||
122 | JMP memmove_long_repeat_emit_encodeBlockAsm | ||
123 | |||
124 | four_bytes_repeat_emit_encodeBlockAsm: | ||
125 | MOVL BX, R10 | ||
126 | SHRL $0x10, R10 | ||
127 | MOVB $0xf8, (AX) | ||
128 | MOVW BX, 1(AX) | ||
129 | MOVB R10, 3(AX) | ||
130 | ADDQ $0x04, AX | ||
131 | JMP memmove_long_repeat_emit_encodeBlockAsm | ||
132 | |||
133 | three_bytes_repeat_emit_encodeBlockAsm: | ||
134 | MOVB $0xf4, (AX) | ||
135 | MOVW BX, 1(AX) | ||
136 | ADDQ $0x03, AX | ||
137 | JMP memmove_long_repeat_emit_encodeBlockAsm | ||
138 | |||
139 | two_bytes_repeat_emit_encodeBlockAsm: | ||
140 | MOVB $0xf0, (AX) | ||
141 | MOVB BL, 1(AX) | ||
142 | ADDQ $0x02, AX | ||
143 | CMPL BX, $0x40 | ||
144 | JB memmove_repeat_emit_encodeBlockAsm | ||
145 | JMP memmove_long_repeat_emit_encodeBlockAsm | ||
146 | |||
147 | one_byte_repeat_emit_encodeBlockAsm: | ||
148 | SHLB $0x02, BL | ||
149 | MOVB BL, (AX) | ||
150 | ADDQ $0x01, AX | ||
151 | |||
152 | memmove_repeat_emit_encodeBlockAsm: | ||
153 | LEAQ (AX)(R8*1), BX | ||
154 | |||
155 | // genMemMoveShort | ||
156 | CMPQ R8, $0x08 | ||
157 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 | ||
158 | CMPQ R8, $0x10 | ||
159 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 | ||
160 | CMPQ R8, $0x20 | ||
161 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 | ||
162 | JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 | ||
163 | |||
164 | emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: | ||
165 | MOVQ (R9), R10 | ||
166 | MOVQ R10, (AX) | ||
167 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm | ||
168 | |||
169 | emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: | ||
170 | MOVQ (R9), R10 | ||
171 | MOVQ -8(R9)(R8*1), R9 | ||
172 | MOVQ R10, (AX) | ||
173 | MOVQ R9, -8(AX)(R8*1) | ||
174 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm | ||
175 | |||
176 | emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: | ||
177 | MOVOU (R9), X0 | ||
178 | MOVOU -16(R9)(R8*1), X1 | ||
179 | MOVOU X0, (AX) | ||
180 | MOVOU X1, -16(AX)(R8*1) | ||
181 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm | ||
182 | |||
183 | emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: | ||
184 | MOVOU (R9), X0 | ||
185 | MOVOU 16(R9), X1 | ||
186 | MOVOU -32(R9)(R8*1), X2 | ||
187 | MOVOU -16(R9)(R8*1), X3 | ||
188 | MOVOU X0, (AX) | ||
189 | MOVOU X1, 16(AX) | ||
190 | MOVOU X2, -32(AX)(R8*1) | ||
191 | MOVOU X3, -16(AX)(R8*1) | ||
192 | |||
193 | memmove_end_copy_repeat_emit_encodeBlockAsm: | ||
194 | MOVQ BX, AX | ||
195 | JMP emit_literal_done_repeat_emit_encodeBlockAsm | ||
196 | |||
197 | memmove_long_repeat_emit_encodeBlockAsm: | ||
198 | LEAQ (AX)(R8*1), BX | ||
199 | |||
200 | // genMemMoveLong | ||
201 | MOVOU (R9), X0 | ||
202 | MOVOU 16(R9), X1 | ||
203 | MOVOU -32(R9)(R8*1), X2 | ||
204 | MOVOU -16(R9)(R8*1), X3 | ||
205 | MOVQ R8, R11 | ||
206 | SHRQ $0x05, R11 | ||
207 | MOVQ AX, R10 | ||
208 | ANDL $0x0000001f, R10 | ||
209 | MOVQ $0x00000040, R12 | ||
210 | SUBQ R10, R12 | ||
211 | DECQ R11 | ||
212 | JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 | ||
213 | LEAQ -32(R9)(R12*1), R10 | ||
214 | LEAQ -32(AX)(R12*1), R13 | ||
215 | |||
216 | emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: | ||
217 | MOVOU (R10), X4 | ||
218 | MOVOU 16(R10), X5 | ||
219 | MOVOA X4, (R13) | ||
220 | MOVOA X5, 16(R13) | ||
221 | ADDQ $0x20, R13 | ||
222 | ADDQ $0x20, R10 | ||
223 | ADDQ $0x20, R12 | ||
224 | DECQ R11 | ||
225 | JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back | ||
226 | |||
227 | emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: | ||
228 | MOVOU -32(R9)(R12*1), X4 | ||
229 | MOVOU -16(R9)(R12*1), X5 | ||
230 | MOVOA X4, -32(AX)(R12*1) | ||
231 | MOVOA X5, -16(AX)(R12*1) | ||
232 | ADDQ $0x20, R12 | ||
233 | CMPQ R8, R12 | ||
234 | JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 | ||
235 | MOVOU X0, (AX) | ||
236 | MOVOU X1, 16(AX) | ||
237 | MOVOU X2, -32(AX)(R8*1) | ||
238 | MOVOU X3, -16(AX)(R8*1) | ||
239 | MOVQ BX, AX | ||
240 | |||
241 | emit_literal_done_repeat_emit_encodeBlockAsm: | ||
242 | ADDL $0x05, CX | ||
243 | MOVL CX, BX | ||
244 | SUBL 16(SP), BX | ||
245 | MOVQ src_len+32(FP), R8 | ||
246 | SUBL CX, R8 | ||
247 | LEAQ (DX)(CX*1), R9 | ||
248 | LEAQ (DX)(BX*1), BX | ||
249 | |||
250 | // matchLen | ||
251 | XORL R11, R11 | ||
252 | |||
253 | matchlen_loopback_16_repeat_extend_encodeBlockAsm: | ||
254 | CMPL R8, $0x10 | ||
255 | JB matchlen_match8_repeat_extend_encodeBlockAsm | ||
256 | MOVQ (R9)(R11*1), R10 | ||
257 | MOVQ 8(R9)(R11*1), R12 | ||
258 | XORQ (BX)(R11*1), R10 | ||
259 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm | ||
260 | XORQ 8(BX)(R11*1), R12 | ||
261 | JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm | ||
262 | LEAL -16(R8), R8 | ||
263 | LEAL 16(R11), R11 | ||
264 | JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm | ||
265 | |||
266 | matchlen_bsf_16repeat_extend_encodeBlockAsm: | ||
267 | #ifdef GOAMD64_v3 | ||
268 | TZCNTQ R12, R12 | ||
269 | |||
270 | #else | ||
271 | BSFQ R12, R12 | ||
272 | |||
273 | #endif | ||
274 | SARQ $0x03, R12 | ||
275 | LEAL 8(R11)(R12*1), R11 | ||
276 | JMP repeat_extend_forward_end_encodeBlockAsm | ||
277 | |||
278 | matchlen_match8_repeat_extend_encodeBlockAsm: | ||
279 | CMPL R8, $0x08 | ||
280 | JB matchlen_match4_repeat_extend_encodeBlockAsm | ||
281 | MOVQ (R9)(R11*1), R10 | ||
282 | XORQ (BX)(R11*1), R10 | ||
283 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm | ||
284 | LEAL -8(R8), R8 | ||
285 | LEAL 8(R11), R11 | ||
286 | JMP matchlen_match4_repeat_extend_encodeBlockAsm | ||
287 | |||
288 | matchlen_bsf_8_repeat_extend_encodeBlockAsm: | ||
289 | #ifdef GOAMD64_v3 | ||
290 | TZCNTQ R10, R10 | ||
291 | |||
292 | #else | ||
293 | BSFQ R10, R10 | ||
294 | |||
295 | #endif | ||
296 | SARQ $0x03, R10 | ||
297 | LEAL (R11)(R10*1), R11 | ||
298 | JMP repeat_extend_forward_end_encodeBlockAsm | ||
299 | |||
300 | matchlen_match4_repeat_extend_encodeBlockAsm: | ||
301 | CMPL R8, $0x04 | ||
302 | JB matchlen_match2_repeat_extend_encodeBlockAsm | ||
303 | MOVL (R9)(R11*1), R10 | ||
304 | CMPL (BX)(R11*1), R10 | ||
305 | JNE matchlen_match2_repeat_extend_encodeBlockAsm | ||
306 | LEAL -4(R8), R8 | ||
307 | LEAL 4(R11), R11 | ||
308 | |||
309 | matchlen_match2_repeat_extend_encodeBlockAsm: | ||
310 | CMPL R8, $0x01 | ||
311 | JE matchlen_match1_repeat_extend_encodeBlockAsm | ||
312 | JB repeat_extend_forward_end_encodeBlockAsm | ||
313 | MOVW (R9)(R11*1), R10 | ||
314 | CMPW (BX)(R11*1), R10 | ||
315 | JNE matchlen_match1_repeat_extend_encodeBlockAsm | ||
316 | LEAL 2(R11), R11 | ||
317 | SUBL $0x02, R8 | ||
318 | JZ repeat_extend_forward_end_encodeBlockAsm | ||
319 | |||
320 | matchlen_match1_repeat_extend_encodeBlockAsm: | ||
321 | MOVB (R9)(R11*1), R10 | ||
322 | CMPB (BX)(R11*1), R10 | ||
323 | JNE repeat_extend_forward_end_encodeBlockAsm | ||
324 | LEAL 1(R11), R11 | ||
325 | |||
326 | repeat_extend_forward_end_encodeBlockAsm: | ||
327 | ADDL R11, CX | ||
328 | MOVL CX, BX | ||
329 | SUBL SI, BX | ||
330 | MOVL 16(SP), SI | ||
331 | TESTL DI, DI | ||
332 | JZ repeat_as_copy_encodeBlockAsm | ||
333 | |||
334 | // emitRepeat | ||
335 | emit_repeat_again_match_repeat_encodeBlockAsm: | ||
336 | MOVL BX, DI | ||
337 | LEAL -4(BX), BX | ||
338 | CMPL DI, $0x08 | ||
339 | JBE repeat_two_match_repeat_encodeBlockAsm | ||
340 | CMPL DI, $0x0c | ||
341 | JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm | ||
342 | CMPL SI, $0x00000800 | ||
343 | JB repeat_two_offset_match_repeat_encodeBlockAsm | ||
344 | |||
345 | cant_repeat_two_offset_match_repeat_encodeBlockAsm: | ||
346 | CMPL BX, $0x00000104 | ||
347 | JB repeat_three_match_repeat_encodeBlockAsm | ||
348 | CMPL BX, $0x00010100 | ||
349 | JB repeat_four_match_repeat_encodeBlockAsm | ||
350 | CMPL BX, $0x0100ffff | ||
351 | JB repeat_five_match_repeat_encodeBlockAsm | ||
352 | LEAL -16842747(BX), BX | ||
353 | MOVL $0xfffb001d, (AX) | ||
354 | MOVB $0xff, 4(AX) | ||
355 | ADDQ $0x05, AX | ||
356 | JMP emit_repeat_again_match_repeat_encodeBlockAsm | ||
357 | |||
358 | repeat_five_match_repeat_encodeBlockAsm: | ||
359 | LEAL -65536(BX), BX | ||
360 | MOVL BX, SI | ||
361 | MOVW $0x001d, (AX) | ||
362 | MOVW BX, 2(AX) | ||
363 | SARL $0x10, SI | ||
364 | MOVB SI, 4(AX) | ||
365 | ADDQ $0x05, AX | ||
366 | JMP repeat_end_emit_encodeBlockAsm | ||
367 | |||
368 | repeat_four_match_repeat_encodeBlockAsm: | ||
369 | LEAL -256(BX), BX | ||
370 | MOVW $0x0019, (AX) | ||
371 | MOVW BX, 2(AX) | ||
372 | ADDQ $0x04, AX | ||
373 | JMP repeat_end_emit_encodeBlockAsm | ||
374 | |||
375 | repeat_three_match_repeat_encodeBlockAsm: | ||
376 | LEAL -4(BX), BX | ||
377 | MOVW $0x0015, (AX) | ||
378 | MOVB BL, 2(AX) | ||
379 | ADDQ $0x03, AX | ||
380 | JMP repeat_end_emit_encodeBlockAsm | ||
381 | |||
382 | repeat_two_match_repeat_encodeBlockAsm: | ||
383 | SHLL $0x02, BX | ||
384 | ORL $0x01, BX | ||
385 | MOVW BX, (AX) | ||
386 | ADDQ $0x02, AX | ||
387 | JMP repeat_end_emit_encodeBlockAsm | ||
388 | |||
389 | repeat_two_offset_match_repeat_encodeBlockAsm: | ||
390 | XORQ DI, DI | ||
391 | LEAL 1(DI)(BX*4), BX | ||
392 | MOVB SI, 1(AX) | ||
393 | SARL $0x08, SI | ||
394 | SHLL $0x05, SI | ||
395 | ORL SI, BX | ||
396 | MOVB BL, (AX) | ||
397 | ADDQ $0x02, AX | ||
398 | JMP repeat_end_emit_encodeBlockAsm | ||
399 | |||
400 | repeat_as_copy_encodeBlockAsm: | ||
401 | // emitCopy | ||
402 | CMPL SI, $0x00010000 | ||
403 | JB two_byte_offset_repeat_as_copy_encodeBlockAsm | ||
404 | CMPL BX, $0x40 | ||
405 | JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm | ||
406 | MOVB $0xff, (AX) | ||
407 | MOVL SI, 1(AX) | ||
408 | LEAL -64(BX), BX | ||
409 | ADDQ $0x05, AX | ||
410 | CMPL BX, $0x04 | ||
411 | JB four_bytes_remain_repeat_as_copy_encodeBlockAsm | ||
412 | |||
413 | // emitRepeat | ||
414 | emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: | ||
415 | MOVL BX, DI | ||
416 | LEAL -4(BX), BX | ||
417 | CMPL DI, $0x08 | ||
418 | JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy | ||
419 | CMPL DI, $0x0c | ||
420 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy | ||
421 | CMPL SI, $0x00000800 | ||
422 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy | ||
423 | |||
424 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: | ||
425 | CMPL BX, $0x00000104 | ||
426 | JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy | ||
427 | CMPL BX, $0x00010100 | ||
428 | JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy | ||
429 | CMPL BX, $0x0100ffff | ||
430 | JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy | ||
431 | LEAL -16842747(BX), BX | ||
432 | MOVL $0xfffb001d, (AX) | ||
433 | MOVB $0xff, 4(AX) | ||
434 | ADDQ $0x05, AX | ||
435 | JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy | ||
436 | |||
437 | repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: | ||
438 | LEAL -65536(BX), BX | ||
439 | MOVL BX, SI | ||
440 | MOVW $0x001d, (AX) | ||
441 | MOVW BX, 2(AX) | ||
442 | SARL $0x10, SI | ||
443 | MOVB SI, 4(AX) | ||
444 | ADDQ $0x05, AX | ||
445 | JMP repeat_end_emit_encodeBlockAsm | ||
446 | |||
447 | repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: | ||
448 | LEAL -256(BX), BX | ||
449 | MOVW $0x0019, (AX) | ||
450 | MOVW BX, 2(AX) | ||
451 | ADDQ $0x04, AX | ||
452 | JMP repeat_end_emit_encodeBlockAsm | ||
453 | |||
454 | repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: | ||
455 | LEAL -4(BX), BX | ||
456 | MOVW $0x0015, (AX) | ||
457 | MOVB BL, 2(AX) | ||
458 | ADDQ $0x03, AX | ||
459 | JMP repeat_end_emit_encodeBlockAsm | ||
460 | |||
461 | repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: | ||
462 | SHLL $0x02, BX | ||
463 | ORL $0x01, BX | ||
464 | MOVW BX, (AX) | ||
465 | ADDQ $0x02, AX | ||
466 | JMP repeat_end_emit_encodeBlockAsm | ||
467 | |||
468 | repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: | ||
469 | XORQ DI, DI | ||
470 | LEAL 1(DI)(BX*4), BX | ||
471 | MOVB SI, 1(AX) | ||
472 | SARL $0x08, SI | ||
473 | SHLL $0x05, SI | ||
474 | ORL SI, BX | ||
475 | MOVB BL, (AX) | ||
476 | ADDQ $0x02, AX | ||
477 | JMP repeat_end_emit_encodeBlockAsm | ||
478 | |||
479 | four_bytes_remain_repeat_as_copy_encodeBlockAsm: | ||
480 | TESTL BX, BX | ||
481 | JZ repeat_end_emit_encodeBlockAsm | ||
482 | XORL DI, DI | ||
483 | LEAL -1(DI)(BX*4), BX | ||
484 | MOVB BL, (AX) | ||
485 | MOVL SI, 1(AX) | ||
486 | ADDQ $0x05, AX | ||
487 | JMP repeat_end_emit_encodeBlockAsm | ||
488 | |||
489 | two_byte_offset_repeat_as_copy_encodeBlockAsm: | ||
490 | CMPL BX, $0x40 | ||
491 | JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm | ||
492 | CMPL SI, $0x00000800 | ||
493 | JAE long_offset_short_repeat_as_copy_encodeBlockAsm | ||
494 | MOVL $0x00000001, DI | ||
495 | LEAL 16(DI), DI | ||
496 | MOVB SI, 1(AX) | ||
497 | MOVL SI, R8 | ||
498 | SHRL $0x08, R8 | ||
499 | SHLL $0x05, R8 | ||
500 | ORL R8, DI | ||
501 | MOVB DI, (AX) | ||
502 | ADDQ $0x02, AX | ||
503 | SUBL $0x08, BX | ||
504 | |||
505 | // emitRepeat | ||
506 | LEAL -4(BX), BX | ||
507 | JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
508 | |||
509 | emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: | ||
510 | MOVL BX, DI | ||
511 | LEAL -4(BX), BX | ||
512 | CMPL DI, $0x08 | ||
513 | JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
514 | CMPL DI, $0x0c | ||
515 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
516 | CMPL SI, $0x00000800 | ||
517 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
518 | |||
519 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: | ||
520 | CMPL BX, $0x00000104 | ||
521 | JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
522 | CMPL BX, $0x00010100 | ||
523 | JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
524 | CMPL BX, $0x0100ffff | ||
525 | JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
526 | LEAL -16842747(BX), BX | ||
527 | MOVL $0xfffb001d, (AX) | ||
528 | MOVB $0xff, 4(AX) | ||
529 | ADDQ $0x05, AX | ||
530 | JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b | ||
531 | |||
532 | repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: | ||
533 | LEAL -65536(BX), BX | ||
534 | MOVL BX, SI | ||
535 | MOVW $0x001d, (AX) | ||
536 | MOVW BX, 2(AX) | ||
537 | SARL $0x10, SI | ||
538 | MOVB SI, 4(AX) | ||
539 | ADDQ $0x05, AX | ||
540 | JMP repeat_end_emit_encodeBlockAsm | ||
541 | |||
542 | repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: | ||
543 | LEAL -256(BX), BX | ||
544 | MOVW $0x0019, (AX) | ||
545 | MOVW BX, 2(AX) | ||
546 | ADDQ $0x04, AX | ||
547 | JMP repeat_end_emit_encodeBlockAsm | ||
548 | |||
549 | repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: | ||
550 | LEAL -4(BX), BX | ||
551 | MOVW $0x0015, (AX) | ||
552 | MOVB BL, 2(AX) | ||
553 | ADDQ $0x03, AX | ||
554 | JMP repeat_end_emit_encodeBlockAsm | ||
555 | |||
556 | repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: | ||
557 | SHLL $0x02, BX | ||
558 | ORL $0x01, BX | ||
559 | MOVW BX, (AX) | ||
560 | ADDQ $0x02, AX | ||
561 | JMP repeat_end_emit_encodeBlockAsm | ||
562 | |||
563 | repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: | ||
564 | XORQ DI, DI | ||
565 | LEAL 1(DI)(BX*4), BX | ||
566 | MOVB SI, 1(AX) | ||
567 | SARL $0x08, SI | ||
568 | SHLL $0x05, SI | ||
569 | ORL SI, BX | ||
570 | MOVB BL, (AX) | ||
571 | ADDQ $0x02, AX | ||
572 | JMP repeat_end_emit_encodeBlockAsm | ||
573 | |||
574 | long_offset_short_repeat_as_copy_encodeBlockAsm: | ||
575 | MOVB $0xee, (AX) | ||
576 | MOVW SI, 1(AX) | ||
577 | LEAL -60(BX), BX | ||
578 | ADDQ $0x03, AX | ||
579 | |||
580 | // emitRepeat | ||
581 | emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: | ||
582 | MOVL BX, DI | ||
583 | LEAL -4(BX), BX | ||
584 | CMPL DI, $0x08 | ||
585 | JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short | ||
586 | CMPL DI, $0x0c | ||
587 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short | ||
588 | CMPL SI, $0x00000800 | ||
589 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short | ||
590 | |||
591 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: | ||
592 | CMPL BX, $0x00000104 | ||
593 | JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short | ||
594 | CMPL BX, $0x00010100 | ||
595 | JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short | ||
596 | CMPL BX, $0x0100ffff | ||
597 | JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short | ||
598 | LEAL -16842747(BX), BX | ||
599 | MOVL $0xfffb001d, (AX) | ||
600 | MOVB $0xff, 4(AX) | ||
601 | ADDQ $0x05, AX | ||
602 | JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short | ||
603 | |||
604 | repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: | ||
605 | LEAL -65536(BX), BX | ||
606 | MOVL BX, SI | ||
607 | MOVW $0x001d, (AX) | ||
608 | MOVW BX, 2(AX) | ||
609 | SARL $0x10, SI | ||
610 | MOVB SI, 4(AX) | ||
611 | ADDQ $0x05, AX | ||
612 | JMP repeat_end_emit_encodeBlockAsm | ||
613 | |||
614 | repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: | ||
615 | LEAL -256(BX), BX | ||
616 | MOVW $0x0019, (AX) | ||
617 | MOVW BX, 2(AX) | ||
618 | ADDQ $0x04, AX | ||
619 | JMP repeat_end_emit_encodeBlockAsm | ||
620 | |||
621 | repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: | ||
622 | LEAL -4(BX), BX | ||
623 | MOVW $0x0015, (AX) | ||
624 | MOVB BL, 2(AX) | ||
625 | ADDQ $0x03, AX | ||
626 | JMP repeat_end_emit_encodeBlockAsm | ||
627 | |||
628 | repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: | ||
629 | SHLL $0x02, BX | ||
630 | ORL $0x01, BX | ||
631 | MOVW BX, (AX) | ||
632 | ADDQ $0x02, AX | ||
633 | JMP repeat_end_emit_encodeBlockAsm | ||
634 | |||
635 | repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: | ||
636 | XORQ DI, DI | ||
637 | LEAL 1(DI)(BX*4), BX | ||
638 | MOVB SI, 1(AX) | ||
639 | SARL $0x08, SI | ||
640 | SHLL $0x05, SI | ||
641 | ORL SI, BX | ||
642 | MOVB BL, (AX) | ||
643 | ADDQ $0x02, AX | ||
644 | JMP repeat_end_emit_encodeBlockAsm | ||
645 | |||
646 | two_byte_offset_short_repeat_as_copy_encodeBlockAsm: | ||
647 | MOVL BX, DI | ||
648 | SHLL $0x02, DI | ||
649 | CMPL BX, $0x0c | ||
650 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm | ||
651 | CMPL SI, $0x00000800 | ||
652 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm | ||
653 | LEAL -15(DI), DI | ||
654 | MOVB SI, 1(AX) | ||
655 | SHRL $0x08, SI | ||
656 | SHLL $0x05, SI | ||
657 | ORL SI, DI | ||
658 | MOVB DI, (AX) | ||
659 | ADDQ $0x02, AX | ||
660 | JMP repeat_end_emit_encodeBlockAsm | ||
661 | |||
662 | emit_copy_three_repeat_as_copy_encodeBlockAsm: | ||
663 | LEAL -2(DI), DI | ||
664 | MOVB DI, (AX) | ||
665 | MOVW SI, 1(AX) | ||
666 | ADDQ $0x03, AX | ||
667 | |||
668 | repeat_end_emit_encodeBlockAsm: | ||
669 | MOVL CX, 12(SP) | ||
670 | JMP search_loop_encodeBlockAsm | ||
671 | |||
672 | no_repeat_found_encodeBlockAsm: | ||
673 | CMPL (DX)(BX*1), SI | ||
674 | JEQ candidate_match_encodeBlockAsm | ||
675 | SHRQ $0x08, SI | ||
676 | MOVL 24(SP)(R9*4), BX | ||
677 | LEAL 2(CX), R8 | ||
678 | CMPL (DX)(DI*1), SI | ||
679 | JEQ candidate2_match_encodeBlockAsm | ||
680 | MOVL R8, 24(SP)(R9*4) | ||
681 | SHRQ $0x08, SI | ||
682 | CMPL (DX)(BX*1), SI | ||
683 | JEQ candidate3_match_encodeBlockAsm | ||
684 | MOVL 20(SP), CX | ||
685 | JMP search_loop_encodeBlockAsm | ||
686 | |||
687 | candidate3_match_encodeBlockAsm: | ||
688 | ADDL $0x02, CX | ||
689 | JMP candidate_match_encodeBlockAsm | ||
690 | |||
691 | candidate2_match_encodeBlockAsm: | ||
692 | MOVL R8, 24(SP)(R9*4) | ||
693 | INCL CX | ||
694 | MOVL DI, BX | ||
695 | |||
696 | candidate_match_encodeBlockAsm: | ||
697 | MOVL 12(SP), SI | ||
698 | TESTL BX, BX | ||
699 | JZ match_extend_back_end_encodeBlockAsm | ||
700 | |||
701 | match_extend_back_loop_encodeBlockAsm: | ||
702 | CMPL CX, SI | ||
703 | JBE match_extend_back_end_encodeBlockAsm | ||
704 | MOVB -1(DX)(BX*1), DI | ||
705 | MOVB -1(DX)(CX*1), R8 | ||
706 | CMPB DI, R8 | ||
707 | JNE match_extend_back_end_encodeBlockAsm | ||
708 | LEAL -1(CX), CX | ||
709 | DECL BX | ||
710 | JZ match_extend_back_end_encodeBlockAsm | ||
711 | JMP match_extend_back_loop_encodeBlockAsm | ||
712 | |||
713 | match_extend_back_end_encodeBlockAsm: | ||
714 | MOVL CX, SI | ||
715 | SUBL 12(SP), SI | ||
716 | LEAQ 5(AX)(SI*1), SI | ||
717 | CMPQ SI, (SP) | ||
718 | JB match_dst_size_check_encodeBlockAsm | ||
719 | MOVQ $0x00000000, ret+48(FP) | ||
720 | RET | ||
721 | |||
722 | match_dst_size_check_encodeBlockAsm: | ||
723 | MOVL CX, SI | ||
724 | MOVL 12(SP), DI | ||
725 | CMPL DI, SI | ||
726 | JEQ emit_literal_done_match_emit_encodeBlockAsm | ||
727 | MOVL SI, R8 | ||
728 | MOVL SI, 12(SP) | ||
729 | LEAQ (DX)(DI*1), SI | ||
730 | SUBL DI, R8 | ||
731 | LEAL -1(R8), DI | ||
732 | CMPL DI, $0x3c | ||
733 | JB one_byte_match_emit_encodeBlockAsm | ||
734 | CMPL DI, $0x00000100 | ||
735 | JB two_bytes_match_emit_encodeBlockAsm | ||
736 | CMPL DI, $0x00010000 | ||
737 | JB three_bytes_match_emit_encodeBlockAsm | ||
738 | CMPL DI, $0x01000000 | ||
739 | JB four_bytes_match_emit_encodeBlockAsm | ||
740 | MOVB $0xfc, (AX) | ||
741 | MOVL DI, 1(AX) | ||
742 | ADDQ $0x05, AX | ||
743 | JMP memmove_long_match_emit_encodeBlockAsm | ||
744 | |||
745 | four_bytes_match_emit_encodeBlockAsm: | ||
746 | MOVL DI, R9 | ||
747 | SHRL $0x10, R9 | ||
748 | MOVB $0xf8, (AX) | ||
749 | MOVW DI, 1(AX) | ||
750 | MOVB R9, 3(AX) | ||
751 | ADDQ $0x04, AX | ||
752 | JMP memmove_long_match_emit_encodeBlockAsm | ||
753 | |||
754 | three_bytes_match_emit_encodeBlockAsm: | ||
755 | MOVB $0xf4, (AX) | ||
756 | MOVW DI, 1(AX) | ||
757 | ADDQ $0x03, AX | ||
758 | JMP memmove_long_match_emit_encodeBlockAsm | ||
759 | |||
760 | two_bytes_match_emit_encodeBlockAsm: | ||
761 | MOVB $0xf0, (AX) | ||
762 | MOVB DI, 1(AX) | ||
763 | ADDQ $0x02, AX | ||
764 | CMPL DI, $0x40 | ||
765 | JB memmove_match_emit_encodeBlockAsm | ||
766 | JMP memmove_long_match_emit_encodeBlockAsm | ||
767 | |||
768 | one_byte_match_emit_encodeBlockAsm: | ||
769 | SHLB $0x02, DI | ||
770 | MOVB DI, (AX) | ||
771 | ADDQ $0x01, AX | ||
772 | |||
773 | memmove_match_emit_encodeBlockAsm: | ||
774 | LEAQ (AX)(R8*1), DI | ||
775 | |||
776 | // genMemMoveShort | ||
777 | CMPQ R8, $0x08 | ||
778 | JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 | ||
779 | CMPQ R8, $0x10 | ||
780 | JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 | ||
781 | CMPQ R8, $0x20 | ||
782 | JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 | ||
783 | JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 | ||
784 | |||
785 | emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: | ||
786 | MOVQ (SI), R9 | ||
787 | MOVQ R9, (AX) | ||
788 | JMP memmove_end_copy_match_emit_encodeBlockAsm | ||
789 | |||
790 | emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: | ||
791 | MOVQ (SI), R9 | ||
792 | MOVQ -8(SI)(R8*1), SI | ||
793 | MOVQ R9, (AX) | ||
794 | MOVQ SI, -8(AX)(R8*1) | ||
795 | JMP memmove_end_copy_match_emit_encodeBlockAsm | ||
796 | |||
797 | emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: | ||
798 | MOVOU (SI), X0 | ||
799 | MOVOU -16(SI)(R8*1), X1 | ||
800 | MOVOU X0, (AX) | ||
801 | MOVOU X1, -16(AX)(R8*1) | ||
802 | JMP memmove_end_copy_match_emit_encodeBlockAsm | ||
803 | |||
804 | emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: | ||
805 | MOVOU (SI), X0 | ||
806 | MOVOU 16(SI), X1 | ||
807 | MOVOU -32(SI)(R8*1), X2 | ||
808 | MOVOU -16(SI)(R8*1), X3 | ||
809 | MOVOU X0, (AX) | ||
810 | MOVOU X1, 16(AX) | ||
811 | MOVOU X2, -32(AX)(R8*1) | ||
812 | MOVOU X3, -16(AX)(R8*1) | ||
813 | |||
814 | memmove_end_copy_match_emit_encodeBlockAsm: | ||
815 | MOVQ DI, AX | ||
816 | JMP emit_literal_done_match_emit_encodeBlockAsm | ||
817 | |||
818 | memmove_long_match_emit_encodeBlockAsm: | ||
819 | LEAQ (AX)(R8*1), DI | ||
820 | |||
821 | // genMemMoveLong | ||
822 | MOVOU (SI), X0 | ||
823 | MOVOU 16(SI), X1 | ||
824 | MOVOU -32(SI)(R8*1), X2 | ||
825 | MOVOU -16(SI)(R8*1), X3 | ||
826 | MOVQ R8, R10 | ||
827 | SHRQ $0x05, R10 | ||
828 | MOVQ AX, R9 | ||
829 | ANDL $0x0000001f, R9 | ||
830 | MOVQ $0x00000040, R11 | ||
831 | SUBQ R9, R11 | ||
832 | DECQ R10 | ||
833 | JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 | ||
834 | LEAQ -32(SI)(R11*1), R9 | ||
835 | LEAQ -32(AX)(R11*1), R12 | ||
836 | |||
837 | emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: | ||
838 | MOVOU (R9), X4 | ||
839 | MOVOU 16(R9), X5 | ||
840 | MOVOA X4, (R12) | ||
841 | MOVOA X5, 16(R12) | ||
842 | ADDQ $0x20, R12 | ||
843 | ADDQ $0x20, R9 | ||
844 | ADDQ $0x20, R11 | ||
845 | DECQ R10 | ||
846 | JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back | ||
847 | |||
848 | emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: | ||
849 | MOVOU -32(SI)(R11*1), X4 | ||
850 | MOVOU -16(SI)(R11*1), X5 | ||
851 | MOVOA X4, -32(AX)(R11*1) | ||
852 | MOVOA X5, -16(AX)(R11*1) | ||
853 | ADDQ $0x20, R11 | ||
854 | CMPQ R8, R11 | ||
855 | JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 | ||
856 | MOVOU X0, (AX) | ||
857 | MOVOU X1, 16(AX) | ||
858 | MOVOU X2, -32(AX)(R8*1) | ||
859 | MOVOU X3, -16(AX)(R8*1) | ||
860 | MOVQ DI, AX | ||
861 | |||
862 | emit_literal_done_match_emit_encodeBlockAsm: | ||
863 | match_nolit_loop_encodeBlockAsm: | ||
864 | MOVL CX, SI | ||
865 | SUBL BX, SI | ||
866 | MOVL SI, 16(SP) | ||
867 | ADDL $0x04, CX | ||
868 | ADDL $0x04, BX | ||
869 | MOVQ src_len+32(FP), SI | ||
870 | SUBL CX, SI | ||
871 | LEAQ (DX)(CX*1), DI | ||
872 | LEAQ (DX)(BX*1), BX | ||
873 | |||
874 | // matchLen | ||
875 | XORL R9, R9 | ||
876 | |||
877 | matchlen_loopback_16_match_nolit_encodeBlockAsm: | ||
878 | CMPL SI, $0x10 | ||
879 | JB matchlen_match8_match_nolit_encodeBlockAsm | ||
880 | MOVQ (DI)(R9*1), R8 | ||
881 | MOVQ 8(DI)(R9*1), R10 | ||
882 | XORQ (BX)(R9*1), R8 | ||
883 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm | ||
884 | XORQ 8(BX)(R9*1), R10 | ||
885 | JNZ matchlen_bsf_16match_nolit_encodeBlockAsm | ||
886 | LEAL -16(SI), SI | ||
887 | LEAL 16(R9), R9 | ||
888 | JMP matchlen_loopback_16_match_nolit_encodeBlockAsm | ||
889 | |||
890 | matchlen_bsf_16match_nolit_encodeBlockAsm: | ||
891 | #ifdef GOAMD64_v3 | ||
892 | TZCNTQ R10, R10 | ||
893 | |||
894 | #else | ||
895 | BSFQ R10, R10 | ||
896 | |||
897 | #endif | ||
898 | SARQ $0x03, R10 | ||
899 | LEAL 8(R9)(R10*1), R9 | ||
900 | JMP match_nolit_end_encodeBlockAsm | ||
901 | |||
902 | matchlen_match8_match_nolit_encodeBlockAsm: | ||
903 | CMPL SI, $0x08 | ||
904 | JB matchlen_match4_match_nolit_encodeBlockAsm | ||
905 | MOVQ (DI)(R9*1), R8 | ||
906 | XORQ (BX)(R9*1), R8 | ||
907 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm | ||
908 | LEAL -8(SI), SI | ||
909 | LEAL 8(R9), R9 | ||
910 | JMP matchlen_match4_match_nolit_encodeBlockAsm | ||
911 | |||
912 | matchlen_bsf_8_match_nolit_encodeBlockAsm: | ||
913 | #ifdef GOAMD64_v3 | ||
914 | TZCNTQ R8, R8 | ||
915 | |||
916 | #else | ||
917 | BSFQ R8, R8 | ||
918 | |||
919 | #endif | ||
920 | SARQ $0x03, R8 | ||
921 | LEAL (R9)(R8*1), R9 | ||
922 | JMP match_nolit_end_encodeBlockAsm | ||
923 | |||
924 | matchlen_match4_match_nolit_encodeBlockAsm: | ||
925 | CMPL SI, $0x04 | ||
926 | JB matchlen_match2_match_nolit_encodeBlockAsm | ||
927 | MOVL (DI)(R9*1), R8 | ||
928 | CMPL (BX)(R9*1), R8 | ||
929 | JNE matchlen_match2_match_nolit_encodeBlockAsm | ||
930 | LEAL -4(SI), SI | ||
931 | LEAL 4(R9), R9 | ||
932 | |||
933 | matchlen_match2_match_nolit_encodeBlockAsm: | ||
934 | CMPL SI, $0x01 | ||
935 | JE matchlen_match1_match_nolit_encodeBlockAsm | ||
936 | JB match_nolit_end_encodeBlockAsm | ||
937 | MOVW (DI)(R9*1), R8 | ||
938 | CMPW (BX)(R9*1), R8 | ||
939 | JNE matchlen_match1_match_nolit_encodeBlockAsm | ||
940 | LEAL 2(R9), R9 | ||
941 | SUBL $0x02, SI | ||
942 | JZ match_nolit_end_encodeBlockAsm | ||
943 | |||
944 | matchlen_match1_match_nolit_encodeBlockAsm: | ||
945 | MOVB (DI)(R9*1), R8 | ||
946 | CMPB (BX)(R9*1), R8 | ||
947 | JNE match_nolit_end_encodeBlockAsm | ||
948 | LEAL 1(R9), R9 | ||
949 | |||
950 | match_nolit_end_encodeBlockAsm: | ||
951 | ADDL R9, CX | ||
952 | MOVL 16(SP), BX | ||
953 | ADDL $0x04, R9 | ||
954 | MOVL CX, 12(SP) | ||
955 | |||
956 | // emitCopy | ||
957 | CMPL BX, $0x00010000 | ||
958 | JB two_byte_offset_match_nolit_encodeBlockAsm | ||
959 | CMPL R9, $0x40 | ||
960 | JBE four_bytes_remain_match_nolit_encodeBlockAsm | ||
961 | MOVB $0xff, (AX) | ||
962 | MOVL BX, 1(AX) | ||
963 | LEAL -64(R9), R9 | ||
964 | ADDQ $0x05, AX | ||
965 | CMPL R9, $0x04 | ||
966 | JB four_bytes_remain_match_nolit_encodeBlockAsm | ||
967 | |||
968 | // emitRepeat | ||
969 | emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: | ||
970 | MOVL R9, SI | ||
971 | LEAL -4(R9), R9 | ||
972 | CMPL SI, $0x08 | ||
973 | JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy | ||
974 | CMPL SI, $0x0c | ||
975 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy | ||
976 | CMPL BX, $0x00000800 | ||
977 | JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy | ||
978 | |||
979 | cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: | ||
980 | CMPL R9, $0x00000104 | ||
981 | JB repeat_three_match_nolit_encodeBlockAsm_emit_copy | ||
982 | CMPL R9, $0x00010100 | ||
983 | JB repeat_four_match_nolit_encodeBlockAsm_emit_copy | ||
984 | CMPL R9, $0x0100ffff | ||
985 | JB repeat_five_match_nolit_encodeBlockAsm_emit_copy | ||
986 | LEAL -16842747(R9), R9 | ||
987 | MOVL $0xfffb001d, (AX) | ||
988 | MOVB $0xff, 4(AX) | ||
989 | ADDQ $0x05, AX | ||
990 | JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy | ||
991 | |||
992 | repeat_five_match_nolit_encodeBlockAsm_emit_copy: | ||
993 | LEAL -65536(R9), R9 | ||
994 | MOVL R9, BX | ||
995 | MOVW $0x001d, (AX) | ||
996 | MOVW R9, 2(AX) | ||
997 | SARL $0x10, BX | ||
998 | MOVB BL, 4(AX) | ||
999 | ADDQ $0x05, AX | ||
1000 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1001 | |||
1002 | repeat_four_match_nolit_encodeBlockAsm_emit_copy: | ||
1003 | LEAL -256(R9), R9 | ||
1004 | MOVW $0x0019, (AX) | ||
1005 | MOVW R9, 2(AX) | ||
1006 | ADDQ $0x04, AX | ||
1007 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1008 | |||
1009 | repeat_three_match_nolit_encodeBlockAsm_emit_copy: | ||
1010 | LEAL -4(R9), R9 | ||
1011 | MOVW $0x0015, (AX) | ||
1012 | MOVB R9, 2(AX) | ||
1013 | ADDQ $0x03, AX | ||
1014 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1015 | |||
1016 | repeat_two_match_nolit_encodeBlockAsm_emit_copy: | ||
1017 | SHLL $0x02, R9 | ||
1018 | ORL $0x01, R9 | ||
1019 | MOVW R9, (AX) | ||
1020 | ADDQ $0x02, AX | ||
1021 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1022 | |||
1023 | repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: | ||
1024 | XORQ SI, SI | ||
1025 | LEAL 1(SI)(R9*4), R9 | ||
1026 | MOVB BL, 1(AX) | ||
1027 | SARL $0x08, BX | ||
1028 | SHLL $0x05, BX | ||
1029 | ORL BX, R9 | ||
1030 | MOVB R9, (AX) | ||
1031 | ADDQ $0x02, AX | ||
1032 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1033 | |||
1034 | four_bytes_remain_match_nolit_encodeBlockAsm: | ||
1035 | TESTL R9, R9 | ||
1036 | JZ match_nolit_emitcopy_end_encodeBlockAsm | ||
1037 | XORL SI, SI | ||
1038 | LEAL -1(SI)(R9*4), R9 | ||
1039 | MOVB R9, (AX) | ||
1040 | MOVL BX, 1(AX) | ||
1041 | ADDQ $0x05, AX | ||
1042 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1043 | |||
1044 | two_byte_offset_match_nolit_encodeBlockAsm: | ||
1045 | CMPL R9, $0x40 | ||
1046 | JBE two_byte_offset_short_match_nolit_encodeBlockAsm | ||
1047 | CMPL BX, $0x00000800 | ||
1048 | JAE long_offset_short_match_nolit_encodeBlockAsm | ||
1049 | MOVL $0x00000001, SI | ||
1050 | LEAL 16(SI), SI | ||
1051 | MOVB BL, 1(AX) | ||
1052 | MOVL BX, DI | ||
1053 | SHRL $0x08, DI | ||
1054 | SHLL $0x05, DI | ||
1055 | ORL DI, SI | ||
1056 | MOVB SI, (AX) | ||
1057 | ADDQ $0x02, AX | ||
1058 | SUBL $0x08, R9 | ||
1059 | |||
1060 | // emitRepeat | ||
1061 | LEAL -4(R9), R9 | ||
1062 | JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1063 | |||
1064 | emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: | ||
1065 | MOVL R9, SI | ||
1066 | LEAL -4(R9), R9 | ||
1067 | CMPL SI, $0x08 | ||
1068 | JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1069 | CMPL SI, $0x0c | ||
1070 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1071 | CMPL BX, $0x00000800 | ||
1072 | JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1073 | |||
1074 | cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: | ||
1075 | CMPL R9, $0x00000104 | ||
1076 | JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1077 | CMPL R9, $0x00010100 | ||
1078 | JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1079 | CMPL R9, $0x0100ffff | ||
1080 | JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1081 | LEAL -16842747(R9), R9 | ||
1082 | MOVL $0xfffb001d, (AX) | ||
1083 | MOVB $0xff, 4(AX) | ||
1084 | ADDQ $0x05, AX | ||
1085 | JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b | ||
1086 | |||
1087 | repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: | ||
1088 | LEAL -65536(R9), R9 | ||
1089 | MOVL R9, BX | ||
1090 | MOVW $0x001d, (AX) | ||
1091 | MOVW R9, 2(AX) | ||
1092 | SARL $0x10, BX | ||
1093 | MOVB BL, 4(AX) | ||
1094 | ADDQ $0x05, AX | ||
1095 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1096 | |||
1097 | repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: | ||
1098 | LEAL -256(R9), R9 | ||
1099 | MOVW $0x0019, (AX) | ||
1100 | MOVW R9, 2(AX) | ||
1101 | ADDQ $0x04, AX | ||
1102 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1103 | |||
1104 | repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: | ||
1105 | LEAL -4(R9), R9 | ||
1106 | MOVW $0x0015, (AX) | ||
1107 | MOVB R9, 2(AX) | ||
1108 | ADDQ $0x03, AX | ||
1109 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1110 | |||
1111 | repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: | ||
1112 | SHLL $0x02, R9 | ||
1113 | ORL $0x01, R9 | ||
1114 | MOVW R9, (AX) | ||
1115 | ADDQ $0x02, AX | ||
1116 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1117 | |||
1118 | repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: | ||
1119 | XORQ SI, SI | ||
1120 | LEAL 1(SI)(R9*4), R9 | ||
1121 | MOVB BL, 1(AX) | ||
1122 | SARL $0x08, BX | ||
1123 | SHLL $0x05, BX | ||
1124 | ORL BX, R9 | ||
1125 | MOVB R9, (AX) | ||
1126 | ADDQ $0x02, AX | ||
1127 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1128 | |||
1129 | long_offset_short_match_nolit_encodeBlockAsm: | ||
1130 | MOVB $0xee, (AX) | ||
1131 | MOVW BX, 1(AX) | ||
1132 | LEAL -60(R9), R9 | ||
1133 | ADDQ $0x03, AX | ||
1134 | |||
1135 | // emitRepeat | ||
1136 | emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: | ||
1137 | MOVL R9, SI | ||
1138 | LEAL -4(R9), R9 | ||
1139 | CMPL SI, $0x08 | ||
1140 | JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short | ||
1141 | CMPL SI, $0x0c | ||
1142 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short | ||
1143 | CMPL BX, $0x00000800 | ||
1144 | JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short | ||
1145 | |||
1146 | cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: | ||
1147 | CMPL R9, $0x00000104 | ||
1148 | JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short | ||
1149 | CMPL R9, $0x00010100 | ||
1150 | JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short | ||
1151 | CMPL R9, $0x0100ffff | ||
1152 | JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short | ||
1153 | LEAL -16842747(R9), R9 | ||
1154 | MOVL $0xfffb001d, (AX) | ||
1155 | MOVB $0xff, 4(AX) | ||
1156 | ADDQ $0x05, AX | ||
1157 | JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short | ||
1158 | |||
1159 | repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: | ||
1160 | LEAL -65536(R9), R9 | ||
1161 | MOVL R9, BX | ||
1162 | MOVW $0x001d, (AX) | ||
1163 | MOVW R9, 2(AX) | ||
1164 | SARL $0x10, BX | ||
1165 | MOVB BL, 4(AX) | ||
1166 | ADDQ $0x05, AX | ||
1167 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1168 | |||
1169 | repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: | ||
1170 | LEAL -256(R9), R9 | ||
1171 | MOVW $0x0019, (AX) | ||
1172 | MOVW R9, 2(AX) | ||
1173 | ADDQ $0x04, AX | ||
1174 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1175 | |||
1176 | repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: | ||
1177 | LEAL -4(R9), R9 | ||
1178 | MOVW $0x0015, (AX) | ||
1179 | MOVB R9, 2(AX) | ||
1180 | ADDQ $0x03, AX | ||
1181 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1182 | |||
1183 | repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: | ||
1184 | SHLL $0x02, R9 | ||
1185 | ORL $0x01, R9 | ||
1186 | MOVW R9, (AX) | ||
1187 | ADDQ $0x02, AX | ||
1188 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1189 | |||
1190 | repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: | ||
1191 | XORQ SI, SI | ||
1192 | LEAL 1(SI)(R9*4), R9 | ||
1193 | MOVB BL, 1(AX) | ||
1194 | SARL $0x08, BX | ||
1195 | SHLL $0x05, BX | ||
1196 | ORL BX, R9 | ||
1197 | MOVB R9, (AX) | ||
1198 | ADDQ $0x02, AX | ||
1199 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1200 | |||
1201 | two_byte_offset_short_match_nolit_encodeBlockAsm: | ||
1202 | MOVL R9, SI | ||
1203 | SHLL $0x02, SI | ||
1204 | CMPL R9, $0x0c | ||
1205 | JAE emit_copy_three_match_nolit_encodeBlockAsm | ||
1206 | CMPL BX, $0x00000800 | ||
1207 | JAE emit_copy_three_match_nolit_encodeBlockAsm | ||
1208 | LEAL -15(SI), SI | ||
1209 | MOVB BL, 1(AX) | ||
1210 | SHRL $0x08, BX | ||
1211 | SHLL $0x05, BX | ||
1212 | ORL BX, SI | ||
1213 | MOVB SI, (AX) | ||
1214 | ADDQ $0x02, AX | ||
1215 | JMP match_nolit_emitcopy_end_encodeBlockAsm | ||
1216 | |||
1217 | emit_copy_three_match_nolit_encodeBlockAsm: | ||
1218 | LEAL -2(SI), SI | ||
1219 | MOVB SI, (AX) | ||
1220 | MOVW BX, 1(AX) | ||
1221 | ADDQ $0x03, AX | ||
1222 | |||
1223 | match_nolit_emitcopy_end_encodeBlockAsm: | ||
1224 | CMPL CX, 8(SP) | ||
1225 | JAE emit_remainder_encodeBlockAsm | ||
1226 | MOVQ -2(DX)(CX*1), SI | ||
1227 | CMPQ AX, (SP) | ||
1228 | JB match_nolit_dst_ok_encodeBlockAsm | ||
1229 | MOVQ $0x00000000, ret+48(FP) | ||
1230 | RET | ||
1231 | |||
1232 | match_nolit_dst_ok_encodeBlockAsm: | ||
1233 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
1234 | MOVQ SI, DI | ||
1235 | SHRQ $0x10, SI | ||
1236 | MOVQ SI, BX | ||
1237 | SHLQ $0x10, DI | ||
1238 | IMULQ R8, DI | ||
1239 | SHRQ $0x32, DI | ||
1240 | SHLQ $0x10, BX | ||
1241 | IMULQ R8, BX | ||
1242 | SHRQ $0x32, BX | ||
1243 | LEAL -2(CX), R8 | ||
1244 | LEAQ 24(SP)(BX*4), R9 | ||
1245 | MOVL (R9), BX | ||
1246 | MOVL R8, 24(SP)(DI*4) | ||
1247 | MOVL CX, (R9) | ||
1248 | CMPL (DX)(BX*1), SI | ||
1249 | JEQ match_nolit_loop_encodeBlockAsm | ||
1250 | INCL CX | ||
1251 | JMP search_loop_encodeBlockAsm | ||
1252 | |||
1253 | emit_remainder_encodeBlockAsm: | ||
1254 | MOVQ src_len+32(FP), CX | ||
1255 | SUBL 12(SP), CX | ||
1256 | LEAQ 5(AX)(CX*1), CX | ||
1257 | CMPQ CX, (SP) | ||
1258 | JB emit_remainder_ok_encodeBlockAsm | ||
1259 | MOVQ $0x00000000, ret+48(FP) | ||
1260 | RET | ||
1261 | |||
1262 | emit_remainder_ok_encodeBlockAsm: | ||
1263 | MOVQ src_len+32(FP), CX | ||
1264 | MOVL 12(SP), BX | ||
1265 | CMPL BX, CX | ||
1266 | JEQ emit_literal_done_emit_remainder_encodeBlockAsm | ||
1267 | MOVL CX, SI | ||
1268 | MOVL CX, 12(SP) | ||
1269 | LEAQ (DX)(BX*1), CX | ||
1270 | SUBL BX, SI | ||
1271 | LEAL -1(SI), DX | ||
1272 | CMPL DX, $0x3c | ||
1273 | JB one_byte_emit_remainder_encodeBlockAsm | ||
1274 | CMPL DX, $0x00000100 | ||
1275 | JB two_bytes_emit_remainder_encodeBlockAsm | ||
1276 | CMPL DX, $0x00010000 | ||
1277 | JB three_bytes_emit_remainder_encodeBlockAsm | ||
1278 | CMPL DX, $0x01000000 | ||
1279 | JB four_bytes_emit_remainder_encodeBlockAsm | ||
1280 | MOVB $0xfc, (AX) | ||
1281 | MOVL DX, 1(AX) | ||
1282 | ADDQ $0x05, AX | ||
1283 | JMP memmove_long_emit_remainder_encodeBlockAsm | ||
1284 | |||
1285 | four_bytes_emit_remainder_encodeBlockAsm: | ||
1286 | MOVL DX, BX | ||
1287 | SHRL $0x10, BX | ||
1288 | MOVB $0xf8, (AX) | ||
1289 | MOVW DX, 1(AX) | ||
1290 | MOVB BL, 3(AX) | ||
1291 | ADDQ $0x04, AX | ||
1292 | JMP memmove_long_emit_remainder_encodeBlockAsm | ||
1293 | |||
1294 | three_bytes_emit_remainder_encodeBlockAsm: | ||
1295 | MOVB $0xf4, (AX) | ||
1296 | MOVW DX, 1(AX) | ||
1297 | ADDQ $0x03, AX | ||
1298 | JMP memmove_long_emit_remainder_encodeBlockAsm | ||
1299 | |||
1300 | two_bytes_emit_remainder_encodeBlockAsm: | ||
1301 | MOVB $0xf0, (AX) | ||
1302 | MOVB DL, 1(AX) | ||
1303 | ADDQ $0x02, AX | ||
1304 | CMPL DX, $0x40 | ||
1305 | JB memmove_emit_remainder_encodeBlockAsm | ||
1306 | JMP memmove_long_emit_remainder_encodeBlockAsm | ||
1307 | |||
1308 | one_byte_emit_remainder_encodeBlockAsm: | ||
1309 | SHLB $0x02, DL | ||
1310 | MOVB DL, (AX) | ||
1311 | ADDQ $0x01, AX | ||
1312 | |||
1313 | memmove_emit_remainder_encodeBlockAsm: | ||
1314 | LEAQ (AX)(SI*1), DX | ||
1315 | MOVL SI, BX | ||
1316 | |||
1317 | // genMemMoveShort | ||
1318 | CMPQ BX, $0x03 | ||
1319 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 | ||
1320 | JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 | ||
1321 | CMPQ BX, $0x08 | ||
1322 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 | ||
1323 | CMPQ BX, $0x10 | ||
1324 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 | ||
1325 | CMPQ BX, $0x20 | ||
1326 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 | ||
1327 | JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 | ||
1328 | |||
1329 | emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: | ||
1330 | MOVB (CX), SI | ||
1331 | MOVB -1(CX)(BX*1), CL | ||
1332 | MOVB SI, (AX) | ||
1333 | MOVB CL, -1(AX)(BX*1) | ||
1334 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm | ||
1335 | |||
1336 | emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: | ||
1337 | MOVW (CX), SI | ||
1338 | MOVB 2(CX), CL | ||
1339 | MOVW SI, (AX) | ||
1340 | MOVB CL, 2(AX) | ||
1341 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm | ||
1342 | |||
1343 | emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: | ||
1344 | MOVL (CX), SI | ||
1345 | MOVL -4(CX)(BX*1), CX | ||
1346 | MOVL SI, (AX) | ||
1347 | MOVL CX, -4(AX)(BX*1) | ||
1348 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm | ||
1349 | |||
1350 | emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: | ||
1351 | MOVQ (CX), SI | ||
1352 | MOVQ -8(CX)(BX*1), CX | ||
1353 | MOVQ SI, (AX) | ||
1354 | MOVQ CX, -8(AX)(BX*1) | ||
1355 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm | ||
1356 | |||
1357 | emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: | ||
1358 | MOVOU (CX), X0 | ||
1359 | MOVOU -16(CX)(BX*1), X1 | ||
1360 | MOVOU X0, (AX) | ||
1361 | MOVOU X1, -16(AX)(BX*1) | ||
1362 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm | ||
1363 | |||
1364 | emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: | ||
1365 | MOVOU (CX), X0 | ||
1366 | MOVOU 16(CX), X1 | ||
1367 | MOVOU -32(CX)(BX*1), X2 | ||
1368 | MOVOU -16(CX)(BX*1), X3 | ||
1369 | MOVOU X0, (AX) | ||
1370 | MOVOU X1, 16(AX) | ||
1371 | MOVOU X2, -32(AX)(BX*1) | ||
1372 | MOVOU X3, -16(AX)(BX*1) | ||
1373 | |||
1374 | memmove_end_copy_emit_remainder_encodeBlockAsm: | ||
1375 | MOVQ DX, AX | ||
1376 | JMP emit_literal_done_emit_remainder_encodeBlockAsm | ||
1377 | |||
1378 | memmove_long_emit_remainder_encodeBlockAsm: | ||
1379 | LEAQ (AX)(SI*1), DX | ||
1380 | MOVL SI, BX | ||
1381 | |||
1382 | // genMemMoveLong | ||
1383 | MOVOU (CX), X0 | ||
1384 | MOVOU 16(CX), X1 | ||
1385 | MOVOU -32(CX)(BX*1), X2 | ||
1386 | MOVOU -16(CX)(BX*1), X3 | ||
1387 | MOVQ BX, DI | ||
1388 | SHRQ $0x05, DI | ||
1389 | MOVQ AX, SI | ||
1390 | ANDL $0x0000001f, SI | ||
1391 | MOVQ $0x00000040, R8 | ||
1392 | SUBQ SI, R8 | ||
1393 | DECQ DI | ||
1394 | JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 | ||
1395 | LEAQ -32(CX)(R8*1), SI | ||
1396 | LEAQ -32(AX)(R8*1), R9 | ||
1397 | |||
1398 | emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: | ||
1399 | MOVOU (SI), X4 | ||
1400 | MOVOU 16(SI), X5 | ||
1401 | MOVOA X4, (R9) | ||
1402 | MOVOA X5, 16(R9) | ||
1403 | ADDQ $0x20, R9 | ||
1404 | ADDQ $0x20, SI | ||
1405 | ADDQ $0x20, R8 | ||
1406 | DECQ DI | ||
1407 | JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back | ||
1408 | |||
1409 | emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: | ||
1410 | MOVOU -32(CX)(R8*1), X4 | ||
1411 | MOVOU -16(CX)(R8*1), X5 | ||
1412 | MOVOA X4, -32(AX)(R8*1) | ||
1413 | MOVOA X5, -16(AX)(R8*1) | ||
1414 | ADDQ $0x20, R8 | ||
1415 | CMPQ BX, R8 | ||
1416 | JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 | ||
1417 | MOVOU X0, (AX) | ||
1418 | MOVOU X1, 16(AX) | ||
1419 | MOVOU X2, -32(AX)(BX*1) | ||
1420 | MOVOU X3, -16(AX)(BX*1) | ||
1421 | MOVQ DX, AX | ||
1422 | |||
1423 | emit_literal_done_emit_remainder_encodeBlockAsm: | ||
1424 | MOVQ dst_base+0(FP), CX | ||
1425 | SUBQ CX, AX | ||
1426 | MOVQ AX, ret+48(FP) | ||
1427 | RET | ||
1428 | |||
1429 | // func encodeBlockAsm4MB(dst []byte, src []byte) int | ||
1430 | // Requires: BMI, SSE2 | ||
1431 | TEXT ·encodeBlockAsm4MB(SB), $65560-56 | ||
1432 | MOVQ dst_base+0(FP), AX | ||
1433 | MOVQ $0x00000200, CX | ||
1434 | LEAQ 24(SP), DX | ||
1435 | PXOR X0, X0 | ||
1436 | |||
1437 | zero_loop_encodeBlockAsm4MB: | ||
1438 | MOVOU X0, (DX) | ||
1439 | MOVOU X0, 16(DX) | ||
1440 | MOVOU X0, 32(DX) | ||
1441 | MOVOU X0, 48(DX) | ||
1442 | MOVOU X0, 64(DX) | ||
1443 | MOVOU X0, 80(DX) | ||
1444 | MOVOU X0, 96(DX) | ||
1445 | MOVOU X0, 112(DX) | ||
1446 | ADDQ $0x80, DX | ||
1447 | DECQ CX | ||
1448 | JNZ zero_loop_encodeBlockAsm4MB | ||
1449 | MOVL $0x00000000, 12(SP) | ||
1450 | MOVQ src_len+32(FP), CX | ||
1451 | LEAQ -9(CX), DX | ||
1452 | LEAQ -8(CX), BX | ||
1453 | MOVL BX, 8(SP) | ||
1454 | SHRQ $0x05, CX | ||
1455 | SUBL CX, DX | ||
1456 | LEAQ (AX)(DX*1), DX | ||
1457 | MOVQ DX, (SP) | ||
1458 | MOVL $0x00000001, CX | ||
1459 | MOVL CX, 16(SP) | ||
1460 | MOVQ src_base+24(FP), DX | ||
1461 | |||
1462 | search_loop_encodeBlockAsm4MB: | ||
1463 | MOVL CX, BX | ||
1464 | SUBL 12(SP), BX | ||
1465 | SHRL $0x06, BX | ||
1466 | LEAL 4(CX)(BX*1), BX | ||
1467 | CMPL BX, 8(SP) | ||
1468 | JAE emit_remainder_encodeBlockAsm4MB | ||
1469 | MOVQ (DX)(CX*1), SI | ||
1470 | MOVL BX, 20(SP) | ||
1471 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
1472 | MOVQ SI, R9 | ||
1473 | MOVQ SI, R10 | ||
1474 | SHRQ $0x08, R10 | ||
1475 | SHLQ $0x10, R9 | ||
1476 | IMULQ R8, R9 | ||
1477 | SHRQ $0x32, R9 | ||
1478 | SHLQ $0x10, R10 | ||
1479 | IMULQ R8, R10 | ||
1480 | SHRQ $0x32, R10 | ||
1481 | MOVL 24(SP)(R9*4), BX | ||
1482 | MOVL 24(SP)(R10*4), DI | ||
1483 | MOVL CX, 24(SP)(R9*4) | ||
1484 | LEAL 1(CX), R9 | ||
1485 | MOVL R9, 24(SP)(R10*4) | ||
1486 | MOVQ SI, R9 | ||
1487 | SHRQ $0x10, R9 | ||
1488 | SHLQ $0x10, R9 | ||
1489 | IMULQ R8, R9 | ||
1490 | SHRQ $0x32, R9 | ||
1491 | MOVL CX, R8 | ||
1492 | SUBL 16(SP), R8 | ||
1493 | MOVL 1(DX)(R8*1), R10 | ||
1494 | MOVQ SI, R8 | ||
1495 | SHRQ $0x08, R8 | ||
1496 | CMPL R8, R10 | ||
1497 | JNE no_repeat_found_encodeBlockAsm4MB | ||
1498 | LEAL 1(CX), SI | ||
1499 | MOVL 12(SP), DI | ||
1500 | MOVL SI, BX | ||
1501 | SUBL 16(SP), BX | ||
1502 | JZ repeat_extend_back_end_encodeBlockAsm4MB | ||
1503 | |||
1504 | repeat_extend_back_loop_encodeBlockAsm4MB: | ||
1505 | CMPL SI, DI | ||
1506 | JBE repeat_extend_back_end_encodeBlockAsm4MB | ||
1507 | MOVB -1(DX)(BX*1), R8 | ||
1508 | MOVB -1(DX)(SI*1), R9 | ||
1509 | CMPB R8, R9 | ||
1510 | JNE repeat_extend_back_end_encodeBlockAsm4MB | ||
1511 | LEAL -1(SI), SI | ||
1512 | DECL BX | ||
1513 | JNZ repeat_extend_back_loop_encodeBlockAsm4MB | ||
1514 | |||
1515 | repeat_extend_back_end_encodeBlockAsm4MB: | ||
1516 | MOVL 12(SP), BX | ||
1517 | CMPL BX, SI | ||
1518 | JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB | ||
1519 | MOVL SI, R8 | ||
1520 | MOVL SI, 12(SP) | ||
1521 | LEAQ (DX)(BX*1), R9 | ||
1522 | SUBL BX, R8 | ||
1523 | LEAL -1(R8), BX | ||
1524 | CMPL BX, $0x3c | ||
1525 | JB one_byte_repeat_emit_encodeBlockAsm4MB | ||
1526 | CMPL BX, $0x00000100 | ||
1527 | JB two_bytes_repeat_emit_encodeBlockAsm4MB | ||
1528 | CMPL BX, $0x00010000 | ||
1529 | JB three_bytes_repeat_emit_encodeBlockAsm4MB | ||
1530 | MOVL BX, R10 | ||
1531 | SHRL $0x10, R10 | ||
1532 | MOVB $0xf8, (AX) | ||
1533 | MOVW BX, 1(AX) | ||
1534 | MOVB R10, 3(AX) | ||
1535 | ADDQ $0x04, AX | ||
1536 | JMP memmove_long_repeat_emit_encodeBlockAsm4MB | ||
1537 | |||
1538 | three_bytes_repeat_emit_encodeBlockAsm4MB: | ||
1539 | MOVB $0xf4, (AX) | ||
1540 | MOVW BX, 1(AX) | ||
1541 | ADDQ $0x03, AX | ||
1542 | JMP memmove_long_repeat_emit_encodeBlockAsm4MB | ||
1543 | |||
1544 | two_bytes_repeat_emit_encodeBlockAsm4MB: | ||
1545 | MOVB $0xf0, (AX) | ||
1546 | MOVB BL, 1(AX) | ||
1547 | ADDQ $0x02, AX | ||
1548 | CMPL BX, $0x40 | ||
1549 | JB memmove_repeat_emit_encodeBlockAsm4MB | ||
1550 | JMP memmove_long_repeat_emit_encodeBlockAsm4MB | ||
1551 | |||
1552 | one_byte_repeat_emit_encodeBlockAsm4MB: | ||
1553 | SHLB $0x02, BL | ||
1554 | MOVB BL, (AX) | ||
1555 | ADDQ $0x01, AX | ||
1556 | |||
1557 | memmove_repeat_emit_encodeBlockAsm4MB: | ||
1558 | LEAQ (AX)(R8*1), BX | ||
1559 | |||
1560 | // genMemMoveShort | ||
1561 | CMPQ R8, $0x08 | ||
1562 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 | ||
1563 | CMPQ R8, $0x10 | ||
1564 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 | ||
1565 | CMPQ R8, $0x20 | ||
1566 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 | ||
1567 | JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 | ||
1568 | |||
1569 | emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: | ||
1570 | MOVQ (R9), R10 | ||
1571 | MOVQ R10, (AX) | ||
1572 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB | ||
1573 | |||
1574 | emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: | ||
1575 | MOVQ (R9), R10 | ||
1576 | MOVQ -8(R9)(R8*1), R9 | ||
1577 | MOVQ R10, (AX) | ||
1578 | MOVQ R9, -8(AX)(R8*1) | ||
1579 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB | ||
1580 | |||
1581 | emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: | ||
1582 | MOVOU (R9), X0 | ||
1583 | MOVOU -16(R9)(R8*1), X1 | ||
1584 | MOVOU X0, (AX) | ||
1585 | MOVOU X1, -16(AX)(R8*1) | ||
1586 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB | ||
1587 | |||
1588 | emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: | ||
1589 | MOVOU (R9), X0 | ||
1590 | MOVOU 16(R9), X1 | ||
1591 | MOVOU -32(R9)(R8*1), X2 | ||
1592 | MOVOU -16(R9)(R8*1), X3 | ||
1593 | MOVOU X0, (AX) | ||
1594 | MOVOU X1, 16(AX) | ||
1595 | MOVOU X2, -32(AX)(R8*1) | ||
1596 | MOVOU X3, -16(AX)(R8*1) | ||
1597 | |||
1598 | memmove_end_copy_repeat_emit_encodeBlockAsm4MB: | ||
1599 | MOVQ BX, AX | ||
1600 | JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB | ||
1601 | |||
1602 | memmove_long_repeat_emit_encodeBlockAsm4MB: | ||
1603 | LEAQ (AX)(R8*1), BX | ||
1604 | |||
1605 | // genMemMoveLong | ||
1606 | MOVOU (R9), X0 | ||
1607 | MOVOU 16(R9), X1 | ||
1608 | MOVOU -32(R9)(R8*1), X2 | ||
1609 | MOVOU -16(R9)(R8*1), X3 | ||
1610 | MOVQ R8, R11 | ||
1611 | SHRQ $0x05, R11 | ||
1612 | MOVQ AX, R10 | ||
1613 | ANDL $0x0000001f, R10 | ||
1614 | MOVQ $0x00000040, R12 | ||
1615 | SUBQ R10, R12 | ||
1616 | DECQ R11 | ||
1617 | JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 | ||
1618 | LEAQ -32(R9)(R12*1), R10 | ||
1619 | LEAQ -32(AX)(R12*1), R13 | ||
1620 | |||
1621 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: | ||
1622 | MOVOU (R10), X4 | ||
1623 | MOVOU 16(R10), X5 | ||
1624 | MOVOA X4, (R13) | ||
1625 | MOVOA X5, 16(R13) | ||
1626 | ADDQ $0x20, R13 | ||
1627 | ADDQ $0x20, R10 | ||
1628 | ADDQ $0x20, R12 | ||
1629 | DECQ R11 | ||
1630 | JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back | ||
1631 | |||
1632 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: | ||
1633 | MOVOU -32(R9)(R12*1), X4 | ||
1634 | MOVOU -16(R9)(R12*1), X5 | ||
1635 | MOVOA X4, -32(AX)(R12*1) | ||
1636 | MOVOA X5, -16(AX)(R12*1) | ||
1637 | ADDQ $0x20, R12 | ||
1638 | CMPQ R8, R12 | ||
1639 | JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 | ||
1640 | MOVOU X0, (AX) | ||
1641 | MOVOU X1, 16(AX) | ||
1642 | MOVOU X2, -32(AX)(R8*1) | ||
1643 | MOVOU X3, -16(AX)(R8*1) | ||
1644 | MOVQ BX, AX | ||
1645 | |||
1646 | emit_literal_done_repeat_emit_encodeBlockAsm4MB: | ||
1647 | ADDL $0x05, CX | ||
1648 | MOVL CX, BX | ||
1649 | SUBL 16(SP), BX | ||
1650 | MOVQ src_len+32(FP), R8 | ||
1651 | SUBL CX, R8 | ||
1652 | LEAQ (DX)(CX*1), R9 | ||
1653 | LEAQ (DX)(BX*1), BX | ||
1654 | |||
1655 | // matchLen | ||
1656 | XORL R11, R11 | ||
1657 | |||
1658 | matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB: | ||
1659 | CMPL R8, $0x10 | ||
1660 | JB matchlen_match8_repeat_extend_encodeBlockAsm4MB | ||
1661 | MOVQ (R9)(R11*1), R10 | ||
1662 | MOVQ 8(R9)(R11*1), R12 | ||
1663 | XORQ (BX)(R11*1), R10 | ||
1664 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB | ||
1665 | XORQ 8(BX)(R11*1), R12 | ||
1666 | JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB | ||
1667 | LEAL -16(R8), R8 | ||
1668 | LEAL 16(R11), R11 | ||
1669 | JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB | ||
1670 | |||
1671 | matchlen_bsf_16repeat_extend_encodeBlockAsm4MB: | ||
1672 | #ifdef GOAMD64_v3 | ||
1673 | TZCNTQ R12, R12 | ||
1674 | |||
1675 | #else | ||
1676 | BSFQ R12, R12 | ||
1677 | |||
1678 | #endif | ||
1679 | SARQ $0x03, R12 | ||
1680 | LEAL 8(R11)(R12*1), R11 | ||
1681 | JMP repeat_extend_forward_end_encodeBlockAsm4MB | ||
1682 | |||
1683 | matchlen_match8_repeat_extend_encodeBlockAsm4MB: | ||
1684 | CMPL R8, $0x08 | ||
1685 | JB matchlen_match4_repeat_extend_encodeBlockAsm4MB | ||
1686 | MOVQ (R9)(R11*1), R10 | ||
1687 | XORQ (BX)(R11*1), R10 | ||
1688 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB | ||
1689 | LEAL -8(R8), R8 | ||
1690 | LEAL 8(R11), R11 | ||
1691 | JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB | ||
1692 | |||
1693 | matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB: | ||
1694 | #ifdef GOAMD64_v3 | ||
1695 | TZCNTQ R10, R10 | ||
1696 | |||
1697 | #else | ||
1698 | BSFQ R10, R10 | ||
1699 | |||
1700 | #endif | ||
1701 | SARQ $0x03, R10 | ||
1702 | LEAL (R11)(R10*1), R11 | ||
1703 | JMP repeat_extend_forward_end_encodeBlockAsm4MB | ||
1704 | |||
1705 | matchlen_match4_repeat_extend_encodeBlockAsm4MB: | ||
1706 | CMPL R8, $0x04 | ||
1707 | JB matchlen_match2_repeat_extend_encodeBlockAsm4MB | ||
1708 | MOVL (R9)(R11*1), R10 | ||
1709 | CMPL (BX)(R11*1), R10 | ||
1710 | JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB | ||
1711 | LEAL -4(R8), R8 | ||
1712 | LEAL 4(R11), R11 | ||
1713 | |||
1714 | matchlen_match2_repeat_extend_encodeBlockAsm4MB: | ||
1715 | CMPL R8, $0x01 | ||
1716 | JE matchlen_match1_repeat_extend_encodeBlockAsm4MB | ||
1717 | JB repeat_extend_forward_end_encodeBlockAsm4MB | ||
1718 | MOVW (R9)(R11*1), R10 | ||
1719 | CMPW (BX)(R11*1), R10 | ||
1720 | JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB | ||
1721 | LEAL 2(R11), R11 | ||
1722 | SUBL $0x02, R8 | ||
1723 | JZ repeat_extend_forward_end_encodeBlockAsm4MB | ||
1724 | |||
1725 | matchlen_match1_repeat_extend_encodeBlockAsm4MB: | ||
1726 | MOVB (R9)(R11*1), R10 | ||
1727 | CMPB (BX)(R11*1), R10 | ||
1728 | JNE repeat_extend_forward_end_encodeBlockAsm4MB | ||
1729 | LEAL 1(R11), R11 | ||
1730 | |||
1731 | repeat_extend_forward_end_encodeBlockAsm4MB: | ||
1732 | ADDL R11, CX | ||
1733 | MOVL CX, BX | ||
1734 | SUBL SI, BX | ||
1735 | MOVL 16(SP), SI | ||
1736 | TESTL DI, DI | ||
1737 | JZ repeat_as_copy_encodeBlockAsm4MB | ||
1738 | |||
1739 | // emitRepeat | ||
1740 | MOVL BX, DI | ||
1741 | LEAL -4(BX), BX | ||
1742 | CMPL DI, $0x08 | ||
1743 | JBE repeat_two_match_repeat_encodeBlockAsm4MB | ||
1744 | CMPL DI, $0x0c | ||
1745 | JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB | ||
1746 | CMPL SI, $0x00000800 | ||
1747 | JB repeat_two_offset_match_repeat_encodeBlockAsm4MB | ||
1748 | |||
1749 | cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: | ||
1750 | CMPL BX, $0x00000104 | ||
1751 | JB repeat_three_match_repeat_encodeBlockAsm4MB | ||
1752 | CMPL BX, $0x00010100 | ||
1753 | JB repeat_four_match_repeat_encodeBlockAsm4MB | ||
1754 | LEAL -65536(BX), BX | ||
1755 | MOVL BX, SI | ||
1756 | MOVW $0x001d, (AX) | ||
1757 | MOVW BX, 2(AX) | ||
1758 | SARL $0x10, SI | ||
1759 | MOVB SI, 4(AX) | ||
1760 | ADDQ $0x05, AX | ||
1761 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1762 | |||
1763 | repeat_four_match_repeat_encodeBlockAsm4MB: | ||
1764 | LEAL -256(BX), BX | ||
1765 | MOVW $0x0019, (AX) | ||
1766 | MOVW BX, 2(AX) | ||
1767 | ADDQ $0x04, AX | ||
1768 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1769 | |||
1770 | repeat_three_match_repeat_encodeBlockAsm4MB: | ||
1771 | LEAL -4(BX), BX | ||
1772 | MOVW $0x0015, (AX) | ||
1773 | MOVB BL, 2(AX) | ||
1774 | ADDQ $0x03, AX | ||
1775 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1776 | |||
1777 | repeat_two_match_repeat_encodeBlockAsm4MB: | ||
1778 | SHLL $0x02, BX | ||
1779 | ORL $0x01, BX | ||
1780 | MOVW BX, (AX) | ||
1781 | ADDQ $0x02, AX | ||
1782 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1783 | |||
1784 | repeat_two_offset_match_repeat_encodeBlockAsm4MB: | ||
1785 | XORQ DI, DI | ||
1786 | LEAL 1(DI)(BX*4), BX | ||
1787 | MOVB SI, 1(AX) | ||
1788 | SARL $0x08, SI | ||
1789 | SHLL $0x05, SI | ||
1790 | ORL SI, BX | ||
1791 | MOVB BL, (AX) | ||
1792 | ADDQ $0x02, AX | ||
1793 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1794 | |||
1795 | repeat_as_copy_encodeBlockAsm4MB: | ||
1796 | // emitCopy | ||
1797 | CMPL SI, $0x00010000 | ||
1798 | JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB | ||
1799 | CMPL BX, $0x40 | ||
1800 | JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB | ||
1801 | MOVB $0xff, (AX) | ||
1802 | MOVL SI, 1(AX) | ||
1803 | LEAL -64(BX), BX | ||
1804 | ADDQ $0x05, AX | ||
1805 | CMPL BX, $0x04 | ||
1806 | JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB | ||
1807 | |||
1808 | // emitRepeat | ||
1809 | MOVL BX, DI | ||
1810 | LEAL -4(BX), BX | ||
1811 | CMPL DI, $0x08 | ||
1812 | JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy | ||
1813 | CMPL DI, $0x0c | ||
1814 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy | ||
1815 | CMPL SI, $0x00000800 | ||
1816 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy | ||
1817 | |||
1818 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: | ||
1819 | CMPL BX, $0x00000104 | ||
1820 | JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy | ||
1821 | CMPL BX, $0x00010100 | ||
1822 | JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy | ||
1823 | LEAL -65536(BX), BX | ||
1824 | MOVL BX, SI | ||
1825 | MOVW $0x001d, (AX) | ||
1826 | MOVW BX, 2(AX) | ||
1827 | SARL $0x10, SI | ||
1828 | MOVB SI, 4(AX) | ||
1829 | ADDQ $0x05, AX | ||
1830 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1831 | |||
1832 | repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: | ||
1833 | LEAL -256(BX), BX | ||
1834 | MOVW $0x0019, (AX) | ||
1835 | MOVW BX, 2(AX) | ||
1836 | ADDQ $0x04, AX | ||
1837 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1838 | |||
1839 | repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: | ||
1840 | LEAL -4(BX), BX | ||
1841 | MOVW $0x0015, (AX) | ||
1842 | MOVB BL, 2(AX) | ||
1843 | ADDQ $0x03, AX | ||
1844 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1845 | |||
1846 | repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: | ||
1847 | SHLL $0x02, BX | ||
1848 | ORL $0x01, BX | ||
1849 | MOVW BX, (AX) | ||
1850 | ADDQ $0x02, AX | ||
1851 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1852 | |||
1853 | repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: | ||
1854 | XORQ DI, DI | ||
1855 | LEAL 1(DI)(BX*4), BX | ||
1856 | MOVB SI, 1(AX) | ||
1857 | SARL $0x08, SI | ||
1858 | SHLL $0x05, SI | ||
1859 | ORL SI, BX | ||
1860 | MOVB BL, (AX) | ||
1861 | ADDQ $0x02, AX | ||
1862 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1863 | |||
1864 | four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: | ||
1865 | TESTL BX, BX | ||
1866 | JZ repeat_end_emit_encodeBlockAsm4MB | ||
1867 | XORL DI, DI | ||
1868 | LEAL -1(DI)(BX*4), BX | ||
1869 | MOVB BL, (AX) | ||
1870 | MOVL SI, 1(AX) | ||
1871 | ADDQ $0x05, AX | ||
1872 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1873 | |||
1874 | two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: | ||
1875 | CMPL BX, $0x40 | ||
1876 | JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB | ||
1877 | CMPL SI, $0x00000800 | ||
1878 | JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB | ||
1879 | MOVL $0x00000001, DI | ||
1880 | LEAL 16(DI), DI | ||
1881 | MOVB SI, 1(AX) | ||
1882 | SHRL $0x08, SI | ||
1883 | SHLL $0x05, SI | ||
1884 | ORL SI, DI | ||
1885 | MOVB DI, (AX) | ||
1886 | ADDQ $0x02, AX | ||
1887 | SUBL $0x08, BX | ||
1888 | |||
1889 | // emitRepeat | ||
1890 | LEAL -4(BX), BX | ||
1891 | JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b | ||
1892 | MOVL BX, DI | ||
1893 | LEAL -4(BX), BX | ||
1894 | CMPL DI, $0x08 | ||
1895 | JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b | ||
1896 | CMPL DI, $0x0c | ||
1897 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b | ||
1898 | CMPL SI, $0x00000800 | ||
1899 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b | ||
1900 | |||
1901 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: | ||
1902 | CMPL BX, $0x00000104 | ||
1903 | JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b | ||
1904 | CMPL BX, $0x00010100 | ||
1905 | JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b | ||
1906 | LEAL -65536(BX), BX | ||
1907 | MOVL BX, SI | ||
1908 | MOVW $0x001d, (AX) | ||
1909 | MOVW BX, 2(AX) | ||
1910 | SARL $0x10, SI | ||
1911 | MOVB SI, 4(AX) | ||
1912 | ADDQ $0x05, AX | ||
1913 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1914 | |||
1915 | repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: | ||
1916 | LEAL -256(BX), BX | ||
1917 | MOVW $0x0019, (AX) | ||
1918 | MOVW BX, 2(AX) | ||
1919 | ADDQ $0x04, AX | ||
1920 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1921 | |||
1922 | repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: | ||
1923 | LEAL -4(BX), BX | ||
1924 | MOVW $0x0015, (AX) | ||
1925 | MOVB BL, 2(AX) | ||
1926 | ADDQ $0x03, AX | ||
1927 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1928 | |||
1929 | repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: | ||
1930 | SHLL $0x02, BX | ||
1931 | ORL $0x01, BX | ||
1932 | MOVW BX, (AX) | ||
1933 | ADDQ $0x02, AX | ||
1934 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1935 | |||
1936 | repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: | ||
1937 | XORQ DI, DI | ||
1938 | LEAL 1(DI)(BX*4), BX | ||
1939 | MOVB SI, 1(AX) | ||
1940 | SARL $0x08, SI | ||
1941 | SHLL $0x05, SI | ||
1942 | ORL SI, BX | ||
1943 | MOVB BL, (AX) | ||
1944 | ADDQ $0x02, AX | ||
1945 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1946 | |||
1947 | long_offset_short_repeat_as_copy_encodeBlockAsm4MB: | ||
1948 | MOVB $0xee, (AX) | ||
1949 | MOVW SI, 1(AX) | ||
1950 | LEAL -60(BX), BX | ||
1951 | ADDQ $0x03, AX | ||
1952 | |||
1953 | // emitRepeat | ||
1954 | MOVL BX, DI | ||
1955 | LEAL -4(BX), BX | ||
1956 | CMPL DI, $0x08 | ||
1957 | JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short | ||
1958 | CMPL DI, $0x0c | ||
1959 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short | ||
1960 | CMPL SI, $0x00000800 | ||
1961 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short | ||
1962 | |||
1963 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: | ||
1964 | CMPL BX, $0x00000104 | ||
1965 | JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short | ||
1966 | CMPL BX, $0x00010100 | ||
1967 | JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short | ||
1968 | LEAL -65536(BX), BX | ||
1969 | MOVL BX, SI | ||
1970 | MOVW $0x001d, (AX) | ||
1971 | MOVW BX, 2(AX) | ||
1972 | SARL $0x10, SI | ||
1973 | MOVB SI, 4(AX) | ||
1974 | ADDQ $0x05, AX | ||
1975 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1976 | |||
1977 | repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: | ||
1978 | LEAL -256(BX), BX | ||
1979 | MOVW $0x0019, (AX) | ||
1980 | MOVW BX, 2(AX) | ||
1981 | ADDQ $0x04, AX | ||
1982 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1983 | |||
1984 | repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: | ||
1985 | LEAL -4(BX), BX | ||
1986 | MOVW $0x0015, (AX) | ||
1987 | MOVB BL, 2(AX) | ||
1988 | ADDQ $0x03, AX | ||
1989 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1990 | |||
1991 | repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: | ||
1992 | SHLL $0x02, BX | ||
1993 | ORL $0x01, BX | ||
1994 | MOVW BX, (AX) | ||
1995 | ADDQ $0x02, AX | ||
1996 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
1997 | |||
1998 | repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: | ||
1999 | XORQ DI, DI | ||
2000 | LEAL 1(DI)(BX*4), BX | ||
2001 | MOVB SI, 1(AX) | ||
2002 | SARL $0x08, SI | ||
2003 | SHLL $0x05, SI | ||
2004 | ORL SI, BX | ||
2005 | MOVB BL, (AX) | ||
2006 | ADDQ $0x02, AX | ||
2007 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
2008 | |||
2009 | two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: | ||
2010 | MOVL BX, DI | ||
2011 | SHLL $0x02, DI | ||
2012 | CMPL BX, $0x0c | ||
2013 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB | ||
2014 | CMPL SI, $0x00000800 | ||
2015 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB | ||
2016 | LEAL -15(DI), DI | ||
2017 | MOVB SI, 1(AX) | ||
2018 | SHRL $0x08, SI | ||
2019 | SHLL $0x05, SI | ||
2020 | ORL SI, DI | ||
2021 | MOVB DI, (AX) | ||
2022 | ADDQ $0x02, AX | ||
2023 | JMP repeat_end_emit_encodeBlockAsm4MB | ||
2024 | |||
2025 | emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: | ||
2026 | LEAL -2(DI), DI | ||
2027 | MOVB DI, (AX) | ||
2028 | MOVW SI, 1(AX) | ||
2029 | ADDQ $0x03, AX | ||
2030 | |||
2031 | repeat_end_emit_encodeBlockAsm4MB: | ||
2032 | MOVL CX, 12(SP) | ||
2033 | JMP search_loop_encodeBlockAsm4MB | ||
2034 | |||
2035 | no_repeat_found_encodeBlockAsm4MB: | ||
2036 | CMPL (DX)(BX*1), SI | ||
2037 | JEQ candidate_match_encodeBlockAsm4MB | ||
2038 | SHRQ $0x08, SI | ||
2039 | MOVL 24(SP)(R9*4), BX | ||
2040 | LEAL 2(CX), R8 | ||
2041 | CMPL (DX)(DI*1), SI | ||
2042 | JEQ candidate2_match_encodeBlockAsm4MB | ||
2043 | MOVL R8, 24(SP)(R9*4) | ||
2044 | SHRQ $0x08, SI | ||
2045 | CMPL (DX)(BX*1), SI | ||
2046 | JEQ candidate3_match_encodeBlockAsm4MB | ||
2047 | MOVL 20(SP), CX | ||
2048 | JMP search_loop_encodeBlockAsm4MB | ||
2049 | |||
2050 | candidate3_match_encodeBlockAsm4MB: | ||
2051 | ADDL $0x02, CX | ||
2052 | JMP candidate_match_encodeBlockAsm4MB | ||
2053 | |||
2054 | candidate2_match_encodeBlockAsm4MB: | ||
2055 | MOVL R8, 24(SP)(R9*4) | ||
2056 | INCL CX | ||
2057 | MOVL DI, BX | ||
2058 | |||
2059 | candidate_match_encodeBlockAsm4MB: | ||
2060 | MOVL 12(SP), SI | ||
2061 | TESTL BX, BX | ||
2062 | JZ match_extend_back_end_encodeBlockAsm4MB | ||
2063 | |||
2064 | match_extend_back_loop_encodeBlockAsm4MB: | ||
2065 | CMPL CX, SI | ||
2066 | JBE match_extend_back_end_encodeBlockAsm4MB | ||
2067 | MOVB -1(DX)(BX*1), DI | ||
2068 | MOVB -1(DX)(CX*1), R8 | ||
2069 | CMPB DI, R8 | ||
2070 | JNE match_extend_back_end_encodeBlockAsm4MB | ||
2071 | LEAL -1(CX), CX | ||
2072 | DECL BX | ||
2073 | JZ match_extend_back_end_encodeBlockAsm4MB | ||
2074 | JMP match_extend_back_loop_encodeBlockAsm4MB | ||
2075 | |||
2076 | match_extend_back_end_encodeBlockAsm4MB: | ||
2077 | MOVL CX, SI | ||
2078 | SUBL 12(SP), SI | ||
2079 | LEAQ 4(AX)(SI*1), SI | ||
2080 | CMPQ SI, (SP) | ||
2081 | JB match_dst_size_check_encodeBlockAsm4MB | ||
2082 | MOVQ $0x00000000, ret+48(FP) | ||
2083 | RET | ||
2084 | |||
2085 | match_dst_size_check_encodeBlockAsm4MB: | ||
2086 | MOVL CX, SI | ||
2087 | MOVL 12(SP), DI | ||
2088 | CMPL DI, SI | ||
2089 | JEQ emit_literal_done_match_emit_encodeBlockAsm4MB | ||
2090 | MOVL SI, R8 | ||
2091 | MOVL SI, 12(SP) | ||
2092 | LEAQ (DX)(DI*1), SI | ||
2093 | SUBL DI, R8 | ||
2094 | LEAL -1(R8), DI | ||
2095 | CMPL DI, $0x3c | ||
2096 | JB one_byte_match_emit_encodeBlockAsm4MB | ||
2097 | CMPL DI, $0x00000100 | ||
2098 | JB two_bytes_match_emit_encodeBlockAsm4MB | ||
2099 | CMPL DI, $0x00010000 | ||
2100 | JB three_bytes_match_emit_encodeBlockAsm4MB | ||
2101 | MOVL DI, R9 | ||
2102 | SHRL $0x10, R9 | ||
2103 | MOVB $0xf8, (AX) | ||
2104 | MOVW DI, 1(AX) | ||
2105 | MOVB R9, 3(AX) | ||
2106 | ADDQ $0x04, AX | ||
2107 | JMP memmove_long_match_emit_encodeBlockAsm4MB | ||
2108 | |||
2109 | three_bytes_match_emit_encodeBlockAsm4MB: | ||
2110 | MOVB $0xf4, (AX) | ||
2111 | MOVW DI, 1(AX) | ||
2112 | ADDQ $0x03, AX | ||
2113 | JMP memmove_long_match_emit_encodeBlockAsm4MB | ||
2114 | |||
2115 | two_bytes_match_emit_encodeBlockAsm4MB: | ||
2116 | MOVB $0xf0, (AX) | ||
2117 | MOVB DI, 1(AX) | ||
2118 | ADDQ $0x02, AX | ||
2119 | CMPL DI, $0x40 | ||
2120 | JB memmove_match_emit_encodeBlockAsm4MB | ||
2121 | JMP memmove_long_match_emit_encodeBlockAsm4MB | ||
2122 | |||
2123 | one_byte_match_emit_encodeBlockAsm4MB: | ||
2124 | SHLB $0x02, DI | ||
2125 | MOVB DI, (AX) | ||
2126 | ADDQ $0x01, AX | ||
2127 | |||
2128 | memmove_match_emit_encodeBlockAsm4MB: | ||
2129 | LEAQ (AX)(R8*1), DI | ||
2130 | |||
2131 | // genMemMoveShort | ||
2132 | CMPQ R8, $0x08 | ||
2133 | JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 | ||
2134 | CMPQ R8, $0x10 | ||
2135 | JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 | ||
2136 | CMPQ R8, $0x20 | ||
2137 | JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 | ||
2138 | JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 | ||
2139 | |||
2140 | emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: | ||
2141 | MOVQ (SI), R9 | ||
2142 | MOVQ R9, (AX) | ||
2143 | JMP memmove_end_copy_match_emit_encodeBlockAsm4MB | ||
2144 | |||
2145 | emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: | ||
2146 | MOVQ (SI), R9 | ||
2147 | MOVQ -8(SI)(R8*1), SI | ||
2148 | MOVQ R9, (AX) | ||
2149 | MOVQ SI, -8(AX)(R8*1) | ||
2150 | JMP memmove_end_copy_match_emit_encodeBlockAsm4MB | ||
2151 | |||
2152 | emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: | ||
2153 | MOVOU (SI), X0 | ||
2154 | MOVOU -16(SI)(R8*1), X1 | ||
2155 | MOVOU X0, (AX) | ||
2156 | MOVOU X1, -16(AX)(R8*1) | ||
2157 | JMP memmove_end_copy_match_emit_encodeBlockAsm4MB | ||
2158 | |||
2159 | emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: | ||
2160 | MOVOU (SI), X0 | ||
2161 | MOVOU 16(SI), X1 | ||
2162 | MOVOU -32(SI)(R8*1), X2 | ||
2163 | MOVOU -16(SI)(R8*1), X3 | ||
2164 | MOVOU X0, (AX) | ||
2165 | MOVOU X1, 16(AX) | ||
2166 | MOVOU X2, -32(AX)(R8*1) | ||
2167 | MOVOU X3, -16(AX)(R8*1) | ||
2168 | |||
2169 | memmove_end_copy_match_emit_encodeBlockAsm4MB: | ||
2170 | MOVQ DI, AX | ||
2171 | JMP emit_literal_done_match_emit_encodeBlockAsm4MB | ||
2172 | |||
2173 | memmove_long_match_emit_encodeBlockAsm4MB: | ||
2174 | LEAQ (AX)(R8*1), DI | ||
2175 | |||
2176 | // genMemMoveLong | ||
2177 | MOVOU (SI), X0 | ||
2178 | MOVOU 16(SI), X1 | ||
2179 | MOVOU -32(SI)(R8*1), X2 | ||
2180 | MOVOU -16(SI)(R8*1), X3 | ||
2181 | MOVQ R8, R10 | ||
2182 | SHRQ $0x05, R10 | ||
2183 | MOVQ AX, R9 | ||
2184 | ANDL $0x0000001f, R9 | ||
2185 | MOVQ $0x00000040, R11 | ||
2186 | SUBQ R9, R11 | ||
2187 | DECQ R10 | ||
2188 | JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 | ||
2189 | LEAQ -32(SI)(R11*1), R9 | ||
2190 | LEAQ -32(AX)(R11*1), R12 | ||
2191 | |||
2192 | emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: | ||
2193 | MOVOU (R9), X4 | ||
2194 | MOVOU 16(R9), X5 | ||
2195 | MOVOA X4, (R12) | ||
2196 | MOVOA X5, 16(R12) | ||
2197 | ADDQ $0x20, R12 | ||
2198 | ADDQ $0x20, R9 | ||
2199 | ADDQ $0x20, R11 | ||
2200 | DECQ R10 | ||
2201 | JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back | ||
2202 | |||
2203 | emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: | ||
2204 | MOVOU -32(SI)(R11*1), X4 | ||
2205 | MOVOU -16(SI)(R11*1), X5 | ||
2206 | MOVOA X4, -32(AX)(R11*1) | ||
2207 | MOVOA X5, -16(AX)(R11*1) | ||
2208 | ADDQ $0x20, R11 | ||
2209 | CMPQ R8, R11 | ||
2210 | JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 | ||
2211 | MOVOU X0, (AX) | ||
2212 | MOVOU X1, 16(AX) | ||
2213 | MOVOU X2, -32(AX)(R8*1) | ||
2214 | MOVOU X3, -16(AX)(R8*1) | ||
2215 | MOVQ DI, AX | ||
2216 | |||
2217 | emit_literal_done_match_emit_encodeBlockAsm4MB: | ||
2218 | match_nolit_loop_encodeBlockAsm4MB: | ||
2219 | MOVL CX, SI | ||
2220 | SUBL BX, SI | ||
2221 | MOVL SI, 16(SP) | ||
2222 | ADDL $0x04, CX | ||
2223 | ADDL $0x04, BX | ||
2224 | MOVQ src_len+32(FP), SI | ||
2225 | SUBL CX, SI | ||
2226 | LEAQ (DX)(CX*1), DI | ||
2227 | LEAQ (DX)(BX*1), BX | ||
2228 | |||
2229 | // matchLen | ||
2230 | XORL R9, R9 | ||
2231 | |||
2232 | matchlen_loopback_16_match_nolit_encodeBlockAsm4MB: | ||
2233 | CMPL SI, $0x10 | ||
2234 | JB matchlen_match8_match_nolit_encodeBlockAsm4MB | ||
2235 | MOVQ (DI)(R9*1), R8 | ||
2236 | MOVQ 8(DI)(R9*1), R10 | ||
2237 | XORQ (BX)(R9*1), R8 | ||
2238 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB | ||
2239 | XORQ 8(BX)(R9*1), R10 | ||
2240 | JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB | ||
2241 | LEAL -16(SI), SI | ||
2242 | LEAL 16(R9), R9 | ||
2243 | JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB | ||
2244 | |||
2245 | matchlen_bsf_16match_nolit_encodeBlockAsm4MB: | ||
2246 | #ifdef GOAMD64_v3 | ||
2247 | TZCNTQ R10, R10 | ||
2248 | |||
2249 | #else | ||
2250 | BSFQ R10, R10 | ||
2251 | |||
2252 | #endif | ||
2253 | SARQ $0x03, R10 | ||
2254 | LEAL 8(R9)(R10*1), R9 | ||
2255 | JMP match_nolit_end_encodeBlockAsm4MB | ||
2256 | |||
2257 | matchlen_match8_match_nolit_encodeBlockAsm4MB: | ||
2258 | CMPL SI, $0x08 | ||
2259 | JB matchlen_match4_match_nolit_encodeBlockAsm4MB | ||
2260 | MOVQ (DI)(R9*1), R8 | ||
2261 | XORQ (BX)(R9*1), R8 | ||
2262 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB | ||
2263 | LEAL -8(SI), SI | ||
2264 | LEAL 8(R9), R9 | ||
2265 | JMP matchlen_match4_match_nolit_encodeBlockAsm4MB | ||
2266 | |||
2267 | matchlen_bsf_8_match_nolit_encodeBlockAsm4MB: | ||
2268 | #ifdef GOAMD64_v3 | ||
2269 | TZCNTQ R8, R8 | ||
2270 | |||
2271 | #else | ||
2272 | BSFQ R8, R8 | ||
2273 | |||
2274 | #endif | ||
2275 | SARQ $0x03, R8 | ||
2276 | LEAL (R9)(R8*1), R9 | ||
2277 | JMP match_nolit_end_encodeBlockAsm4MB | ||
2278 | |||
2279 | matchlen_match4_match_nolit_encodeBlockAsm4MB: | ||
2280 | CMPL SI, $0x04 | ||
2281 | JB matchlen_match2_match_nolit_encodeBlockAsm4MB | ||
2282 | MOVL (DI)(R9*1), R8 | ||
2283 | CMPL (BX)(R9*1), R8 | ||
2284 | JNE matchlen_match2_match_nolit_encodeBlockAsm4MB | ||
2285 | LEAL -4(SI), SI | ||
2286 | LEAL 4(R9), R9 | ||
2287 | |||
2288 | matchlen_match2_match_nolit_encodeBlockAsm4MB: | ||
2289 | CMPL SI, $0x01 | ||
2290 | JE matchlen_match1_match_nolit_encodeBlockAsm4MB | ||
2291 | JB match_nolit_end_encodeBlockAsm4MB | ||
2292 | MOVW (DI)(R9*1), R8 | ||
2293 | CMPW (BX)(R9*1), R8 | ||
2294 | JNE matchlen_match1_match_nolit_encodeBlockAsm4MB | ||
2295 | LEAL 2(R9), R9 | ||
2296 | SUBL $0x02, SI | ||
2297 | JZ match_nolit_end_encodeBlockAsm4MB | ||
2298 | |||
2299 | matchlen_match1_match_nolit_encodeBlockAsm4MB: | ||
2300 | MOVB (DI)(R9*1), R8 | ||
2301 | CMPB (BX)(R9*1), R8 | ||
2302 | JNE match_nolit_end_encodeBlockAsm4MB | ||
2303 | LEAL 1(R9), R9 | ||
2304 | |||
2305 | match_nolit_end_encodeBlockAsm4MB: | ||
2306 | ADDL R9, CX | ||
2307 | MOVL 16(SP), BX | ||
2308 | ADDL $0x04, R9 | ||
2309 | MOVL CX, 12(SP) | ||
2310 | |||
2311 | // emitCopy | ||
2312 | CMPL BX, $0x00010000 | ||
2313 | JB two_byte_offset_match_nolit_encodeBlockAsm4MB | ||
2314 | CMPL R9, $0x40 | ||
2315 | JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB | ||
2316 | MOVB $0xff, (AX) | ||
2317 | MOVL BX, 1(AX) | ||
2318 | LEAL -64(R9), R9 | ||
2319 | ADDQ $0x05, AX | ||
2320 | CMPL R9, $0x04 | ||
2321 | JB four_bytes_remain_match_nolit_encodeBlockAsm4MB | ||
2322 | |||
2323 | // emitRepeat | ||
2324 | MOVL R9, SI | ||
2325 | LEAL -4(R9), R9 | ||
2326 | CMPL SI, $0x08 | ||
2327 | JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy | ||
2328 | CMPL SI, $0x0c | ||
2329 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy | ||
2330 | CMPL BX, $0x00000800 | ||
2331 | JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy | ||
2332 | |||
2333 | cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: | ||
2334 | CMPL R9, $0x00000104 | ||
2335 | JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy | ||
2336 | CMPL R9, $0x00010100 | ||
2337 | JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy | ||
2338 | LEAL -65536(R9), R9 | ||
2339 | MOVL R9, BX | ||
2340 | MOVW $0x001d, (AX) | ||
2341 | MOVW R9, 2(AX) | ||
2342 | SARL $0x10, BX | ||
2343 | MOVB BL, 4(AX) | ||
2344 | ADDQ $0x05, AX | ||
2345 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2346 | |||
2347 | repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: | ||
2348 | LEAL -256(R9), R9 | ||
2349 | MOVW $0x0019, (AX) | ||
2350 | MOVW R9, 2(AX) | ||
2351 | ADDQ $0x04, AX | ||
2352 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2353 | |||
2354 | repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: | ||
2355 | LEAL -4(R9), R9 | ||
2356 | MOVW $0x0015, (AX) | ||
2357 | MOVB R9, 2(AX) | ||
2358 | ADDQ $0x03, AX | ||
2359 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2360 | |||
2361 | repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: | ||
2362 | SHLL $0x02, R9 | ||
2363 | ORL $0x01, R9 | ||
2364 | MOVW R9, (AX) | ||
2365 | ADDQ $0x02, AX | ||
2366 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2367 | |||
2368 | repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: | ||
2369 | XORQ SI, SI | ||
2370 | LEAL 1(SI)(R9*4), R9 | ||
2371 | MOVB BL, 1(AX) | ||
2372 | SARL $0x08, BX | ||
2373 | SHLL $0x05, BX | ||
2374 | ORL BX, R9 | ||
2375 | MOVB R9, (AX) | ||
2376 | ADDQ $0x02, AX | ||
2377 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2378 | |||
2379 | four_bytes_remain_match_nolit_encodeBlockAsm4MB: | ||
2380 | TESTL R9, R9 | ||
2381 | JZ match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2382 | XORL SI, SI | ||
2383 | LEAL -1(SI)(R9*4), R9 | ||
2384 | MOVB R9, (AX) | ||
2385 | MOVL BX, 1(AX) | ||
2386 | ADDQ $0x05, AX | ||
2387 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2388 | |||
2389 | two_byte_offset_match_nolit_encodeBlockAsm4MB: | ||
2390 | CMPL R9, $0x40 | ||
2391 | JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB | ||
2392 | CMPL BX, $0x00000800 | ||
2393 | JAE long_offset_short_match_nolit_encodeBlockAsm4MB | ||
2394 | MOVL $0x00000001, SI | ||
2395 | LEAL 16(SI), SI | ||
2396 | MOVB BL, 1(AX) | ||
2397 | SHRL $0x08, BX | ||
2398 | SHLL $0x05, BX | ||
2399 | ORL BX, SI | ||
2400 | MOVB SI, (AX) | ||
2401 | ADDQ $0x02, AX | ||
2402 | SUBL $0x08, R9 | ||
2403 | |||
2404 | // emitRepeat | ||
2405 | LEAL -4(R9), R9 | ||
2406 | JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b | ||
2407 | MOVL R9, SI | ||
2408 | LEAL -4(R9), R9 | ||
2409 | CMPL SI, $0x08 | ||
2410 | JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b | ||
2411 | CMPL SI, $0x0c | ||
2412 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b | ||
2413 | CMPL BX, $0x00000800 | ||
2414 | JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b | ||
2415 | |||
2416 | cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: | ||
2417 | CMPL R9, $0x00000104 | ||
2418 | JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b | ||
2419 | CMPL R9, $0x00010100 | ||
2420 | JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b | ||
2421 | LEAL -65536(R9), R9 | ||
2422 | MOVL R9, BX | ||
2423 | MOVW $0x001d, (AX) | ||
2424 | MOVW R9, 2(AX) | ||
2425 | SARL $0x10, BX | ||
2426 | MOVB BL, 4(AX) | ||
2427 | ADDQ $0x05, AX | ||
2428 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2429 | |||
2430 | repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: | ||
2431 | LEAL -256(R9), R9 | ||
2432 | MOVW $0x0019, (AX) | ||
2433 | MOVW R9, 2(AX) | ||
2434 | ADDQ $0x04, AX | ||
2435 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2436 | |||
2437 | repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: | ||
2438 | LEAL -4(R9), R9 | ||
2439 | MOVW $0x0015, (AX) | ||
2440 | MOVB R9, 2(AX) | ||
2441 | ADDQ $0x03, AX | ||
2442 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2443 | |||
2444 | repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: | ||
2445 | SHLL $0x02, R9 | ||
2446 | ORL $0x01, R9 | ||
2447 | MOVW R9, (AX) | ||
2448 | ADDQ $0x02, AX | ||
2449 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2450 | |||
2451 | repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: | ||
2452 | XORQ SI, SI | ||
2453 | LEAL 1(SI)(R9*4), R9 | ||
2454 | MOVB BL, 1(AX) | ||
2455 | SARL $0x08, BX | ||
2456 | SHLL $0x05, BX | ||
2457 | ORL BX, R9 | ||
2458 | MOVB R9, (AX) | ||
2459 | ADDQ $0x02, AX | ||
2460 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2461 | |||
2462 | long_offset_short_match_nolit_encodeBlockAsm4MB: | ||
2463 | MOVB $0xee, (AX) | ||
2464 | MOVW BX, 1(AX) | ||
2465 | LEAL -60(R9), R9 | ||
2466 | ADDQ $0x03, AX | ||
2467 | |||
2468 | // emitRepeat | ||
2469 | MOVL R9, SI | ||
2470 | LEAL -4(R9), R9 | ||
2471 | CMPL SI, $0x08 | ||
2472 | JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short | ||
2473 | CMPL SI, $0x0c | ||
2474 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short | ||
2475 | CMPL BX, $0x00000800 | ||
2476 | JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short | ||
2477 | |||
2478 | cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: | ||
2479 | CMPL R9, $0x00000104 | ||
2480 | JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short | ||
2481 | CMPL R9, $0x00010100 | ||
2482 | JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short | ||
2483 | LEAL -65536(R9), R9 | ||
2484 | MOVL R9, BX | ||
2485 | MOVW $0x001d, (AX) | ||
2486 | MOVW R9, 2(AX) | ||
2487 | SARL $0x10, BX | ||
2488 | MOVB BL, 4(AX) | ||
2489 | ADDQ $0x05, AX | ||
2490 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2491 | |||
2492 | repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: | ||
2493 | LEAL -256(R9), R9 | ||
2494 | MOVW $0x0019, (AX) | ||
2495 | MOVW R9, 2(AX) | ||
2496 | ADDQ $0x04, AX | ||
2497 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2498 | |||
2499 | repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: | ||
2500 | LEAL -4(R9), R9 | ||
2501 | MOVW $0x0015, (AX) | ||
2502 | MOVB R9, 2(AX) | ||
2503 | ADDQ $0x03, AX | ||
2504 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2505 | |||
2506 | repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: | ||
2507 | SHLL $0x02, R9 | ||
2508 | ORL $0x01, R9 | ||
2509 | MOVW R9, (AX) | ||
2510 | ADDQ $0x02, AX | ||
2511 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2512 | |||
2513 | repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: | ||
2514 | XORQ SI, SI | ||
2515 | LEAL 1(SI)(R9*4), R9 | ||
2516 | MOVB BL, 1(AX) | ||
2517 | SARL $0x08, BX | ||
2518 | SHLL $0x05, BX | ||
2519 | ORL BX, R9 | ||
2520 | MOVB R9, (AX) | ||
2521 | ADDQ $0x02, AX | ||
2522 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2523 | |||
2524 | two_byte_offset_short_match_nolit_encodeBlockAsm4MB: | ||
2525 | MOVL R9, SI | ||
2526 | SHLL $0x02, SI | ||
2527 | CMPL R9, $0x0c | ||
2528 | JAE emit_copy_three_match_nolit_encodeBlockAsm4MB | ||
2529 | CMPL BX, $0x00000800 | ||
2530 | JAE emit_copy_three_match_nolit_encodeBlockAsm4MB | ||
2531 | LEAL -15(SI), SI | ||
2532 | MOVB BL, 1(AX) | ||
2533 | SHRL $0x08, BX | ||
2534 | SHLL $0x05, BX | ||
2535 | ORL BX, SI | ||
2536 | MOVB SI, (AX) | ||
2537 | ADDQ $0x02, AX | ||
2538 | JMP match_nolit_emitcopy_end_encodeBlockAsm4MB | ||
2539 | |||
2540 | emit_copy_three_match_nolit_encodeBlockAsm4MB: | ||
2541 | LEAL -2(SI), SI | ||
2542 | MOVB SI, (AX) | ||
2543 | MOVW BX, 1(AX) | ||
2544 | ADDQ $0x03, AX | ||
2545 | |||
2546 | match_nolit_emitcopy_end_encodeBlockAsm4MB: | ||
2547 | CMPL CX, 8(SP) | ||
2548 | JAE emit_remainder_encodeBlockAsm4MB | ||
2549 | MOVQ -2(DX)(CX*1), SI | ||
2550 | CMPQ AX, (SP) | ||
2551 | JB match_nolit_dst_ok_encodeBlockAsm4MB | ||
2552 | MOVQ $0x00000000, ret+48(FP) | ||
2553 | RET | ||
2554 | |||
2555 | match_nolit_dst_ok_encodeBlockAsm4MB: | ||
2556 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
2557 | MOVQ SI, DI | ||
2558 | SHRQ $0x10, SI | ||
2559 | MOVQ SI, BX | ||
2560 | SHLQ $0x10, DI | ||
2561 | IMULQ R8, DI | ||
2562 | SHRQ $0x32, DI | ||
2563 | SHLQ $0x10, BX | ||
2564 | IMULQ R8, BX | ||
2565 | SHRQ $0x32, BX | ||
2566 | LEAL -2(CX), R8 | ||
2567 | LEAQ 24(SP)(BX*4), R9 | ||
2568 | MOVL (R9), BX | ||
2569 | MOVL R8, 24(SP)(DI*4) | ||
2570 | MOVL CX, (R9) | ||
2571 | CMPL (DX)(BX*1), SI | ||
2572 | JEQ match_nolit_loop_encodeBlockAsm4MB | ||
2573 | INCL CX | ||
2574 | JMP search_loop_encodeBlockAsm4MB | ||
2575 | |||
2576 | emit_remainder_encodeBlockAsm4MB: | ||
2577 | MOVQ src_len+32(FP), CX | ||
2578 | SUBL 12(SP), CX | ||
2579 | LEAQ 4(AX)(CX*1), CX | ||
2580 | CMPQ CX, (SP) | ||
2581 | JB emit_remainder_ok_encodeBlockAsm4MB | ||
2582 | MOVQ $0x00000000, ret+48(FP) | ||
2583 | RET | ||
2584 | |||
2585 | emit_remainder_ok_encodeBlockAsm4MB: | ||
2586 | MOVQ src_len+32(FP), CX | ||
2587 | MOVL 12(SP), BX | ||
2588 | CMPL BX, CX | ||
2589 | JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB | ||
2590 | MOVL CX, SI | ||
2591 | MOVL CX, 12(SP) | ||
2592 | LEAQ (DX)(BX*1), CX | ||
2593 | SUBL BX, SI | ||
2594 | LEAL -1(SI), DX | ||
2595 | CMPL DX, $0x3c | ||
2596 | JB one_byte_emit_remainder_encodeBlockAsm4MB | ||
2597 | CMPL DX, $0x00000100 | ||
2598 | JB two_bytes_emit_remainder_encodeBlockAsm4MB | ||
2599 | CMPL DX, $0x00010000 | ||
2600 | JB three_bytes_emit_remainder_encodeBlockAsm4MB | ||
2601 | MOVL DX, BX | ||
2602 | SHRL $0x10, BX | ||
2603 | MOVB $0xf8, (AX) | ||
2604 | MOVW DX, 1(AX) | ||
2605 | MOVB BL, 3(AX) | ||
2606 | ADDQ $0x04, AX | ||
2607 | JMP memmove_long_emit_remainder_encodeBlockAsm4MB | ||
2608 | |||
2609 | three_bytes_emit_remainder_encodeBlockAsm4MB: | ||
2610 | MOVB $0xf4, (AX) | ||
2611 | MOVW DX, 1(AX) | ||
2612 | ADDQ $0x03, AX | ||
2613 | JMP memmove_long_emit_remainder_encodeBlockAsm4MB | ||
2614 | |||
2615 | two_bytes_emit_remainder_encodeBlockAsm4MB: | ||
2616 | MOVB $0xf0, (AX) | ||
2617 | MOVB DL, 1(AX) | ||
2618 | ADDQ $0x02, AX | ||
2619 | CMPL DX, $0x40 | ||
2620 | JB memmove_emit_remainder_encodeBlockAsm4MB | ||
2621 | JMP memmove_long_emit_remainder_encodeBlockAsm4MB | ||
2622 | |||
2623 | one_byte_emit_remainder_encodeBlockAsm4MB: | ||
2624 | SHLB $0x02, DL | ||
2625 | MOVB DL, (AX) | ||
2626 | ADDQ $0x01, AX | ||
2627 | |||
2628 | memmove_emit_remainder_encodeBlockAsm4MB: | ||
2629 | LEAQ (AX)(SI*1), DX | ||
2630 | MOVL SI, BX | ||
2631 | |||
2632 | // genMemMoveShort | ||
2633 | CMPQ BX, $0x03 | ||
2634 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 | ||
2635 | JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 | ||
2636 | CMPQ BX, $0x08 | ||
2637 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 | ||
2638 | CMPQ BX, $0x10 | ||
2639 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 | ||
2640 | CMPQ BX, $0x20 | ||
2641 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 | ||
2642 | JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 | ||
2643 | |||
2644 | emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: | ||
2645 | MOVB (CX), SI | ||
2646 | MOVB -1(CX)(BX*1), CL | ||
2647 | MOVB SI, (AX) | ||
2648 | MOVB CL, -1(AX)(BX*1) | ||
2649 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB | ||
2650 | |||
2651 | emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: | ||
2652 | MOVW (CX), SI | ||
2653 | MOVB 2(CX), CL | ||
2654 | MOVW SI, (AX) | ||
2655 | MOVB CL, 2(AX) | ||
2656 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB | ||
2657 | |||
2658 | emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: | ||
2659 | MOVL (CX), SI | ||
2660 | MOVL -4(CX)(BX*1), CX | ||
2661 | MOVL SI, (AX) | ||
2662 | MOVL CX, -4(AX)(BX*1) | ||
2663 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB | ||
2664 | |||
2665 | emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: | ||
2666 | MOVQ (CX), SI | ||
2667 | MOVQ -8(CX)(BX*1), CX | ||
2668 | MOVQ SI, (AX) | ||
2669 | MOVQ CX, -8(AX)(BX*1) | ||
2670 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB | ||
2671 | |||
2672 | emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: | ||
2673 | MOVOU (CX), X0 | ||
2674 | MOVOU -16(CX)(BX*1), X1 | ||
2675 | MOVOU X0, (AX) | ||
2676 | MOVOU X1, -16(AX)(BX*1) | ||
2677 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB | ||
2678 | |||
2679 | emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: | ||
2680 | MOVOU (CX), X0 | ||
2681 | MOVOU 16(CX), X1 | ||
2682 | MOVOU -32(CX)(BX*1), X2 | ||
2683 | MOVOU -16(CX)(BX*1), X3 | ||
2684 | MOVOU X0, (AX) | ||
2685 | MOVOU X1, 16(AX) | ||
2686 | MOVOU X2, -32(AX)(BX*1) | ||
2687 | MOVOU X3, -16(AX)(BX*1) | ||
2688 | |||
2689 | memmove_end_copy_emit_remainder_encodeBlockAsm4MB: | ||
2690 | MOVQ DX, AX | ||
2691 | JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB | ||
2692 | |||
2693 | memmove_long_emit_remainder_encodeBlockAsm4MB: | ||
2694 | LEAQ (AX)(SI*1), DX | ||
2695 | MOVL SI, BX | ||
2696 | |||
2697 | // genMemMoveLong | ||
2698 | MOVOU (CX), X0 | ||
2699 | MOVOU 16(CX), X1 | ||
2700 | MOVOU -32(CX)(BX*1), X2 | ||
2701 | MOVOU -16(CX)(BX*1), X3 | ||
2702 | MOVQ BX, DI | ||
2703 | SHRQ $0x05, DI | ||
2704 | MOVQ AX, SI | ||
2705 | ANDL $0x0000001f, SI | ||
2706 | MOVQ $0x00000040, R8 | ||
2707 | SUBQ SI, R8 | ||
2708 | DECQ DI | ||
2709 | JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 | ||
2710 | LEAQ -32(CX)(R8*1), SI | ||
2711 | LEAQ -32(AX)(R8*1), R9 | ||
2712 | |||
2713 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: | ||
2714 | MOVOU (SI), X4 | ||
2715 | MOVOU 16(SI), X5 | ||
2716 | MOVOA X4, (R9) | ||
2717 | MOVOA X5, 16(R9) | ||
2718 | ADDQ $0x20, R9 | ||
2719 | ADDQ $0x20, SI | ||
2720 | ADDQ $0x20, R8 | ||
2721 | DECQ DI | ||
2722 | JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back | ||
2723 | |||
2724 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: | ||
2725 | MOVOU -32(CX)(R8*1), X4 | ||
2726 | MOVOU -16(CX)(R8*1), X5 | ||
2727 | MOVOA X4, -32(AX)(R8*1) | ||
2728 | MOVOA X5, -16(AX)(R8*1) | ||
2729 | ADDQ $0x20, R8 | ||
2730 | CMPQ BX, R8 | ||
2731 | JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 | ||
2732 | MOVOU X0, (AX) | ||
2733 | MOVOU X1, 16(AX) | ||
2734 | MOVOU X2, -32(AX)(BX*1) | ||
2735 | MOVOU X3, -16(AX)(BX*1) | ||
2736 | MOVQ DX, AX | ||
2737 | |||
2738 | emit_literal_done_emit_remainder_encodeBlockAsm4MB: | ||
2739 | MOVQ dst_base+0(FP), CX | ||
2740 | SUBQ CX, AX | ||
2741 | MOVQ AX, ret+48(FP) | ||
2742 | RET | ||
2743 | |||
2744 | // func encodeBlockAsm12B(dst []byte, src []byte) int | ||
2745 | // Requires: BMI, SSE2 | ||
2746 | TEXT ·encodeBlockAsm12B(SB), $16408-56 | ||
2747 | MOVQ dst_base+0(FP), AX | ||
2748 | MOVQ $0x00000080, CX | ||
2749 | LEAQ 24(SP), DX | ||
2750 | PXOR X0, X0 | ||
2751 | |||
2752 | zero_loop_encodeBlockAsm12B: | ||
2753 | MOVOU X0, (DX) | ||
2754 | MOVOU X0, 16(DX) | ||
2755 | MOVOU X0, 32(DX) | ||
2756 | MOVOU X0, 48(DX) | ||
2757 | MOVOU X0, 64(DX) | ||
2758 | MOVOU X0, 80(DX) | ||
2759 | MOVOU X0, 96(DX) | ||
2760 | MOVOU X0, 112(DX) | ||
2761 | ADDQ $0x80, DX | ||
2762 | DECQ CX | ||
2763 | JNZ zero_loop_encodeBlockAsm12B | ||
2764 | MOVL $0x00000000, 12(SP) | ||
2765 | MOVQ src_len+32(FP), CX | ||
2766 | LEAQ -9(CX), DX | ||
2767 | LEAQ -8(CX), BX | ||
2768 | MOVL BX, 8(SP) | ||
2769 | SHRQ $0x05, CX | ||
2770 | SUBL CX, DX | ||
2771 | LEAQ (AX)(DX*1), DX | ||
2772 | MOVQ DX, (SP) | ||
2773 | MOVL $0x00000001, CX | ||
2774 | MOVL CX, 16(SP) | ||
2775 | MOVQ src_base+24(FP), DX | ||
2776 | |||
2777 | search_loop_encodeBlockAsm12B: | ||
2778 | MOVL CX, BX | ||
2779 | SUBL 12(SP), BX | ||
2780 | SHRL $0x05, BX | ||
2781 | LEAL 4(CX)(BX*1), BX | ||
2782 | CMPL BX, 8(SP) | ||
2783 | JAE emit_remainder_encodeBlockAsm12B | ||
2784 | MOVQ (DX)(CX*1), SI | ||
2785 | MOVL BX, 20(SP) | ||
2786 | MOVQ $0x000000cf1bbcdcbb, R8 | ||
2787 | MOVQ SI, R9 | ||
2788 | MOVQ SI, R10 | ||
2789 | SHRQ $0x08, R10 | ||
2790 | SHLQ $0x18, R9 | ||
2791 | IMULQ R8, R9 | ||
2792 | SHRQ $0x34, R9 | ||
2793 | SHLQ $0x18, R10 | ||
2794 | IMULQ R8, R10 | ||
2795 | SHRQ $0x34, R10 | ||
2796 | MOVL 24(SP)(R9*4), BX | ||
2797 | MOVL 24(SP)(R10*4), DI | ||
2798 | MOVL CX, 24(SP)(R9*4) | ||
2799 | LEAL 1(CX), R9 | ||
2800 | MOVL R9, 24(SP)(R10*4) | ||
2801 | MOVQ SI, R9 | ||
2802 | SHRQ $0x10, R9 | ||
2803 | SHLQ $0x18, R9 | ||
2804 | IMULQ R8, R9 | ||
2805 | SHRQ $0x34, R9 | ||
2806 | MOVL CX, R8 | ||
2807 | SUBL 16(SP), R8 | ||
2808 | MOVL 1(DX)(R8*1), R10 | ||
2809 | MOVQ SI, R8 | ||
2810 | SHRQ $0x08, R8 | ||
2811 | CMPL R8, R10 | ||
2812 | JNE no_repeat_found_encodeBlockAsm12B | ||
2813 | LEAL 1(CX), SI | ||
2814 | MOVL 12(SP), DI | ||
2815 | MOVL SI, BX | ||
2816 | SUBL 16(SP), BX | ||
2817 | JZ repeat_extend_back_end_encodeBlockAsm12B | ||
2818 | |||
2819 | repeat_extend_back_loop_encodeBlockAsm12B: | ||
2820 | CMPL SI, DI | ||
2821 | JBE repeat_extend_back_end_encodeBlockAsm12B | ||
2822 | MOVB -1(DX)(BX*1), R8 | ||
2823 | MOVB -1(DX)(SI*1), R9 | ||
2824 | CMPB R8, R9 | ||
2825 | JNE repeat_extend_back_end_encodeBlockAsm12B | ||
2826 | LEAL -1(SI), SI | ||
2827 | DECL BX | ||
2828 | JNZ repeat_extend_back_loop_encodeBlockAsm12B | ||
2829 | |||
2830 | repeat_extend_back_end_encodeBlockAsm12B: | ||
2831 | MOVL 12(SP), BX | ||
2832 | CMPL BX, SI | ||
2833 | JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B | ||
2834 | MOVL SI, R8 | ||
2835 | MOVL SI, 12(SP) | ||
2836 | LEAQ (DX)(BX*1), R9 | ||
2837 | SUBL BX, R8 | ||
2838 | LEAL -1(R8), BX | ||
2839 | CMPL BX, $0x3c | ||
2840 | JB one_byte_repeat_emit_encodeBlockAsm12B | ||
2841 | CMPL BX, $0x00000100 | ||
2842 | JB two_bytes_repeat_emit_encodeBlockAsm12B | ||
2843 | JB three_bytes_repeat_emit_encodeBlockAsm12B | ||
2844 | |||
2845 | three_bytes_repeat_emit_encodeBlockAsm12B: | ||
2846 | MOVB $0xf4, (AX) | ||
2847 | MOVW BX, 1(AX) | ||
2848 | ADDQ $0x03, AX | ||
2849 | JMP memmove_long_repeat_emit_encodeBlockAsm12B | ||
2850 | |||
2851 | two_bytes_repeat_emit_encodeBlockAsm12B: | ||
2852 | MOVB $0xf0, (AX) | ||
2853 | MOVB BL, 1(AX) | ||
2854 | ADDQ $0x02, AX | ||
2855 | CMPL BX, $0x40 | ||
2856 | JB memmove_repeat_emit_encodeBlockAsm12B | ||
2857 | JMP memmove_long_repeat_emit_encodeBlockAsm12B | ||
2858 | |||
2859 | one_byte_repeat_emit_encodeBlockAsm12B: | ||
2860 | SHLB $0x02, BL | ||
2861 | MOVB BL, (AX) | ||
2862 | ADDQ $0x01, AX | ||
2863 | |||
2864 | memmove_repeat_emit_encodeBlockAsm12B: | ||
2865 | LEAQ (AX)(R8*1), BX | ||
2866 | |||
2867 | // genMemMoveShort | ||
2868 | CMPQ R8, $0x08 | ||
2869 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 | ||
2870 | CMPQ R8, $0x10 | ||
2871 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 | ||
2872 | CMPQ R8, $0x20 | ||
2873 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 | ||
2874 | JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 | ||
2875 | |||
2876 | emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: | ||
2877 | MOVQ (R9), R10 | ||
2878 | MOVQ R10, (AX) | ||
2879 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B | ||
2880 | |||
2881 | emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: | ||
2882 | MOVQ (R9), R10 | ||
2883 | MOVQ -8(R9)(R8*1), R9 | ||
2884 | MOVQ R10, (AX) | ||
2885 | MOVQ R9, -8(AX)(R8*1) | ||
2886 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B | ||
2887 | |||
2888 | emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: | ||
2889 | MOVOU (R9), X0 | ||
2890 | MOVOU -16(R9)(R8*1), X1 | ||
2891 | MOVOU X0, (AX) | ||
2892 | MOVOU X1, -16(AX)(R8*1) | ||
2893 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B | ||
2894 | |||
2895 | emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: | ||
2896 | MOVOU (R9), X0 | ||
2897 | MOVOU 16(R9), X1 | ||
2898 | MOVOU -32(R9)(R8*1), X2 | ||
2899 | MOVOU -16(R9)(R8*1), X3 | ||
2900 | MOVOU X0, (AX) | ||
2901 | MOVOU X1, 16(AX) | ||
2902 | MOVOU X2, -32(AX)(R8*1) | ||
2903 | MOVOU X3, -16(AX)(R8*1) | ||
2904 | |||
2905 | memmove_end_copy_repeat_emit_encodeBlockAsm12B: | ||
2906 | MOVQ BX, AX | ||
2907 | JMP emit_literal_done_repeat_emit_encodeBlockAsm12B | ||
2908 | |||
2909 | memmove_long_repeat_emit_encodeBlockAsm12B: | ||
2910 | LEAQ (AX)(R8*1), BX | ||
2911 | |||
2912 | // genMemMoveLong | ||
2913 | MOVOU (R9), X0 | ||
2914 | MOVOU 16(R9), X1 | ||
2915 | MOVOU -32(R9)(R8*1), X2 | ||
2916 | MOVOU -16(R9)(R8*1), X3 | ||
2917 | MOVQ R8, R11 | ||
2918 | SHRQ $0x05, R11 | ||
2919 | MOVQ AX, R10 | ||
2920 | ANDL $0x0000001f, R10 | ||
2921 | MOVQ $0x00000040, R12 | ||
2922 | SUBQ R10, R12 | ||
2923 | DECQ R11 | ||
2924 | JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 | ||
2925 | LEAQ -32(R9)(R12*1), R10 | ||
2926 | LEAQ -32(AX)(R12*1), R13 | ||
2927 | |||
2928 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: | ||
2929 | MOVOU (R10), X4 | ||
2930 | MOVOU 16(R10), X5 | ||
2931 | MOVOA X4, (R13) | ||
2932 | MOVOA X5, 16(R13) | ||
2933 | ADDQ $0x20, R13 | ||
2934 | ADDQ $0x20, R10 | ||
2935 | ADDQ $0x20, R12 | ||
2936 | DECQ R11 | ||
2937 | JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back | ||
2938 | |||
2939 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: | ||
2940 | MOVOU -32(R9)(R12*1), X4 | ||
2941 | MOVOU -16(R9)(R12*1), X5 | ||
2942 | MOVOA X4, -32(AX)(R12*1) | ||
2943 | MOVOA X5, -16(AX)(R12*1) | ||
2944 | ADDQ $0x20, R12 | ||
2945 | CMPQ R8, R12 | ||
2946 | JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 | ||
2947 | MOVOU X0, (AX) | ||
2948 | MOVOU X1, 16(AX) | ||
2949 | MOVOU X2, -32(AX)(R8*1) | ||
2950 | MOVOU X3, -16(AX)(R8*1) | ||
2951 | MOVQ BX, AX | ||
2952 | |||
2953 | emit_literal_done_repeat_emit_encodeBlockAsm12B: | ||
2954 | ADDL $0x05, CX | ||
2955 | MOVL CX, BX | ||
2956 | SUBL 16(SP), BX | ||
2957 | MOVQ src_len+32(FP), R8 | ||
2958 | SUBL CX, R8 | ||
2959 | LEAQ (DX)(CX*1), R9 | ||
2960 | LEAQ (DX)(BX*1), BX | ||
2961 | |||
2962 | // matchLen | ||
2963 | XORL R11, R11 | ||
2964 | |||
2965 | matchlen_loopback_16_repeat_extend_encodeBlockAsm12B: | ||
2966 | CMPL R8, $0x10 | ||
2967 | JB matchlen_match8_repeat_extend_encodeBlockAsm12B | ||
2968 | MOVQ (R9)(R11*1), R10 | ||
2969 | MOVQ 8(R9)(R11*1), R12 | ||
2970 | XORQ (BX)(R11*1), R10 | ||
2971 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B | ||
2972 | XORQ 8(BX)(R11*1), R12 | ||
2973 | JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B | ||
2974 | LEAL -16(R8), R8 | ||
2975 | LEAL 16(R11), R11 | ||
2976 | JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B | ||
2977 | |||
2978 | matchlen_bsf_16repeat_extend_encodeBlockAsm12B: | ||
2979 | #ifdef GOAMD64_v3 | ||
2980 | TZCNTQ R12, R12 | ||
2981 | |||
2982 | #else | ||
2983 | BSFQ R12, R12 | ||
2984 | |||
2985 | #endif | ||
2986 | SARQ $0x03, R12 | ||
2987 | LEAL 8(R11)(R12*1), R11 | ||
2988 | JMP repeat_extend_forward_end_encodeBlockAsm12B | ||
2989 | |||
2990 | matchlen_match8_repeat_extend_encodeBlockAsm12B: | ||
2991 | CMPL R8, $0x08 | ||
2992 | JB matchlen_match4_repeat_extend_encodeBlockAsm12B | ||
2993 | MOVQ (R9)(R11*1), R10 | ||
2994 | XORQ (BX)(R11*1), R10 | ||
2995 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B | ||
2996 | LEAL -8(R8), R8 | ||
2997 | LEAL 8(R11), R11 | ||
2998 | JMP matchlen_match4_repeat_extend_encodeBlockAsm12B | ||
2999 | |||
3000 | matchlen_bsf_8_repeat_extend_encodeBlockAsm12B: | ||
3001 | #ifdef GOAMD64_v3 | ||
3002 | TZCNTQ R10, R10 | ||
3003 | |||
3004 | #else | ||
3005 | BSFQ R10, R10 | ||
3006 | |||
3007 | #endif | ||
3008 | SARQ $0x03, R10 | ||
3009 | LEAL (R11)(R10*1), R11 | ||
3010 | JMP repeat_extend_forward_end_encodeBlockAsm12B | ||
3011 | |||
3012 | matchlen_match4_repeat_extend_encodeBlockAsm12B: | ||
3013 | CMPL R8, $0x04 | ||
3014 | JB matchlen_match2_repeat_extend_encodeBlockAsm12B | ||
3015 | MOVL (R9)(R11*1), R10 | ||
3016 | CMPL (BX)(R11*1), R10 | ||
3017 | JNE matchlen_match2_repeat_extend_encodeBlockAsm12B | ||
3018 | LEAL -4(R8), R8 | ||
3019 | LEAL 4(R11), R11 | ||
3020 | |||
3021 | matchlen_match2_repeat_extend_encodeBlockAsm12B: | ||
3022 | CMPL R8, $0x01 | ||
3023 | JE matchlen_match1_repeat_extend_encodeBlockAsm12B | ||
3024 | JB repeat_extend_forward_end_encodeBlockAsm12B | ||
3025 | MOVW (R9)(R11*1), R10 | ||
3026 | CMPW (BX)(R11*1), R10 | ||
3027 | JNE matchlen_match1_repeat_extend_encodeBlockAsm12B | ||
3028 | LEAL 2(R11), R11 | ||
3029 | SUBL $0x02, R8 | ||
3030 | JZ repeat_extend_forward_end_encodeBlockAsm12B | ||
3031 | |||
3032 | matchlen_match1_repeat_extend_encodeBlockAsm12B: | ||
3033 | MOVB (R9)(R11*1), R10 | ||
3034 | CMPB (BX)(R11*1), R10 | ||
3035 | JNE repeat_extend_forward_end_encodeBlockAsm12B | ||
3036 | LEAL 1(R11), R11 | ||
3037 | |||
3038 | repeat_extend_forward_end_encodeBlockAsm12B: | ||
3039 | ADDL R11, CX | ||
3040 | MOVL CX, BX | ||
3041 | SUBL SI, BX | ||
3042 | MOVL 16(SP), SI | ||
3043 | TESTL DI, DI | ||
3044 | JZ repeat_as_copy_encodeBlockAsm12B | ||
3045 | |||
3046 | // emitRepeat | ||
3047 | MOVL BX, DI | ||
3048 | LEAL -4(BX), BX | ||
3049 | CMPL DI, $0x08 | ||
3050 | JBE repeat_two_match_repeat_encodeBlockAsm12B | ||
3051 | CMPL DI, $0x0c | ||
3052 | JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B | ||
3053 | CMPL SI, $0x00000800 | ||
3054 | JB repeat_two_offset_match_repeat_encodeBlockAsm12B | ||
3055 | |||
3056 | cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: | ||
3057 | CMPL BX, $0x00000104 | ||
3058 | JB repeat_three_match_repeat_encodeBlockAsm12B | ||
3059 | LEAL -256(BX), BX | ||
3060 | MOVW $0x0019, (AX) | ||
3061 | MOVW BX, 2(AX) | ||
3062 | ADDQ $0x04, AX | ||
3063 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3064 | |||
3065 | repeat_three_match_repeat_encodeBlockAsm12B: | ||
3066 | LEAL -4(BX), BX | ||
3067 | MOVW $0x0015, (AX) | ||
3068 | MOVB BL, 2(AX) | ||
3069 | ADDQ $0x03, AX | ||
3070 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3071 | |||
3072 | repeat_two_match_repeat_encodeBlockAsm12B: | ||
3073 | SHLL $0x02, BX | ||
3074 | ORL $0x01, BX | ||
3075 | MOVW BX, (AX) | ||
3076 | ADDQ $0x02, AX | ||
3077 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3078 | |||
3079 | repeat_two_offset_match_repeat_encodeBlockAsm12B: | ||
3080 | XORQ DI, DI | ||
3081 | LEAL 1(DI)(BX*4), BX | ||
3082 | MOVB SI, 1(AX) | ||
3083 | SARL $0x08, SI | ||
3084 | SHLL $0x05, SI | ||
3085 | ORL SI, BX | ||
3086 | MOVB BL, (AX) | ||
3087 | ADDQ $0x02, AX | ||
3088 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3089 | |||
3090 | repeat_as_copy_encodeBlockAsm12B: | ||
3091 | // emitCopy | ||
3092 | CMPL BX, $0x40 | ||
3093 | JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B | ||
3094 | CMPL SI, $0x00000800 | ||
3095 | JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B | ||
3096 | MOVL $0x00000001, DI | ||
3097 | LEAL 16(DI), DI | ||
3098 | MOVB SI, 1(AX) | ||
3099 | SHRL $0x08, SI | ||
3100 | SHLL $0x05, SI | ||
3101 | ORL SI, DI | ||
3102 | MOVB DI, (AX) | ||
3103 | ADDQ $0x02, AX | ||
3104 | SUBL $0x08, BX | ||
3105 | |||
3106 | // emitRepeat | ||
3107 | LEAL -4(BX), BX | ||
3108 | JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b | ||
3109 | MOVL BX, DI | ||
3110 | LEAL -4(BX), BX | ||
3111 | CMPL DI, $0x08 | ||
3112 | JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b | ||
3113 | CMPL DI, $0x0c | ||
3114 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b | ||
3115 | CMPL SI, $0x00000800 | ||
3116 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b | ||
3117 | |||
3118 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: | ||
3119 | CMPL BX, $0x00000104 | ||
3120 | JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b | ||
3121 | LEAL -256(BX), BX | ||
3122 | MOVW $0x0019, (AX) | ||
3123 | MOVW BX, 2(AX) | ||
3124 | ADDQ $0x04, AX | ||
3125 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3126 | |||
3127 | repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: | ||
3128 | LEAL -4(BX), BX | ||
3129 | MOVW $0x0015, (AX) | ||
3130 | MOVB BL, 2(AX) | ||
3131 | ADDQ $0x03, AX | ||
3132 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3133 | |||
3134 | repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: | ||
3135 | SHLL $0x02, BX | ||
3136 | ORL $0x01, BX | ||
3137 | MOVW BX, (AX) | ||
3138 | ADDQ $0x02, AX | ||
3139 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3140 | |||
3141 | repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: | ||
3142 | XORQ DI, DI | ||
3143 | LEAL 1(DI)(BX*4), BX | ||
3144 | MOVB SI, 1(AX) | ||
3145 | SARL $0x08, SI | ||
3146 | SHLL $0x05, SI | ||
3147 | ORL SI, BX | ||
3148 | MOVB BL, (AX) | ||
3149 | ADDQ $0x02, AX | ||
3150 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3151 | |||
3152 | long_offset_short_repeat_as_copy_encodeBlockAsm12B: | ||
3153 | MOVB $0xee, (AX) | ||
3154 | MOVW SI, 1(AX) | ||
3155 | LEAL -60(BX), BX | ||
3156 | ADDQ $0x03, AX | ||
3157 | |||
3158 | // emitRepeat | ||
3159 | MOVL BX, DI | ||
3160 | LEAL -4(BX), BX | ||
3161 | CMPL DI, $0x08 | ||
3162 | JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short | ||
3163 | CMPL DI, $0x0c | ||
3164 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short | ||
3165 | CMPL SI, $0x00000800 | ||
3166 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short | ||
3167 | |||
3168 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: | ||
3169 | CMPL BX, $0x00000104 | ||
3170 | JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short | ||
3171 | LEAL -256(BX), BX | ||
3172 | MOVW $0x0019, (AX) | ||
3173 | MOVW BX, 2(AX) | ||
3174 | ADDQ $0x04, AX | ||
3175 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3176 | |||
3177 | repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: | ||
3178 | LEAL -4(BX), BX | ||
3179 | MOVW $0x0015, (AX) | ||
3180 | MOVB BL, 2(AX) | ||
3181 | ADDQ $0x03, AX | ||
3182 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3183 | |||
3184 | repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: | ||
3185 | SHLL $0x02, BX | ||
3186 | ORL $0x01, BX | ||
3187 | MOVW BX, (AX) | ||
3188 | ADDQ $0x02, AX | ||
3189 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3190 | |||
3191 | repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: | ||
3192 | XORQ DI, DI | ||
3193 | LEAL 1(DI)(BX*4), BX | ||
3194 | MOVB SI, 1(AX) | ||
3195 | SARL $0x08, SI | ||
3196 | SHLL $0x05, SI | ||
3197 | ORL SI, BX | ||
3198 | MOVB BL, (AX) | ||
3199 | ADDQ $0x02, AX | ||
3200 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3201 | |||
3202 | two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: | ||
3203 | MOVL BX, DI | ||
3204 | SHLL $0x02, DI | ||
3205 | CMPL BX, $0x0c | ||
3206 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B | ||
3207 | CMPL SI, $0x00000800 | ||
3208 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B | ||
3209 | LEAL -15(DI), DI | ||
3210 | MOVB SI, 1(AX) | ||
3211 | SHRL $0x08, SI | ||
3212 | SHLL $0x05, SI | ||
3213 | ORL SI, DI | ||
3214 | MOVB DI, (AX) | ||
3215 | ADDQ $0x02, AX | ||
3216 | JMP repeat_end_emit_encodeBlockAsm12B | ||
3217 | |||
3218 | emit_copy_three_repeat_as_copy_encodeBlockAsm12B: | ||
3219 | LEAL -2(DI), DI | ||
3220 | MOVB DI, (AX) | ||
3221 | MOVW SI, 1(AX) | ||
3222 | ADDQ $0x03, AX | ||
3223 | |||
3224 | repeat_end_emit_encodeBlockAsm12B: | ||
3225 | MOVL CX, 12(SP) | ||
3226 | JMP search_loop_encodeBlockAsm12B | ||
3227 | |||
3228 | no_repeat_found_encodeBlockAsm12B: | ||
3229 | CMPL (DX)(BX*1), SI | ||
3230 | JEQ candidate_match_encodeBlockAsm12B | ||
3231 | SHRQ $0x08, SI | ||
3232 | MOVL 24(SP)(R9*4), BX | ||
3233 | LEAL 2(CX), R8 | ||
3234 | CMPL (DX)(DI*1), SI | ||
3235 | JEQ candidate2_match_encodeBlockAsm12B | ||
3236 | MOVL R8, 24(SP)(R9*4) | ||
3237 | SHRQ $0x08, SI | ||
3238 | CMPL (DX)(BX*1), SI | ||
3239 | JEQ candidate3_match_encodeBlockAsm12B | ||
3240 | MOVL 20(SP), CX | ||
3241 | JMP search_loop_encodeBlockAsm12B | ||
3242 | |||
3243 | candidate3_match_encodeBlockAsm12B: | ||
3244 | ADDL $0x02, CX | ||
3245 | JMP candidate_match_encodeBlockAsm12B | ||
3246 | |||
3247 | candidate2_match_encodeBlockAsm12B: | ||
3248 | MOVL R8, 24(SP)(R9*4) | ||
3249 | INCL CX | ||
3250 | MOVL DI, BX | ||
3251 | |||
3252 | candidate_match_encodeBlockAsm12B: | ||
3253 | MOVL 12(SP), SI | ||
3254 | TESTL BX, BX | ||
3255 | JZ match_extend_back_end_encodeBlockAsm12B | ||
3256 | |||
3257 | match_extend_back_loop_encodeBlockAsm12B: | ||
3258 | CMPL CX, SI | ||
3259 | JBE match_extend_back_end_encodeBlockAsm12B | ||
3260 | MOVB -1(DX)(BX*1), DI | ||
3261 | MOVB -1(DX)(CX*1), R8 | ||
3262 | CMPB DI, R8 | ||
3263 | JNE match_extend_back_end_encodeBlockAsm12B | ||
3264 | LEAL -1(CX), CX | ||
3265 | DECL BX | ||
3266 | JZ match_extend_back_end_encodeBlockAsm12B | ||
3267 | JMP match_extend_back_loop_encodeBlockAsm12B | ||
3268 | |||
3269 | match_extend_back_end_encodeBlockAsm12B: | ||
3270 | MOVL CX, SI | ||
3271 | SUBL 12(SP), SI | ||
3272 | LEAQ 3(AX)(SI*1), SI | ||
3273 | CMPQ SI, (SP) | ||
3274 | JB match_dst_size_check_encodeBlockAsm12B | ||
3275 | MOVQ $0x00000000, ret+48(FP) | ||
3276 | RET | ||
3277 | |||
3278 | match_dst_size_check_encodeBlockAsm12B: | ||
3279 | MOVL CX, SI | ||
3280 | MOVL 12(SP), DI | ||
3281 | CMPL DI, SI | ||
3282 | JEQ emit_literal_done_match_emit_encodeBlockAsm12B | ||
3283 | MOVL SI, R8 | ||
3284 | MOVL SI, 12(SP) | ||
3285 | LEAQ (DX)(DI*1), SI | ||
3286 | SUBL DI, R8 | ||
3287 | LEAL -1(R8), DI | ||
3288 | CMPL DI, $0x3c | ||
3289 | JB one_byte_match_emit_encodeBlockAsm12B | ||
3290 | CMPL DI, $0x00000100 | ||
3291 | JB two_bytes_match_emit_encodeBlockAsm12B | ||
3292 | JB three_bytes_match_emit_encodeBlockAsm12B | ||
3293 | |||
3294 | three_bytes_match_emit_encodeBlockAsm12B: | ||
3295 | MOVB $0xf4, (AX) | ||
3296 | MOVW DI, 1(AX) | ||
3297 | ADDQ $0x03, AX | ||
3298 | JMP memmove_long_match_emit_encodeBlockAsm12B | ||
3299 | |||
3300 | two_bytes_match_emit_encodeBlockAsm12B: | ||
3301 | MOVB $0xf0, (AX) | ||
3302 | MOVB DI, 1(AX) | ||
3303 | ADDQ $0x02, AX | ||
3304 | CMPL DI, $0x40 | ||
3305 | JB memmove_match_emit_encodeBlockAsm12B | ||
3306 | JMP memmove_long_match_emit_encodeBlockAsm12B | ||
3307 | |||
3308 | one_byte_match_emit_encodeBlockAsm12B: | ||
3309 | SHLB $0x02, DI | ||
3310 | MOVB DI, (AX) | ||
3311 | ADDQ $0x01, AX | ||
3312 | |||
3313 | memmove_match_emit_encodeBlockAsm12B: | ||
3314 | LEAQ (AX)(R8*1), DI | ||
3315 | |||
3316 | // genMemMoveShort | ||
3317 | CMPQ R8, $0x08 | ||
3318 | JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 | ||
3319 | CMPQ R8, $0x10 | ||
3320 | JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 | ||
3321 | CMPQ R8, $0x20 | ||
3322 | JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 | ||
3323 | JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 | ||
3324 | |||
3325 | emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: | ||
3326 | MOVQ (SI), R9 | ||
3327 | MOVQ R9, (AX) | ||
3328 | JMP memmove_end_copy_match_emit_encodeBlockAsm12B | ||
3329 | |||
3330 | emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: | ||
3331 | MOVQ (SI), R9 | ||
3332 | MOVQ -8(SI)(R8*1), SI | ||
3333 | MOVQ R9, (AX) | ||
3334 | MOVQ SI, -8(AX)(R8*1) | ||
3335 | JMP memmove_end_copy_match_emit_encodeBlockAsm12B | ||
3336 | |||
3337 | emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: | ||
3338 | MOVOU (SI), X0 | ||
3339 | MOVOU -16(SI)(R8*1), X1 | ||
3340 | MOVOU X0, (AX) | ||
3341 | MOVOU X1, -16(AX)(R8*1) | ||
3342 | JMP memmove_end_copy_match_emit_encodeBlockAsm12B | ||
3343 | |||
3344 | emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: | ||
3345 | MOVOU (SI), X0 | ||
3346 | MOVOU 16(SI), X1 | ||
3347 | MOVOU -32(SI)(R8*1), X2 | ||
3348 | MOVOU -16(SI)(R8*1), X3 | ||
3349 | MOVOU X0, (AX) | ||
3350 | MOVOU X1, 16(AX) | ||
3351 | MOVOU X2, -32(AX)(R8*1) | ||
3352 | MOVOU X3, -16(AX)(R8*1) | ||
3353 | |||
3354 | memmove_end_copy_match_emit_encodeBlockAsm12B: | ||
3355 | MOVQ DI, AX | ||
3356 | JMP emit_literal_done_match_emit_encodeBlockAsm12B | ||
3357 | |||
3358 | memmove_long_match_emit_encodeBlockAsm12B: | ||
3359 | LEAQ (AX)(R8*1), DI | ||
3360 | |||
3361 | // genMemMoveLong | ||
3362 | MOVOU (SI), X0 | ||
3363 | MOVOU 16(SI), X1 | ||
3364 | MOVOU -32(SI)(R8*1), X2 | ||
3365 | MOVOU -16(SI)(R8*1), X3 | ||
3366 | MOVQ R8, R10 | ||
3367 | SHRQ $0x05, R10 | ||
3368 | MOVQ AX, R9 | ||
3369 | ANDL $0x0000001f, R9 | ||
3370 | MOVQ $0x00000040, R11 | ||
3371 | SUBQ R9, R11 | ||
3372 | DECQ R10 | ||
3373 | JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 | ||
3374 | LEAQ -32(SI)(R11*1), R9 | ||
3375 | LEAQ -32(AX)(R11*1), R12 | ||
3376 | |||
3377 | emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: | ||
3378 | MOVOU (R9), X4 | ||
3379 | MOVOU 16(R9), X5 | ||
3380 | MOVOA X4, (R12) | ||
3381 | MOVOA X5, 16(R12) | ||
3382 | ADDQ $0x20, R12 | ||
3383 | ADDQ $0x20, R9 | ||
3384 | ADDQ $0x20, R11 | ||
3385 | DECQ R10 | ||
3386 | JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back | ||
3387 | |||
3388 | emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: | ||
3389 | MOVOU -32(SI)(R11*1), X4 | ||
3390 | MOVOU -16(SI)(R11*1), X5 | ||
3391 | MOVOA X4, -32(AX)(R11*1) | ||
3392 | MOVOA X5, -16(AX)(R11*1) | ||
3393 | ADDQ $0x20, R11 | ||
3394 | CMPQ R8, R11 | ||
3395 | JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 | ||
3396 | MOVOU X0, (AX) | ||
3397 | MOVOU X1, 16(AX) | ||
3398 | MOVOU X2, -32(AX)(R8*1) | ||
3399 | MOVOU X3, -16(AX)(R8*1) | ||
3400 | MOVQ DI, AX | ||
3401 | |||
3402 | emit_literal_done_match_emit_encodeBlockAsm12B: | ||
3403 | match_nolit_loop_encodeBlockAsm12B: | ||
3404 | MOVL CX, SI | ||
3405 | SUBL BX, SI | ||
3406 | MOVL SI, 16(SP) | ||
3407 | ADDL $0x04, CX | ||
3408 | ADDL $0x04, BX | ||
3409 | MOVQ src_len+32(FP), SI | ||
3410 | SUBL CX, SI | ||
3411 | LEAQ (DX)(CX*1), DI | ||
3412 | LEAQ (DX)(BX*1), BX | ||
3413 | |||
3414 | // matchLen | ||
3415 | XORL R9, R9 | ||
3416 | |||
3417 | matchlen_loopback_16_match_nolit_encodeBlockAsm12B: | ||
3418 | CMPL SI, $0x10 | ||
3419 | JB matchlen_match8_match_nolit_encodeBlockAsm12B | ||
3420 | MOVQ (DI)(R9*1), R8 | ||
3421 | MOVQ 8(DI)(R9*1), R10 | ||
3422 | XORQ (BX)(R9*1), R8 | ||
3423 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B | ||
3424 | XORQ 8(BX)(R9*1), R10 | ||
3425 | JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B | ||
3426 | LEAL -16(SI), SI | ||
3427 | LEAL 16(R9), R9 | ||
3428 | JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B | ||
3429 | |||
3430 | matchlen_bsf_16match_nolit_encodeBlockAsm12B: | ||
3431 | #ifdef GOAMD64_v3 | ||
3432 | TZCNTQ R10, R10 | ||
3433 | |||
3434 | #else | ||
3435 | BSFQ R10, R10 | ||
3436 | |||
3437 | #endif | ||
3438 | SARQ $0x03, R10 | ||
3439 | LEAL 8(R9)(R10*1), R9 | ||
3440 | JMP match_nolit_end_encodeBlockAsm12B | ||
3441 | |||
3442 | matchlen_match8_match_nolit_encodeBlockAsm12B: | ||
3443 | CMPL SI, $0x08 | ||
3444 | JB matchlen_match4_match_nolit_encodeBlockAsm12B | ||
3445 | MOVQ (DI)(R9*1), R8 | ||
3446 | XORQ (BX)(R9*1), R8 | ||
3447 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B | ||
3448 | LEAL -8(SI), SI | ||
3449 | LEAL 8(R9), R9 | ||
3450 | JMP matchlen_match4_match_nolit_encodeBlockAsm12B | ||
3451 | |||
3452 | matchlen_bsf_8_match_nolit_encodeBlockAsm12B: | ||
3453 | #ifdef GOAMD64_v3 | ||
3454 | TZCNTQ R8, R8 | ||
3455 | |||
3456 | #else | ||
3457 | BSFQ R8, R8 | ||
3458 | |||
3459 | #endif | ||
3460 | SARQ $0x03, R8 | ||
3461 | LEAL (R9)(R8*1), R9 | ||
3462 | JMP match_nolit_end_encodeBlockAsm12B | ||
3463 | |||
3464 | matchlen_match4_match_nolit_encodeBlockAsm12B: | ||
3465 | CMPL SI, $0x04 | ||
3466 | JB matchlen_match2_match_nolit_encodeBlockAsm12B | ||
3467 | MOVL (DI)(R9*1), R8 | ||
3468 | CMPL (BX)(R9*1), R8 | ||
3469 | JNE matchlen_match2_match_nolit_encodeBlockAsm12B | ||
3470 | LEAL -4(SI), SI | ||
3471 | LEAL 4(R9), R9 | ||
3472 | |||
3473 | matchlen_match2_match_nolit_encodeBlockAsm12B: | ||
3474 | CMPL SI, $0x01 | ||
3475 | JE matchlen_match1_match_nolit_encodeBlockAsm12B | ||
3476 | JB match_nolit_end_encodeBlockAsm12B | ||
3477 | MOVW (DI)(R9*1), R8 | ||
3478 | CMPW (BX)(R9*1), R8 | ||
3479 | JNE matchlen_match1_match_nolit_encodeBlockAsm12B | ||
3480 | LEAL 2(R9), R9 | ||
3481 | SUBL $0x02, SI | ||
3482 | JZ match_nolit_end_encodeBlockAsm12B | ||
3483 | |||
3484 | matchlen_match1_match_nolit_encodeBlockAsm12B: | ||
3485 | MOVB (DI)(R9*1), R8 | ||
3486 | CMPB (BX)(R9*1), R8 | ||
3487 | JNE match_nolit_end_encodeBlockAsm12B | ||
3488 | LEAL 1(R9), R9 | ||
3489 | |||
3490 | match_nolit_end_encodeBlockAsm12B: | ||
3491 | ADDL R9, CX | ||
3492 | MOVL 16(SP), BX | ||
3493 | ADDL $0x04, R9 | ||
3494 | MOVL CX, 12(SP) | ||
3495 | |||
3496 | // emitCopy | ||
3497 | CMPL R9, $0x40 | ||
3498 | JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B | ||
3499 | CMPL BX, $0x00000800 | ||
3500 | JAE long_offset_short_match_nolit_encodeBlockAsm12B | ||
3501 | MOVL $0x00000001, SI | ||
3502 | LEAL 16(SI), SI | ||
3503 | MOVB BL, 1(AX) | ||
3504 | SHRL $0x08, BX | ||
3505 | SHLL $0x05, BX | ||
3506 | ORL BX, SI | ||
3507 | MOVB SI, (AX) | ||
3508 | ADDQ $0x02, AX | ||
3509 | SUBL $0x08, R9 | ||
3510 | |||
3511 | // emitRepeat | ||
3512 | LEAL -4(R9), R9 | ||
3513 | JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b | ||
3514 | MOVL R9, SI | ||
3515 | LEAL -4(R9), R9 | ||
3516 | CMPL SI, $0x08 | ||
3517 | JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b | ||
3518 | CMPL SI, $0x0c | ||
3519 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b | ||
3520 | CMPL BX, $0x00000800 | ||
3521 | JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b | ||
3522 | |||
3523 | cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: | ||
3524 | CMPL R9, $0x00000104 | ||
3525 | JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b | ||
3526 | LEAL -256(R9), R9 | ||
3527 | MOVW $0x0019, (AX) | ||
3528 | MOVW R9, 2(AX) | ||
3529 | ADDQ $0x04, AX | ||
3530 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3531 | |||
3532 | repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: | ||
3533 | LEAL -4(R9), R9 | ||
3534 | MOVW $0x0015, (AX) | ||
3535 | MOVB R9, 2(AX) | ||
3536 | ADDQ $0x03, AX | ||
3537 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3538 | |||
3539 | repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: | ||
3540 | SHLL $0x02, R9 | ||
3541 | ORL $0x01, R9 | ||
3542 | MOVW R9, (AX) | ||
3543 | ADDQ $0x02, AX | ||
3544 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3545 | |||
3546 | repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: | ||
3547 | XORQ SI, SI | ||
3548 | LEAL 1(SI)(R9*4), R9 | ||
3549 | MOVB BL, 1(AX) | ||
3550 | SARL $0x08, BX | ||
3551 | SHLL $0x05, BX | ||
3552 | ORL BX, R9 | ||
3553 | MOVB R9, (AX) | ||
3554 | ADDQ $0x02, AX | ||
3555 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3556 | |||
3557 | long_offset_short_match_nolit_encodeBlockAsm12B: | ||
3558 | MOVB $0xee, (AX) | ||
3559 | MOVW BX, 1(AX) | ||
3560 | LEAL -60(R9), R9 | ||
3561 | ADDQ $0x03, AX | ||
3562 | |||
3563 | // emitRepeat | ||
3564 | MOVL R9, SI | ||
3565 | LEAL -4(R9), R9 | ||
3566 | CMPL SI, $0x08 | ||
3567 | JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short | ||
3568 | CMPL SI, $0x0c | ||
3569 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short | ||
3570 | CMPL BX, $0x00000800 | ||
3571 | JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short | ||
3572 | |||
3573 | cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: | ||
3574 | CMPL R9, $0x00000104 | ||
3575 | JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short | ||
3576 | LEAL -256(R9), R9 | ||
3577 | MOVW $0x0019, (AX) | ||
3578 | MOVW R9, 2(AX) | ||
3579 | ADDQ $0x04, AX | ||
3580 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3581 | |||
3582 | repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: | ||
3583 | LEAL -4(R9), R9 | ||
3584 | MOVW $0x0015, (AX) | ||
3585 | MOVB R9, 2(AX) | ||
3586 | ADDQ $0x03, AX | ||
3587 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3588 | |||
3589 | repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: | ||
3590 | SHLL $0x02, R9 | ||
3591 | ORL $0x01, R9 | ||
3592 | MOVW R9, (AX) | ||
3593 | ADDQ $0x02, AX | ||
3594 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3595 | |||
3596 | repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: | ||
3597 | XORQ SI, SI | ||
3598 | LEAL 1(SI)(R9*4), R9 | ||
3599 | MOVB BL, 1(AX) | ||
3600 | SARL $0x08, BX | ||
3601 | SHLL $0x05, BX | ||
3602 | ORL BX, R9 | ||
3603 | MOVB R9, (AX) | ||
3604 | ADDQ $0x02, AX | ||
3605 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3606 | |||
3607 | two_byte_offset_short_match_nolit_encodeBlockAsm12B: | ||
3608 | MOVL R9, SI | ||
3609 | SHLL $0x02, SI | ||
3610 | CMPL R9, $0x0c | ||
3611 | JAE emit_copy_three_match_nolit_encodeBlockAsm12B | ||
3612 | CMPL BX, $0x00000800 | ||
3613 | JAE emit_copy_three_match_nolit_encodeBlockAsm12B | ||
3614 | LEAL -15(SI), SI | ||
3615 | MOVB BL, 1(AX) | ||
3616 | SHRL $0x08, BX | ||
3617 | SHLL $0x05, BX | ||
3618 | ORL BX, SI | ||
3619 | MOVB SI, (AX) | ||
3620 | ADDQ $0x02, AX | ||
3621 | JMP match_nolit_emitcopy_end_encodeBlockAsm12B | ||
3622 | |||
3623 | emit_copy_three_match_nolit_encodeBlockAsm12B: | ||
3624 | LEAL -2(SI), SI | ||
3625 | MOVB SI, (AX) | ||
3626 | MOVW BX, 1(AX) | ||
3627 | ADDQ $0x03, AX | ||
3628 | |||
3629 | match_nolit_emitcopy_end_encodeBlockAsm12B: | ||
3630 | CMPL CX, 8(SP) | ||
3631 | JAE emit_remainder_encodeBlockAsm12B | ||
3632 | MOVQ -2(DX)(CX*1), SI | ||
3633 | CMPQ AX, (SP) | ||
3634 | JB match_nolit_dst_ok_encodeBlockAsm12B | ||
3635 | MOVQ $0x00000000, ret+48(FP) | ||
3636 | RET | ||
3637 | |||
3638 | match_nolit_dst_ok_encodeBlockAsm12B: | ||
3639 | MOVQ $0x000000cf1bbcdcbb, R8 | ||
3640 | MOVQ SI, DI | ||
3641 | SHRQ $0x10, SI | ||
3642 | MOVQ SI, BX | ||
3643 | SHLQ $0x18, DI | ||
3644 | IMULQ R8, DI | ||
3645 | SHRQ $0x34, DI | ||
3646 | SHLQ $0x18, BX | ||
3647 | IMULQ R8, BX | ||
3648 | SHRQ $0x34, BX | ||
3649 | LEAL -2(CX), R8 | ||
3650 | LEAQ 24(SP)(BX*4), R9 | ||
3651 | MOVL (R9), BX | ||
3652 | MOVL R8, 24(SP)(DI*4) | ||
3653 | MOVL CX, (R9) | ||
3654 | CMPL (DX)(BX*1), SI | ||
3655 | JEQ match_nolit_loop_encodeBlockAsm12B | ||
3656 | INCL CX | ||
3657 | JMP search_loop_encodeBlockAsm12B | ||
3658 | |||
3659 | emit_remainder_encodeBlockAsm12B: | ||
3660 | MOVQ src_len+32(FP), CX | ||
3661 | SUBL 12(SP), CX | ||
3662 | LEAQ 3(AX)(CX*1), CX | ||
3663 | CMPQ CX, (SP) | ||
3664 | JB emit_remainder_ok_encodeBlockAsm12B | ||
3665 | MOVQ $0x00000000, ret+48(FP) | ||
3666 | RET | ||
3667 | |||
3668 | emit_remainder_ok_encodeBlockAsm12B: | ||
3669 | MOVQ src_len+32(FP), CX | ||
3670 | MOVL 12(SP), BX | ||
3671 | CMPL BX, CX | ||
3672 | JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B | ||
3673 | MOVL CX, SI | ||
3674 | MOVL CX, 12(SP) | ||
3675 | LEAQ (DX)(BX*1), CX | ||
3676 | SUBL BX, SI | ||
3677 | LEAL -1(SI), DX | ||
3678 | CMPL DX, $0x3c | ||
3679 | JB one_byte_emit_remainder_encodeBlockAsm12B | ||
3680 | CMPL DX, $0x00000100 | ||
3681 | JB two_bytes_emit_remainder_encodeBlockAsm12B | ||
3682 | JB three_bytes_emit_remainder_encodeBlockAsm12B | ||
3683 | |||
3684 | three_bytes_emit_remainder_encodeBlockAsm12B: | ||
3685 | MOVB $0xf4, (AX) | ||
3686 | MOVW DX, 1(AX) | ||
3687 | ADDQ $0x03, AX | ||
3688 | JMP memmove_long_emit_remainder_encodeBlockAsm12B | ||
3689 | |||
3690 | two_bytes_emit_remainder_encodeBlockAsm12B: | ||
3691 | MOVB $0xf0, (AX) | ||
3692 | MOVB DL, 1(AX) | ||
3693 | ADDQ $0x02, AX | ||
3694 | CMPL DX, $0x40 | ||
3695 | JB memmove_emit_remainder_encodeBlockAsm12B | ||
3696 | JMP memmove_long_emit_remainder_encodeBlockAsm12B | ||
3697 | |||
3698 | one_byte_emit_remainder_encodeBlockAsm12B: | ||
3699 | SHLB $0x02, DL | ||
3700 | MOVB DL, (AX) | ||
3701 | ADDQ $0x01, AX | ||
3702 | |||
3703 | memmove_emit_remainder_encodeBlockAsm12B: | ||
3704 | LEAQ (AX)(SI*1), DX | ||
3705 | MOVL SI, BX | ||
3706 | |||
3707 | // genMemMoveShort | ||
3708 | CMPQ BX, $0x03 | ||
3709 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 | ||
3710 | JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 | ||
3711 | CMPQ BX, $0x08 | ||
3712 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 | ||
3713 | CMPQ BX, $0x10 | ||
3714 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 | ||
3715 | CMPQ BX, $0x20 | ||
3716 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 | ||
3717 | JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 | ||
3718 | |||
3719 | emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: | ||
3720 | MOVB (CX), SI | ||
3721 | MOVB -1(CX)(BX*1), CL | ||
3722 | MOVB SI, (AX) | ||
3723 | MOVB CL, -1(AX)(BX*1) | ||
3724 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B | ||
3725 | |||
3726 | emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: | ||
3727 | MOVW (CX), SI | ||
3728 | MOVB 2(CX), CL | ||
3729 | MOVW SI, (AX) | ||
3730 | MOVB CL, 2(AX) | ||
3731 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B | ||
3732 | |||
3733 | emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: | ||
3734 | MOVL (CX), SI | ||
3735 | MOVL -4(CX)(BX*1), CX | ||
3736 | MOVL SI, (AX) | ||
3737 | MOVL CX, -4(AX)(BX*1) | ||
3738 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B | ||
3739 | |||
3740 | emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: | ||
3741 | MOVQ (CX), SI | ||
3742 | MOVQ -8(CX)(BX*1), CX | ||
3743 | MOVQ SI, (AX) | ||
3744 | MOVQ CX, -8(AX)(BX*1) | ||
3745 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B | ||
3746 | |||
3747 | emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: | ||
3748 | MOVOU (CX), X0 | ||
3749 | MOVOU -16(CX)(BX*1), X1 | ||
3750 | MOVOU X0, (AX) | ||
3751 | MOVOU X1, -16(AX)(BX*1) | ||
3752 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B | ||
3753 | |||
3754 | emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: | ||
3755 | MOVOU (CX), X0 | ||
3756 | MOVOU 16(CX), X1 | ||
3757 | MOVOU -32(CX)(BX*1), X2 | ||
3758 | MOVOU -16(CX)(BX*1), X3 | ||
3759 | MOVOU X0, (AX) | ||
3760 | MOVOU X1, 16(AX) | ||
3761 | MOVOU X2, -32(AX)(BX*1) | ||
3762 | MOVOU X3, -16(AX)(BX*1) | ||
3763 | |||
3764 | memmove_end_copy_emit_remainder_encodeBlockAsm12B: | ||
3765 | MOVQ DX, AX | ||
3766 | JMP emit_literal_done_emit_remainder_encodeBlockAsm12B | ||
3767 | |||
3768 | memmove_long_emit_remainder_encodeBlockAsm12B: | ||
3769 | LEAQ (AX)(SI*1), DX | ||
3770 | MOVL SI, BX | ||
3771 | |||
3772 | // genMemMoveLong | ||
3773 | MOVOU (CX), X0 | ||
3774 | MOVOU 16(CX), X1 | ||
3775 | MOVOU -32(CX)(BX*1), X2 | ||
3776 | MOVOU -16(CX)(BX*1), X3 | ||
3777 | MOVQ BX, DI | ||
3778 | SHRQ $0x05, DI | ||
3779 | MOVQ AX, SI | ||
3780 | ANDL $0x0000001f, SI | ||
3781 | MOVQ $0x00000040, R8 | ||
3782 | SUBQ SI, R8 | ||
3783 | DECQ DI | ||
3784 | JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 | ||
3785 | LEAQ -32(CX)(R8*1), SI | ||
3786 | LEAQ -32(AX)(R8*1), R9 | ||
3787 | |||
3788 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: | ||
3789 | MOVOU (SI), X4 | ||
3790 | MOVOU 16(SI), X5 | ||
3791 | MOVOA X4, (R9) | ||
3792 | MOVOA X5, 16(R9) | ||
3793 | ADDQ $0x20, R9 | ||
3794 | ADDQ $0x20, SI | ||
3795 | ADDQ $0x20, R8 | ||
3796 | DECQ DI | ||
3797 | JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back | ||
3798 | |||
3799 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: | ||
3800 | MOVOU -32(CX)(R8*1), X4 | ||
3801 | MOVOU -16(CX)(R8*1), X5 | ||
3802 | MOVOA X4, -32(AX)(R8*1) | ||
3803 | MOVOA X5, -16(AX)(R8*1) | ||
3804 | ADDQ $0x20, R8 | ||
3805 | CMPQ BX, R8 | ||
3806 | JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 | ||
3807 | MOVOU X0, (AX) | ||
3808 | MOVOU X1, 16(AX) | ||
3809 | MOVOU X2, -32(AX)(BX*1) | ||
3810 | MOVOU X3, -16(AX)(BX*1) | ||
3811 | MOVQ DX, AX | ||
3812 | |||
3813 | emit_literal_done_emit_remainder_encodeBlockAsm12B: | ||
3814 | MOVQ dst_base+0(FP), CX | ||
3815 | SUBQ CX, AX | ||
3816 | MOVQ AX, ret+48(FP) | ||
3817 | RET | ||
3818 | |||
3819 | // func encodeBlockAsm10B(dst []byte, src []byte) int | ||
3820 | // Requires: BMI, SSE2 | ||
3821 | TEXT ·encodeBlockAsm10B(SB), $4120-56 | ||
3822 | MOVQ dst_base+0(FP), AX | ||
3823 | MOVQ $0x00000020, CX | ||
3824 | LEAQ 24(SP), DX | ||
3825 | PXOR X0, X0 | ||
3826 | |||
3827 | zero_loop_encodeBlockAsm10B: | ||
3828 | MOVOU X0, (DX) | ||
3829 | MOVOU X0, 16(DX) | ||
3830 | MOVOU X0, 32(DX) | ||
3831 | MOVOU X0, 48(DX) | ||
3832 | MOVOU X0, 64(DX) | ||
3833 | MOVOU X0, 80(DX) | ||
3834 | MOVOU X0, 96(DX) | ||
3835 | MOVOU X0, 112(DX) | ||
3836 | ADDQ $0x80, DX | ||
3837 | DECQ CX | ||
3838 | JNZ zero_loop_encodeBlockAsm10B | ||
3839 | MOVL $0x00000000, 12(SP) | ||
3840 | MOVQ src_len+32(FP), CX | ||
3841 | LEAQ -9(CX), DX | ||
3842 | LEAQ -8(CX), BX | ||
3843 | MOVL BX, 8(SP) | ||
3844 | SHRQ $0x05, CX | ||
3845 | SUBL CX, DX | ||
3846 | LEAQ (AX)(DX*1), DX | ||
3847 | MOVQ DX, (SP) | ||
3848 | MOVL $0x00000001, CX | ||
3849 | MOVL CX, 16(SP) | ||
3850 | MOVQ src_base+24(FP), DX | ||
3851 | |||
3852 | search_loop_encodeBlockAsm10B: | ||
3853 | MOVL CX, BX | ||
3854 | SUBL 12(SP), BX | ||
3855 | SHRL $0x05, BX | ||
3856 | LEAL 4(CX)(BX*1), BX | ||
3857 | CMPL BX, 8(SP) | ||
3858 | JAE emit_remainder_encodeBlockAsm10B | ||
3859 | MOVQ (DX)(CX*1), SI | ||
3860 | MOVL BX, 20(SP) | ||
3861 | MOVQ $0x9e3779b1, R8 | ||
3862 | MOVQ SI, R9 | ||
3863 | MOVQ SI, R10 | ||
3864 | SHRQ $0x08, R10 | ||
3865 | SHLQ $0x20, R9 | ||
3866 | IMULQ R8, R9 | ||
3867 | SHRQ $0x36, R9 | ||
3868 | SHLQ $0x20, R10 | ||
3869 | IMULQ R8, R10 | ||
3870 | SHRQ $0x36, R10 | ||
3871 | MOVL 24(SP)(R9*4), BX | ||
3872 | MOVL 24(SP)(R10*4), DI | ||
3873 | MOVL CX, 24(SP)(R9*4) | ||
3874 | LEAL 1(CX), R9 | ||
3875 | MOVL R9, 24(SP)(R10*4) | ||
3876 | MOVQ SI, R9 | ||
3877 | SHRQ $0x10, R9 | ||
3878 | SHLQ $0x20, R9 | ||
3879 | IMULQ R8, R9 | ||
3880 | SHRQ $0x36, R9 | ||
3881 | MOVL CX, R8 | ||
3882 | SUBL 16(SP), R8 | ||
3883 | MOVL 1(DX)(R8*1), R10 | ||
3884 | MOVQ SI, R8 | ||
3885 | SHRQ $0x08, R8 | ||
3886 | CMPL R8, R10 | ||
3887 | JNE no_repeat_found_encodeBlockAsm10B | ||
3888 | LEAL 1(CX), SI | ||
3889 | MOVL 12(SP), DI | ||
3890 | MOVL SI, BX | ||
3891 | SUBL 16(SP), BX | ||
3892 | JZ repeat_extend_back_end_encodeBlockAsm10B | ||
3893 | |||
3894 | repeat_extend_back_loop_encodeBlockAsm10B: | ||
3895 | CMPL SI, DI | ||
3896 | JBE repeat_extend_back_end_encodeBlockAsm10B | ||
3897 | MOVB -1(DX)(BX*1), R8 | ||
3898 | MOVB -1(DX)(SI*1), R9 | ||
3899 | CMPB R8, R9 | ||
3900 | JNE repeat_extend_back_end_encodeBlockAsm10B | ||
3901 | LEAL -1(SI), SI | ||
3902 | DECL BX | ||
3903 | JNZ repeat_extend_back_loop_encodeBlockAsm10B | ||
3904 | |||
3905 | repeat_extend_back_end_encodeBlockAsm10B: | ||
3906 | MOVL 12(SP), BX | ||
3907 | CMPL BX, SI | ||
3908 | JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B | ||
3909 | MOVL SI, R8 | ||
3910 | MOVL SI, 12(SP) | ||
3911 | LEAQ (DX)(BX*1), R9 | ||
3912 | SUBL BX, R8 | ||
3913 | LEAL -1(R8), BX | ||
3914 | CMPL BX, $0x3c | ||
3915 | JB one_byte_repeat_emit_encodeBlockAsm10B | ||
3916 | CMPL BX, $0x00000100 | ||
3917 | JB two_bytes_repeat_emit_encodeBlockAsm10B | ||
3918 | JB three_bytes_repeat_emit_encodeBlockAsm10B | ||
3919 | |||
3920 | three_bytes_repeat_emit_encodeBlockAsm10B: | ||
3921 | MOVB $0xf4, (AX) | ||
3922 | MOVW BX, 1(AX) | ||
3923 | ADDQ $0x03, AX | ||
3924 | JMP memmove_long_repeat_emit_encodeBlockAsm10B | ||
3925 | |||
3926 | two_bytes_repeat_emit_encodeBlockAsm10B: | ||
3927 | MOVB $0xf0, (AX) | ||
3928 | MOVB BL, 1(AX) | ||
3929 | ADDQ $0x02, AX | ||
3930 | CMPL BX, $0x40 | ||
3931 | JB memmove_repeat_emit_encodeBlockAsm10B | ||
3932 | JMP memmove_long_repeat_emit_encodeBlockAsm10B | ||
3933 | |||
3934 | one_byte_repeat_emit_encodeBlockAsm10B: | ||
3935 | SHLB $0x02, BL | ||
3936 | MOVB BL, (AX) | ||
3937 | ADDQ $0x01, AX | ||
3938 | |||
3939 | memmove_repeat_emit_encodeBlockAsm10B: | ||
3940 | LEAQ (AX)(R8*1), BX | ||
3941 | |||
3942 | // genMemMoveShort | ||
3943 | CMPQ R8, $0x08 | ||
3944 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 | ||
3945 | CMPQ R8, $0x10 | ||
3946 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 | ||
3947 | CMPQ R8, $0x20 | ||
3948 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 | ||
3949 | JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 | ||
3950 | |||
3951 | emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: | ||
3952 | MOVQ (R9), R10 | ||
3953 | MOVQ R10, (AX) | ||
3954 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B | ||
3955 | |||
3956 | emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: | ||
3957 | MOVQ (R9), R10 | ||
3958 | MOVQ -8(R9)(R8*1), R9 | ||
3959 | MOVQ R10, (AX) | ||
3960 | MOVQ R9, -8(AX)(R8*1) | ||
3961 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B | ||
3962 | |||
3963 | emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: | ||
3964 | MOVOU (R9), X0 | ||
3965 | MOVOU -16(R9)(R8*1), X1 | ||
3966 | MOVOU X0, (AX) | ||
3967 | MOVOU X1, -16(AX)(R8*1) | ||
3968 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B | ||
3969 | |||
3970 | emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: | ||
3971 | MOVOU (R9), X0 | ||
3972 | MOVOU 16(R9), X1 | ||
3973 | MOVOU -32(R9)(R8*1), X2 | ||
3974 | MOVOU -16(R9)(R8*1), X3 | ||
3975 | MOVOU X0, (AX) | ||
3976 | MOVOU X1, 16(AX) | ||
3977 | MOVOU X2, -32(AX)(R8*1) | ||
3978 | MOVOU X3, -16(AX)(R8*1) | ||
3979 | |||
3980 | memmove_end_copy_repeat_emit_encodeBlockAsm10B: | ||
3981 | MOVQ BX, AX | ||
3982 | JMP emit_literal_done_repeat_emit_encodeBlockAsm10B | ||
3983 | |||
3984 | memmove_long_repeat_emit_encodeBlockAsm10B: | ||
3985 | LEAQ (AX)(R8*1), BX | ||
3986 | |||
3987 | // genMemMoveLong | ||
3988 | MOVOU (R9), X0 | ||
3989 | MOVOU 16(R9), X1 | ||
3990 | MOVOU -32(R9)(R8*1), X2 | ||
3991 | MOVOU -16(R9)(R8*1), X3 | ||
3992 | MOVQ R8, R11 | ||
3993 | SHRQ $0x05, R11 | ||
3994 | MOVQ AX, R10 | ||
3995 | ANDL $0x0000001f, R10 | ||
3996 | MOVQ $0x00000040, R12 | ||
3997 | SUBQ R10, R12 | ||
3998 | DECQ R11 | ||
3999 | JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 | ||
4000 | LEAQ -32(R9)(R12*1), R10 | ||
4001 | LEAQ -32(AX)(R12*1), R13 | ||
4002 | |||
4003 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: | ||
4004 | MOVOU (R10), X4 | ||
4005 | MOVOU 16(R10), X5 | ||
4006 | MOVOA X4, (R13) | ||
4007 | MOVOA X5, 16(R13) | ||
4008 | ADDQ $0x20, R13 | ||
4009 | ADDQ $0x20, R10 | ||
4010 | ADDQ $0x20, R12 | ||
4011 | DECQ R11 | ||
4012 | JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back | ||
4013 | |||
4014 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: | ||
4015 | MOVOU -32(R9)(R12*1), X4 | ||
4016 | MOVOU -16(R9)(R12*1), X5 | ||
4017 | MOVOA X4, -32(AX)(R12*1) | ||
4018 | MOVOA X5, -16(AX)(R12*1) | ||
4019 | ADDQ $0x20, R12 | ||
4020 | CMPQ R8, R12 | ||
4021 | JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 | ||
4022 | MOVOU X0, (AX) | ||
4023 | MOVOU X1, 16(AX) | ||
4024 | MOVOU X2, -32(AX)(R8*1) | ||
4025 | MOVOU X3, -16(AX)(R8*1) | ||
4026 | MOVQ BX, AX | ||
4027 | |||
4028 | emit_literal_done_repeat_emit_encodeBlockAsm10B: | ||
4029 | ADDL $0x05, CX | ||
4030 | MOVL CX, BX | ||
4031 | SUBL 16(SP), BX | ||
4032 | MOVQ src_len+32(FP), R8 | ||
4033 | SUBL CX, R8 | ||
4034 | LEAQ (DX)(CX*1), R9 | ||
4035 | LEAQ (DX)(BX*1), BX | ||
4036 | |||
4037 | // matchLen | ||
4038 | XORL R11, R11 | ||
4039 | |||
4040 | matchlen_loopback_16_repeat_extend_encodeBlockAsm10B: | ||
4041 | CMPL R8, $0x10 | ||
4042 | JB matchlen_match8_repeat_extend_encodeBlockAsm10B | ||
4043 | MOVQ (R9)(R11*1), R10 | ||
4044 | MOVQ 8(R9)(R11*1), R12 | ||
4045 | XORQ (BX)(R11*1), R10 | ||
4046 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B | ||
4047 | XORQ 8(BX)(R11*1), R12 | ||
4048 | JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B | ||
4049 | LEAL -16(R8), R8 | ||
4050 | LEAL 16(R11), R11 | ||
4051 | JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B | ||
4052 | |||
4053 | matchlen_bsf_16repeat_extend_encodeBlockAsm10B: | ||
4054 | #ifdef GOAMD64_v3 | ||
4055 | TZCNTQ R12, R12 | ||
4056 | |||
4057 | #else | ||
4058 | BSFQ R12, R12 | ||
4059 | |||
4060 | #endif | ||
4061 | SARQ $0x03, R12 | ||
4062 | LEAL 8(R11)(R12*1), R11 | ||
4063 | JMP repeat_extend_forward_end_encodeBlockAsm10B | ||
4064 | |||
4065 | matchlen_match8_repeat_extend_encodeBlockAsm10B: | ||
4066 | CMPL R8, $0x08 | ||
4067 | JB matchlen_match4_repeat_extend_encodeBlockAsm10B | ||
4068 | MOVQ (R9)(R11*1), R10 | ||
4069 | XORQ (BX)(R11*1), R10 | ||
4070 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B | ||
4071 | LEAL -8(R8), R8 | ||
4072 | LEAL 8(R11), R11 | ||
4073 | JMP matchlen_match4_repeat_extend_encodeBlockAsm10B | ||
4074 | |||
4075 | matchlen_bsf_8_repeat_extend_encodeBlockAsm10B: | ||
4076 | #ifdef GOAMD64_v3 | ||
4077 | TZCNTQ R10, R10 | ||
4078 | |||
4079 | #else | ||
4080 | BSFQ R10, R10 | ||
4081 | |||
4082 | #endif | ||
4083 | SARQ $0x03, R10 | ||
4084 | LEAL (R11)(R10*1), R11 | ||
4085 | JMP repeat_extend_forward_end_encodeBlockAsm10B | ||
4086 | |||
4087 | matchlen_match4_repeat_extend_encodeBlockAsm10B: | ||
4088 | CMPL R8, $0x04 | ||
4089 | JB matchlen_match2_repeat_extend_encodeBlockAsm10B | ||
4090 | MOVL (R9)(R11*1), R10 | ||
4091 | CMPL (BX)(R11*1), R10 | ||
4092 | JNE matchlen_match2_repeat_extend_encodeBlockAsm10B | ||
4093 | LEAL -4(R8), R8 | ||
4094 | LEAL 4(R11), R11 | ||
4095 | |||
4096 | matchlen_match2_repeat_extend_encodeBlockAsm10B: | ||
4097 | CMPL R8, $0x01 | ||
4098 | JE matchlen_match1_repeat_extend_encodeBlockAsm10B | ||
4099 | JB repeat_extend_forward_end_encodeBlockAsm10B | ||
4100 | MOVW (R9)(R11*1), R10 | ||
4101 | CMPW (BX)(R11*1), R10 | ||
4102 | JNE matchlen_match1_repeat_extend_encodeBlockAsm10B | ||
4103 | LEAL 2(R11), R11 | ||
4104 | SUBL $0x02, R8 | ||
4105 | JZ repeat_extend_forward_end_encodeBlockAsm10B | ||
4106 | |||
4107 | matchlen_match1_repeat_extend_encodeBlockAsm10B: | ||
4108 | MOVB (R9)(R11*1), R10 | ||
4109 | CMPB (BX)(R11*1), R10 | ||
4110 | JNE repeat_extend_forward_end_encodeBlockAsm10B | ||
4111 | LEAL 1(R11), R11 | ||
4112 | |||
4113 | repeat_extend_forward_end_encodeBlockAsm10B: | ||
4114 | ADDL R11, CX | ||
4115 | MOVL CX, BX | ||
4116 | SUBL SI, BX | ||
4117 | MOVL 16(SP), SI | ||
4118 | TESTL DI, DI | ||
4119 | JZ repeat_as_copy_encodeBlockAsm10B | ||
4120 | |||
4121 | // emitRepeat | ||
4122 | MOVL BX, DI | ||
4123 | LEAL -4(BX), BX | ||
4124 | CMPL DI, $0x08 | ||
4125 | JBE repeat_two_match_repeat_encodeBlockAsm10B | ||
4126 | CMPL DI, $0x0c | ||
4127 | JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B | ||
4128 | CMPL SI, $0x00000800 | ||
4129 | JB repeat_two_offset_match_repeat_encodeBlockAsm10B | ||
4130 | |||
4131 | cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: | ||
4132 | CMPL BX, $0x00000104 | ||
4133 | JB repeat_three_match_repeat_encodeBlockAsm10B | ||
4134 | LEAL -256(BX), BX | ||
4135 | MOVW $0x0019, (AX) | ||
4136 | MOVW BX, 2(AX) | ||
4137 | ADDQ $0x04, AX | ||
4138 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4139 | |||
4140 | repeat_three_match_repeat_encodeBlockAsm10B: | ||
4141 | LEAL -4(BX), BX | ||
4142 | MOVW $0x0015, (AX) | ||
4143 | MOVB BL, 2(AX) | ||
4144 | ADDQ $0x03, AX | ||
4145 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4146 | |||
4147 | repeat_two_match_repeat_encodeBlockAsm10B: | ||
4148 | SHLL $0x02, BX | ||
4149 | ORL $0x01, BX | ||
4150 | MOVW BX, (AX) | ||
4151 | ADDQ $0x02, AX | ||
4152 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4153 | |||
4154 | repeat_two_offset_match_repeat_encodeBlockAsm10B: | ||
4155 | XORQ DI, DI | ||
4156 | LEAL 1(DI)(BX*4), BX | ||
4157 | MOVB SI, 1(AX) | ||
4158 | SARL $0x08, SI | ||
4159 | SHLL $0x05, SI | ||
4160 | ORL SI, BX | ||
4161 | MOVB BL, (AX) | ||
4162 | ADDQ $0x02, AX | ||
4163 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4164 | |||
4165 | repeat_as_copy_encodeBlockAsm10B: | ||
4166 | // emitCopy | ||
4167 | CMPL BX, $0x40 | ||
4168 | JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B | ||
4169 | CMPL SI, $0x00000800 | ||
4170 | JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B | ||
4171 | MOVL $0x00000001, DI | ||
4172 | LEAL 16(DI), DI | ||
4173 | MOVB SI, 1(AX) | ||
4174 | SHRL $0x08, SI | ||
4175 | SHLL $0x05, SI | ||
4176 | ORL SI, DI | ||
4177 | MOVB DI, (AX) | ||
4178 | ADDQ $0x02, AX | ||
4179 | SUBL $0x08, BX | ||
4180 | |||
4181 | // emitRepeat | ||
4182 | LEAL -4(BX), BX | ||
4183 | JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b | ||
4184 | MOVL BX, DI | ||
4185 | LEAL -4(BX), BX | ||
4186 | CMPL DI, $0x08 | ||
4187 | JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b | ||
4188 | CMPL DI, $0x0c | ||
4189 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b | ||
4190 | CMPL SI, $0x00000800 | ||
4191 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b | ||
4192 | |||
4193 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: | ||
4194 | CMPL BX, $0x00000104 | ||
4195 | JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b | ||
4196 | LEAL -256(BX), BX | ||
4197 | MOVW $0x0019, (AX) | ||
4198 | MOVW BX, 2(AX) | ||
4199 | ADDQ $0x04, AX | ||
4200 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4201 | |||
4202 | repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: | ||
4203 | LEAL -4(BX), BX | ||
4204 | MOVW $0x0015, (AX) | ||
4205 | MOVB BL, 2(AX) | ||
4206 | ADDQ $0x03, AX | ||
4207 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4208 | |||
4209 | repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: | ||
4210 | SHLL $0x02, BX | ||
4211 | ORL $0x01, BX | ||
4212 | MOVW BX, (AX) | ||
4213 | ADDQ $0x02, AX | ||
4214 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4215 | |||
4216 | repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: | ||
4217 | XORQ DI, DI | ||
4218 | LEAL 1(DI)(BX*4), BX | ||
4219 | MOVB SI, 1(AX) | ||
4220 | SARL $0x08, SI | ||
4221 | SHLL $0x05, SI | ||
4222 | ORL SI, BX | ||
4223 | MOVB BL, (AX) | ||
4224 | ADDQ $0x02, AX | ||
4225 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4226 | |||
4227 | long_offset_short_repeat_as_copy_encodeBlockAsm10B: | ||
4228 | MOVB $0xee, (AX) | ||
4229 | MOVW SI, 1(AX) | ||
4230 | LEAL -60(BX), BX | ||
4231 | ADDQ $0x03, AX | ||
4232 | |||
4233 | // emitRepeat | ||
4234 | MOVL BX, DI | ||
4235 | LEAL -4(BX), BX | ||
4236 | CMPL DI, $0x08 | ||
4237 | JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short | ||
4238 | CMPL DI, $0x0c | ||
4239 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short | ||
4240 | CMPL SI, $0x00000800 | ||
4241 | JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short | ||
4242 | |||
4243 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: | ||
4244 | CMPL BX, $0x00000104 | ||
4245 | JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short | ||
4246 | LEAL -256(BX), BX | ||
4247 | MOVW $0x0019, (AX) | ||
4248 | MOVW BX, 2(AX) | ||
4249 | ADDQ $0x04, AX | ||
4250 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4251 | |||
4252 | repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: | ||
4253 | LEAL -4(BX), BX | ||
4254 | MOVW $0x0015, (AX) | ||
4255 | MOVB BL, 2(AX) | ||
4256 | ADDQ $0x03, AX | ||
4257 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4258 | |||
4259 | repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: | ||
4260 | SHLL $0x02, BX | ||
4261 | ORL $0x01, BX | ||
4262 | MOVW BX, (AX) | ||
4263 | ADDQ $0x02, AX | ||
4264 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4265 | |||
4266 | repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: | ||
4267 | XORQ DI, DI | ||
4268 | LEAL 1(DI)(BX*4), BX | ||
4269 | MOVB SI, 1(AX) | ||
4270 | SARL $0x08, SI | ||
4271 | SHLL $0x05, SI | ||
4272 | ORL SI, BX | ||
4273 | MOVB BL, (AX) | ||
4274 | ADDQ $0x02, AX | ||
4275 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4276 | |||
4277 | two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: | ||
4278 | MOVL BX, DI | ||
4279 | SHLL $0x02, DI | ||
4280 | CMPL BX, $0x0c | ||
4281 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B | ||
4282 | CMPL SI, $0x00000800 | ||
4283 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B | ||
4284 | LEAL -15(DI), DI | ||
4285 | MOVB SI, 1(AX) | ||
4286 | SHRL $0x08, SI | ||
4287 | SHLL $0x05, SI | ||
4288 | ORL SI, DI | ||
4289 | MOVB DI, (AX) | ||
4290 | ADDQ $0x02, AX | ||
4291 | JMP repeat_end_emit_encodeBlockAsm10B | ||
4292 | |||
4293 | emit_copy_three_repeat_as_copy_encodeBlockAsm10B: | ||
4294 | LEAL -2(DI), DI | ||
4295 | MOVB DI, (AX) | ||
4296 | MOVW SI, 1(AX) | ||
4297 | ADDQ $0x03, AX | ||
4298 | |||
4299 | repeat_end_emit_encodeBlockAsm10B: | ||
4300 | MOVL CX, 12(SP) | ||
4301 | JMP search_loop_encodeBlockAsm10B | ||
4302 | |||
4303 | no_repeat_found_encodeBlockAsm10B: | ||
4304 | CMPL (DX)(BX*1), SI | ||
4305 | JEQ candidate_match_encodeBlockAsm10B | ||
4306 | SHRQ $0x08, SI | ||
4307 | MOVL 24(SP)(R9*4), BX | ||
4308 | LEAL 2(CX), R8 | ||
4309 | CMPL (DX)(DI*1), SI | ||
4310 | JEQ candidate2_match_encodeBlockAsm10B | ||
4311 | MOVL R8, 24(SP)(R9*4) | ||
4312 | SHRQ $0x08, SI | ||
4313 | CMPL (DX)(BX*1), SI | ||
4314 | JEQ candidate3_match_encodeBlockAsm10B | ||
4315 | MOVL 20(SP), CX | ||
4316 | JMP search_loop_encodeBlockAsm10B | ||
4317 | |||
4318 | candidate3_match_encodeBlockAsm10B: | ||
4319 | ADDL $0x02, CX | ||
4320 | JMP candidate_match_encodeBlockAsm10B | ||
4321 | |||
4322 | candidate2_match_encodeBlockAsm10B: | ||
4323 | MOVL R8, 24(SP)(R9*4) | ||
4324 | INCL CX | ||
4325 | MOVL DI, BX | ||
4326 | |||
4327 | candidate_match_encodeBlockAsm10B: | ||
4328 | MOVL 12(SP), SI | ||
4329 | TESTL BX, BX | ||
4330 | JZ match_extend_back_end_encodeBlockAsm10B | ||
4331 | |||
4332 | match_extend_back_loop_encodeBlockAsm10B: | ||
4333 | CMPL CX, SI | ||
4334 | JBE match_extend_back_end_encodeBlockAsm10B | ||
4335 | MOVB -1(DX)(BX*1), DI | ||
4336 | MOVB -1(DX)(CX*1), R8 | ||
4337 | CMPB DI, R8 | ||
4338 | JNE match_extend_back_end_encodeBlockAsm10B | ||
4339 | LEAL -1(CX), CX | ||
4340 | DECL BX | ||
4341 | JZ match_extend_back_end_encodeBlockAsm10B | ||
4342 | JMP match_extend_back_loop_encodeBlockAsm10B | ||
4343 | |||
4344 | match_extend_back_end_encodeBlockAsm10B: | ||
4345 | MOVL CX, SI | ||
4346 | SUBL 12(SP), SI | ||
4347 | LEAQ 3(AX)(SI*1), SI | ||
4348 | CMPQ SI, (SP) | ||
4349 | JB match_dst_size_check_encodeBlockAsm10B | ||
4350 | MOVQ $0x00000000, ret+48(FP) | ||
4351 | RET | ||
4352 | |||
4353 | match_dst_size_check_encodeBlockAsm10B: | ||
4354 | MOVL CX, SI | ||
4355 | MOVL 12(SP), DI | ||
4356 | CMPL DI, SI | ||
4357 | JEQ emit_literal_done_match_emit_encodeBlockAsm10B | ||
4358 | MOVL SI, R8 | ||
4359 | MOVL SI, 12(SP) | ||
4360 | LEAQ (DX)(DI*1), SI | ||
4361 | SUBL DI, R8 | ||
4362 | LEAL -1(R8), DI | ||
4363 | CMPL DI, $0x3c | ||
4364 | JB one_byte_match_emit_encodeBlockAsm10B | ||
4365 | CMPL DI, $0x00000100 | ||
4366 | JB two_bytes_match_emit_encodeBlockAsm10B | ||
4367 | JB three_bytes_match_emit_encodeBlockAsm10B | ||
4368 | |||
4369 | three_bytes_match_emit_encodeBlockAsm10B: | ||
4370 | MOVB $0xf4, (AX) | ||
4371 | MOVW DI, 1(AX) | ||
4372 | ADDQ $0x03, AX | ||
4373 | JMP memmove_long_match_emit_encodeBlockAsm10B | ||
4374 | |||
4375 | two_bytes_match_emit_encodeBlockAsm10B: | ||
4376 | MOVB $0xf0, (AX) | ||
4377 | MOVB DI, 1(AX) | ||
4378 | ADDQ $0x02, AX | ||
4379 | CMPL DI, $0x40 | ||
4380 | JB memmove_match_emit_encodeBlockAsm10B | ||
4381 | JMP memmove_long_match_emit_encodeBlockAsm10B | ||
4382 | |||
4383 | one_byte_match_emit_encodeBlockAsm10B: | ||
4384 | SHLB $0x02, DI | ||
4385 | MOVB DI, (AX) | ||
4386 | ADDQ $0x01, AX | ||
4387 | |||
4388 | memmove_match_emit_encodeBlockAsm10B: | ||
4389 | LEAQ (AX)(R8*1), DI | ||
4390 | |||
4391 | // genMemMoveShort | ||
4392 | CMPQ R8, $0x08 | ||
4393 | JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 | ||
4394 | CMPQ R8, $0x10 | ||
4395 | JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 | ||
4396 | CMPQ R8, $0x20 | ||
4397 | JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 | ||
4398 | JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 | ||
4399 | |||
4400 | emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: | ||
4401 | MOVQ (SI), R9 | ||
4402 | MOVQ R9, (AX) | ||
4403 | JMP memmove_end_copy_match_emit_encodeBlockAsm10B | ||
4404 | |||
4405 | emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: | ||
4406 | MOVQ (SI), R9 | ||
4407 | MOVQ -8(SI)(R8*1), SI | ||
4408 | MOVQ R9, (AX) | ||
4409 | MOVQ SI, -8(AX)(R8*1) | ||
4410 | JMP memmove_end_copy_match_emit_encodeBlockAsm10B | ||
4411 | |||
4412 | emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: | ||
4413 | MOVOU (SI), X0 | ||
4414 | MOVOU -16(SI)(R8*1), X1 | ||
4415 | MOVOU X0, (AX) | ||
4416 | MOVOU X1, -16(AX)(R8*1) | ||
4417 | JMP memmove_end_copy_match_emit_encodeBlockAsm10B | ||
4418 | |||
4419 | emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: | ||
4420 | MOVOU (SI), X0 | ||
4421 | MOVOU 16(SI), X1 | ||
4422 | MOVOU -32(SI)(R8*1), X2 | ||
4423 | MOVOU -16(SI)(R8*1), X3 | ||
4424 | MOVOU X0, (AX) | ||
4425 | MOVOU X1, 16(AX) | ||
4426 | MOVOU X2, -32(AX)(R8*1) | ||
4427 | MOVOU X3, -16(AX)(R8*1) | ||
4428 | |||
4429 | memmove_end_copy_match_emit_encodeBlockAsm10B: | ||
4430 | MOVQ DI, AX | ||
4431 | JMP emit_literal_done_match_emit_encodeBlockAsm10B | ||
4432 | |||
4433 | memmove_long_match_emit_encodeBlockAsm10B: | ||
4434 | LEAQ (AX)(R8*1), DI | ||
4435 | |||
4436 | // genMemMoveLong | ||
4437 | MOVOU (SI), X0 | ||
4438 | MOVOU 16(SI), X1 | ||
4439 | MOVOU -32(SI)(R8*1), X2 | ||
4440 | MOVOU -16(SI)(R8*1), X3 | ||
4441 | MOVQ R8, R10 | ||
4442 | SHRQ $0x05, R10 | ||
4443 | MOVQ AX, R9 | ||
4444 | ANDL $0x0000001f, R9 | ||
4445 | MOVQ $0x00000040, R11 | ||
4446 | SUBQ R9, R11 | ||
4447 | DECQ R10 | ||
4448 | JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 | ||
4449 | LEAQ -32(SI)(R11*1), R9 | ||
4450 | LEAQ -32(AX)(R11*1), R12 | ||
4451 | |||
4452 | emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: | ||
4453 | MOVOU (R9), X4 | ||
4454 | MOVOU 16(R9), X5 | ||
4455 | MOVOA X4, (R12) | ||
4456 | MOVOA X5, 16(R12) | ||
4457 | ADDQ $0x20, R12 | ||
4458 | ADDQ $0x20, R9 | ||
4459 | ADDQ $0x20, R11 | ||
4460 | DECQ R10 | ||
4461 | JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back | ||
4462 | |||
4463 | emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: | ||
4464 | MOVOU -32(SI)(R11*1), X4 | ||
4465 | MOVOU -16(SI)(R11*1), X5 | ||
4466 | MOVOA X4, -32(AX)(R11*1) | ||
4467 | MOVOA X5, -16(AX)(R11*1) | ||
4468 | ADDQ $0x20, R11 | ||
4469 | CMPQ R8, R11 | ||
4470 | JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 | ||
4471 | MOVOU X0, (AX) | ||
4472 | MOVOU X1, 16(AX) | ||
4473 | MOVOU X2, -32(AX)(R8*1) | ||
4474 | MOVOU X3, -16(AX)(R8*1) | ||
4475 | MOVQ DI, AX | ||
4476 | |||
4477 | emit_literal_done_match_emit_encodeBlockAsm10B: | ||
4478 | match_nolit_loop_encodeBlockAsm10B: | ||
4479 | MOVL CX, SI | ||
4480 | SUBL BX, SI | ||
4481 | MOVL SI, 16(SP) | ||
4482 | ADDL $0x04, CX | ||
4483 | ADDL $0x04, BX | ||
4484 | MOVQ src_len+32(FP), SI | ||
4485 | SUBL CX, SI | ||
4486 | LEAQ (DX)(CX*1), DI | ||
4487 | LEAQ (DX)(BX*1), BX | ||
4488 | |||
4489 | // matchLen | ||
4490 | XORL R9, R9 | ||
4491 | |||
4492 | matchlen_loopback_16_match_nolit_encodeBlockAsm10B: | ||
4493 | CMPL SI, $0x10 | ||
4494 | JB matchlen_match8_match_nolit_encodeBlockAsm10B | ||
4495 | MOVQ (DI)(R9*1), R8 | ||
4496 | MOVQ 8(DI)(R9*1), R10 | ||
4497 | XORQ (BX)(R9*1), R8 | ||
4498 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B | ||
4499 | XORQ 8(BX)(R9*1), R10 | ||
4500 | JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B | ||
4501 | LEAL -16(SI), SI | ||
4502 | LEAL 16(R9), R9 | ||
4503 | JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B | ||
4504 | |||
4505 | matchlen_bsf_16match_nolit_encodeBlockAsm10B: | ||
4506 | #ifdef GOAMD64_v3 | ||
4507 | TZCNTQ R10, R10 | ||
4508 | |||
4509 | #else | ||
4510 | BSFQ R10, R10 | ||
4511 | |||
4512 | #endif | ||
4513 | SARQ $0x03, R10 | ||
4514 | LEAL 8(R9)(R10*1), R9 | ||
4515 | JMP match_nolit_end_encodeBlockAsm10B | ||
4516 | |||
4517 | matchlen_match8_match_nolit_encodeBlockAsm10B: | ||
4518 | CMPL SI, $0x08 | ||
4519 | JB matchlen_match4_match_nolit_encodeBlockAsm10B | ||
4520 | MOVQ (DI)(R9*1), R8 | ||
4521 | XORQ (BX)(R9*1), R8 | ||
4522 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B | ||
4523 | LEAL -8(SI), SI | ||
4524 | LEAL 8(R9), R9 | ||
4525 | JMP matchlen_match4_match_nolit_encodeBlockAsm10B | ||
4526 | |||
4527 | matchlen_bsf_8_match_nolit_encodeBlockAsm10B: | ||
4528 | #ifdef GOAMD64_v3 | ||
4529 | TZCNTQ R8, R8 | ||
4530 | |||
4531 | #else | ||
4532 | BSFQ R8, R8 | ||
4533 | |||
4534 | #endif | ||
4535 | SARQ $0x03, R8 | ||
4536 | LEAL (R9)(R8*1), R9 | ||
4537 | JMP match_nolit_end_encodeBlockAsm10B | ||
4538 | |||
4539 | matchlen_match4_match_nolit_encodeBlockAsm10B: | ||
4540 | CMPL SI, $0x04 | ||
4541 | JB matchlen_match2_match_nolit_encodeBlockAsm10B | ||
4542 | MOVL (DI)(R9*1), R8 | ||
4543 | CMPL (BX)(R9*1), R8 | ||
4544 | JNE matchlen_match2_match_nolit_encodeBlockAsm10B | ||
4545 | LEAL -4(SI), SI | ||
4546 | LEAL 4(R9), R9 | ||
4547 | |||
4548 | matchlen_match2_match_nolit_encodeBlockAsm10B: | ||
4549 | CMPL SI, $0x01 | ||
4550 | JE matchlen_match1_match_nolit_encodeBlockAsm10B | ||
4551 | JB match_nolit_end_encodeBlockAsm10B | ||
4552 | MOVW (DI)(R9*1), R8 | ||
4553 | CMPW (BX)(R9*1), R8 | ||
4554 | JNE matchlen_match1_match_nolit_encodeBlockAsm10B | ||
4555 | LEAL 2(R9), R9 | ||
4556 | SUBL $0x02, SI | ||
4557 | JZ match_nolit_end_encodeBlockAsm10B | ||
4558 | |||
4559 | matchlen_match1_match_nolit_encodeBlockAsm10B: | ||
4560 | MOVB (DI)(R9*1), R8 | ||
4561 | CMPB (BX)(R9*1), R8 | ||
4562 | JNE match_nolit_end_encodeBlockAsm10B | ||
4563 | LEAL 1(R9), R9 | ||
4564 | |||
4565 | match_nolit_end_encodeBlockAsm10B: | ||
4566 | ADDL R9, CX | ||
4567 | MOVL 16(SP), BX | ||
4568 | ADDL $0x04, R9 | ||
4569 | MOVL CX, 12(SP) | ||
4570 | |||
4571 | // emitCopy | ||
4572 | CMPL R9, $0x40 | ||
4573 | JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B | ||
4574 | CMPL BX, $0x00000800 | ||
4575 | JAE long_offset_short_match_nolit_encodeBlockAsm10B | ||
4576 | MOVL $0x00000001, SI | ||
4577 | LEAL 16(SI), SI | ||
4578 | MOVB BL, 1(AX) | ||
4579 | SHRL $0x08, BX | ||
4580 | SHLL $0x05, BX | ||
4581 | ORL BX, SI | ||
4582 | MOVB SI, (AX) | ||
4583 | ADDQ $0x02, AX | ||
4584 | SUBL $0x08, R9 | ||
4585 | |||
4586 | // emitRepeat | ||
4587 | LEAL -4(R9), R9 | ||
4588 | JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b | ||
4589 | MOVL R9, SI | ||
4590 | LEAL -4(R9), R9 | ||
4591 | CMPL SI, $0x08 | ||
4592 | JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b | ||
4593 | CMPL SI, $0x0c | ||
4594 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b | ||
4595 | CMPL BX, $0x00000800 | ||
4596 | JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b | ||
4597 | |||
4598 | cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: | ||
4599 | CMPL R9, $0x00000104 | ||
4600 | JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b | ||
4601 | LEAL -256(R9), R9 | ||
4602 | MOVW $0x0019, (AX) | ||
4603 | MOVW R9, 2(AX) | ||
4604 | ADDQ $0x04, AX | ||
4605 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4606 | |||
4607 | repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: | ||
4608 | LEAL -4(R9), R9 | ||
4609 | MOVW $0x0015, (AX) | ||
4610 | MOVB R9, 2(AX) | ||
4611 | ADDQ $0x03, AX | ||
4612 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4613 | |||
4614 | repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: | ||
4615 | SHLL $0x02, R9 | ||
4616 | ORL $0x01, R9 | ||
4617 | MOVW R9, (AX) | ||
4618 | ADDQ $0x02, AX | ||
4619 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4620 | |||
4621 | repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: | ||
4622 | XORQ SI, SI | ||
4623 | LEAL 1(SI)(R9*4), R9 | ||
4624 | MOVB BL, 1(AX) | ||
4625 | SARL $0x08, BX | ||
4626 | SHLL $0x05, BX | ||
4627 | ORL BX, R9 | ||
4628 | MOVB R9, (AX) | ||
4629 | ADDQ $0x02, AX | ||
4630 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4631 | |||
4632 | long_offset_short_match_nolit_encodeBlockAsm10B: | ||
4633 | MOVB $0xee, (AX) | ||
4634 | MOVW BX, 1(AX) | ||
4635 | LEAL -60(R9), R9 | ||
4636 | ADDQ $0x03, AX | ||
4637 | |||
4638 | // emitRepeat | ||
4639 | MOVL R9, SI | ||
4640 | LEAL -4(R9), R9 | ||
4641 | CMPL SI, $0x08 | ||
4642 | JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short | ||
4643 | CMPL SI, $0x0c | ||
4644 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short | ||
4645 | CMPL BX, $0x00000800 | ||
4646 | JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short | ||
4647 | |||
4648 | cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: | ||
4649 | CMPL R9, $0x00000104 | ||
4650 | JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short | ||
4651 | LEAL -256(R9), R9 | ||
4652 | MOVW $0x0019, (AX) | ||
4653 | MOVW R9, 2(AX) | ||
4654 | ADDQ $0x04, AX | ||
4655 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4656 | |||
4657 | repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: | ||
4658 | LEAL -4(R9), R9 | ||
4659 | MOVW $0x0015, (AX) | ||
4660 | MOVB R9, 2(AX) | ||
4661 | ADDQ $0x03, AX | ||
4662 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4663 | |||
4664 | repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: | ||
4665 | SHLL $0x02, R9 | ||
4666 | ORL $0x01, R9 | ||
4667 | MOVW R9, (AX) | ||
4668 | ADDQ $0x02, AX | ||
4669 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4670 | |||
4671 | repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: | ||
4672 | XORQ SI, SI | ||
4673 | LEAL 1(SI)(R9*4), R9 | ||
4674 | MOVB BL, 1(AX) | ||
4675 | SARL $0x08, BX | ||
4676 | SHLL $0x05, BX | ||
4677 | ORL BX, R9 | ||
4678 | MOVB R9, (AX) | ||
4679 | ADDQ $0x02, AX | ||
4680 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4681 | |||
4682 | two_byte_offset_short_match_nolit_encodeBlockAsm10B: | ||
4683 | MOVL R9, SI | ||
4684 | SHLL $0x02, SI | ||
4685 | CMPL R9, $0x0c | ||
4686 | JAE emit_copy_three_match_nolit_encodeBlockAsm10B | ||
4687 | CMPL BX, $0x00000800 | ||
4688 | JAE emit_copy_three_match_nolit_encodeBlockAsm10B | ||
4689 | LEAL -15(SI), SI | ||
4690 | MOVB BL, 1(AX) | ||
4691 | SHRL $0x08, BX | ||
4692 | SHLL $0x05, BX | ||
4693 | ORL BX, SI | ||
4694 | MOVB SI, (AX) | ||
4695 | ADDQ $0x02, AX | ||
4696 | JMP match_nolit_emitcopy_end_encodeBlockAsm10B | ||
4697 | |||
4698 | emit_copy_three_match_nolit_encodeBlockAsm10B: | ||
4699 | LEAL -2(SI), SI | ||
4700 | MOVB SI, (AX) | ||
4701 | MOVW BX, 1(AX) | ||
4702 | ADDQ $0x03, AX | ||
4703 | |||
4704 | match_nolit_emitcopy_end_encodeBlockAsm10B: | ||
4705 | CMPL CX, 8(SP) | ||
4706 | JAE emit_remainder_encodeBlockAsm10B | ||
4707 | MOVQ -2(DX)(CX*1), SI | ||
4708 | CMPQ AX, (SP) | ||
4709 | JB match_nolit_dst_ok_encodeBlockAsm10B | ||
4710 | MOVQ $0x00000000, ret+48(FP) | ||
4711 | RET | ||
4712 | |||
4713 | match_nolit_dst_ok_encodeBlockAsm10B: | ||
4714 | MOVQ $0x9e3779b1, R8 | ||
4715 | MOVQ SI, DI | ||
4716 | SHRQ $0x10, SI | ||
4717 | MOVQ SI, BX | ||
4718 | SHLQ $0x20, DI | ||
4719 | IMULQ R8, DI | ||
4720 | SHRQ $0x36, DI | ||
4721 | SHLQ $0x20, BX | ||
4722 | IMULQ R8, BX | ||
4723 | SHRQ $0x36, BX | ||
4724 | LEAL -2(CX), R8 | ||
4725 | LEAQ 24(SP)(BX*4), R9 | ||
4726 | MOVL (R9), BX | ||
4727 | MOVL R8, 24(SP)(DI*4) | ||
4728 | MOVL CX, (R9) | ||
4729 | CMPL (DX)(BX*1), SI | ||
4730 | JEQ match_nolit_loop_encodeBlockAsm10B | ||
4731 | INCL CX | ||
4732 | JMP search_loop_encodeBlockAsm10B | ||
4733 | |||
4734 | emit_remainder_encodeBlockAsm10B: | ||
4735 | MOVQ src_len+32(FP), CX | ||
4736 | SUBL 12(SP), CX | ||
4737 | LEAQ 3(AX)(CX*1), CX | ||
4738 | CMPQ CX, (SP) | ||
4739 | JB emit_remainder_ok_encodeBlockAsm10B | ||
4740 | MOVQ $0x00000000, ret+48(FP) | ||
4741 | RET | ||
4742 | |||
4743 | emit_remainder_ok_encodeBlockAsm10B: | ||
4744 | MOVQ src_len+32(FP), CX | ||
4745 | MOVL 12(SP), BX | ||
4746 | CMPL BX, CX | ||
4747 | JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B | ||
4748 | MOVL CX, SI | ||
4749 | MOVL CX, 12(SP) | ||
4750 | LEAQ (DX)(BX*1), CX | ||
4751 | SUBL BX, SI | ||
4752 | LEAL -1(SI), DX | ||
4753 | CMPL DX, $0x3c | ||
4754 | JB one_byte_emit_remainder_encodeBlockAsm10B | ||
4755 | CMPL DX, $0x00000100 | ||
4756 | JB two_bytes_emit_remainder_encodeBlockAsm10B | ||
4757 | JB three_bytes_emit_remainder_encodeBlockAsm10B | ||
4758 | |||
4759 | three_bytes_emit_remainder_encodeBlockAsm10B: | ||
4760 | MOVB $0xf4, (AX) | ||
4761 | MOVW DX, 1(AX) | ||
4762 | ADDQ $0x03, AX | ||
4763 | JMP memmove_long_emit_remainder_encodeBlockAsm10B | ||
4764 | |||
4765 | two_bytes_emit_remainder_encodeBlockAsm10B: | ||
4766 | MOVB $0xf0, (AX) | ||
4767 | MOVB DL, 1(AX) | ||
4768 | ADDQ $0x02, AX | ||
4769 | CMPL DX, $0x40 | ||
4770 | JB memmove_emit_remainder_encodeBlockAsm10B | ||
4771 | JMP memmove_long_emit_remainder_encodeBlockAsm10B | ||
4772 | |||
4773 | one_byte_emit_remainder_encodeBlockAsm10B: | ||
4774 | SHLB $0x02, DL | ||
4775 | MOVB DL, (AX) | ||
4776 | ADDQ $0x01, AX | ||
4777 | |||
4778 | memmove_emit_remainder_encodeBlockAsm10B: | ||
4779 | LEAQ (AX)(SI*1), DX | ||
4780 | MOVL SI, BX | ||
4781 | |||
4782 | // genMemMoveShort | ||
4783 | CMPQ BX, $0x03 | ||
4784 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 | ||
4785 | JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 | ||
4786 | CMPQ BX, $0x08 | ||
4787 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 | ||
4788 | CMPQ BX, $0x10 | ||
4789 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 | ||
4790 | CMPQ BX, $0x20 | ||
4791 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 | ||
4792 | JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 | ||
4793 | |||
4794 | emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: | ||
4795 | MOVB (CX), SI | ||
4796 | MOVB -1(CX)(BX*1), CL | ||
4797 | MOVB SI, (AX) | ||
4798 | MOVB CL, -1(AX)(BX*1) | ||
4799 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B | ||
4800 | |||
4801 | emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: | ||
4802 | MOVW (CX), SI | ||
4803 | MOVB 2(CX), CL | ||
4804 | MOVW SI, (AX) | ||
4805 | MOVB CL, 2(AX) | ||
4806 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B | ||
4807 | |||
4808 | emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: | ||
4809 | MOVL (CX), SI | ||
4810 | MOVL -4(CX)(BX*1), CX | ||
4811 | MOVL SI, (AX) | ||
4812 | MOVL CX, -4(AX)(BX*1) | ||
4813 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B | ||
4814 | |||
4815 | emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: | ||
4816 | MOVQ (CX), SI | ||
4817 | MOVQ -8(CX)(BX*1), CX | ||
4818 | MOVQ SI, (AX) | ||
4819 | MOVQ CX, -8(AX)(BX*1) | ||
4820 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B | ||
4821 | |||
4822 | emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: | ||
4823 | MOVOU (CX), X0 | ||
4824 | MOVOU -16(CX)(BX*1), X1 | ||
4825 | MOVOU X0, (AX) | ||
4826 | MOVOU X1, -16(AX)(BX*1) | ||
4827 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B | ||
4828 | |||
4829 | emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: | ||
4830 | MOVOU (CX), X0 | ||
4831 | MOVOU 16(CX), X1 | ||
4832 | MOVOU -32(CX)(BX*1), X2 | ||
4833 | MOVOU -16(CX)(BX*1), X3 | ||
4834 | MOVOU X0, (AX) | ||
4835 | MOVOU X1, 16(AX) | ||
4836 | MOVOU X2, -32(AX)(BX*1) | ||
4837 | MOVOU X3, -16(AX)(BX*1) | ||
4838 | |||
4839 | memmove_end_copy_emit_remainder_encodeBlockAsm10B: | ||
4840 | MOVQ DX, AX | ||
4841 | JMP emit_literal_done_emit_remainder_encodeBlockAsm10B | ||
4842 | |||
4843 | memmove_long_emit_remainder_encodeBlockAsm10B: | ||
4844 | LEAQ (AX)(SI*1), DX | ||
4845 | MOVL SI, BX | ||
4846 | |||
4847 | // genMemMoveLong | ||
4848 | MOVOU (CX), X0 | ||
4849 | MOVOU 16(CX), X1 | ||
4850 | MOVOU -32(CX)(BX*1), X2 | ||
4851 | MOVOU -16(CX)(BX*1), X3 | ||
4852 | MOVQ BX, DI | ||
4853 | SHRQ $0x05, DI | ||
4854 | MOVQ AX, SI | ||
4855 | ANDL $0x0000001f, SI | ||
4856 | MOVQ $0x00000040, R8 | ||
4857 | SUBQ SI, R8 | ||
4858 | DECQ DI | ||
4859 | JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 | ||
4860 | LEAQ -32(CX)(R8*1), SI | ||
4861 | LEAQ -32(AX)(R8*1), R9 | ||
4862 | |||
4863 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: | ||
4864 | MOVOU (SI), X4 | ||
4865 | MOVOU 16(SI), X5 | ||
4866 | MOVOA X4, (R9) | ||
4867 | MOVOA X5, 16(R9) | ||
4868 | ADDQ $0x20, R9 | ||
4869 | ADDQ $0x20, SI | ||
4870 | ADDQ $0x20, R8 | ||
4871 | DECQ DI | ||
4872 | JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back | ||
4873 | |||
4874 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: | ||
4875 | MOVOU -32(CX)(R8*1), X4 | ||
4876 | MOVOU -16(CX)(R8*1), X5 | ||
4877 | MOVOA X4, -32(AX)(R8*1) | ||
4878 | MOVOA X5, -16(AX)(R8*1) | ||
4879 | ADDQ $0x20, R8 | ||
4880 | CMPQ BX, R8 | ||
4881 | JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 | ||
4882 | MOVOU X0, (AX) | ||
4883 | MOVOU X1, 16(AX) | ||
4884 | MOVOU X2, -32(AX)(BX*1) | ||
4885 | MOVOU X3, -16(AX)(BX*1) | ||
4886 | MOVQ DX, AX | ||
4887 | |||
4888 | emit_literal_done_emit_remainder_encodeBlockAsm10B: | ||
4889 | MOVQ dst_base+0(FP), CX | ||
4890 | SUBQ CX, AX | ||
4891 | MOVQ AX, ret+48(FP) | ||
4892 | RET | ||
4893 | |||
4894 | // func encodeBlockAsm8B(dst []byte, src []byte) int | ||
4895 | // Requires: BMI, SSE2 | ||
4896 | TEXT ·encodeBlockAsm8B(SB), $1048-56 | ||
4897 | MOVQ dst_base+0(FP), AX | ||
4898 | MOVQ $0x00000008, CX | ||
4899 | LEAQ 24(SP), DX | ||
4900 | PXOR X0, X0 | ||
4901 | |||
4902 | zero_loop_encodeBlockAsm8B: | ||
4903 | MOVOU X0, (DX) | ||
4904 | MOVOU X0, 16(DX) | ||
4905 | MOVOU X0, 32(DX) | ||
4906 | MOVOU X0, 48(DX) | ||
4907 | MOVOU X0, 64(DX) | ||
4908 | MOVOU X0, 80(DX) | ||
4909 | MOVOU X0, 96(DX) | ||
4910 | MOVOU X0, 112(DX) | ||
4911 | ADDQ $0x80, DX | ||
4912 | DECQ CX | ||
4913 | JNZ zero_loop_encodeBlockAsm8B | ||
4914 | MOVL $0x00000000, 12(SP) | ||
4915 | MOVQ src_len+32(FP), CX | ||
4916 | LEAQ -9(CX), DX | ||
4917 | LEAQ -8(CX), BX | ||
4918 | MOVL BX, 8(SP) | ||
4919 | SHRQ $0x05, CX | ||
4920 | SUBL CX, DX | ||
4921 | LEAQ (AX)(DX*1), DX | ||
4922 | MOVQ DX, (SP) | ||
4923 | MOVL $0x00000001, CX | ||
4924 | MOVL CX, 16(SP) | ||
4925 | MOVQ src_base+24(FP), DX | ||
4926 | |||
4927 | search_loop_encodeBlockAsm8B: | ||
4928 | MOVL CX, BX | ||
4929 | SUBL 12(SP), BX | ||
4930 | SHRL $0x04, BX | ||
4931 | LEAL 4(CX)(BX*1), BX | ||
4932 | CMPL BX, 8(SP) | ||
4933 | JAE emit_remainder_encodeBlockAsm8B | ||
4934 | MOVQ (DX)(CX*1), SI | ||
4935 | MOVL BX, 20(SP) | ||
4936 | MOVQ $0x9e3779b1, R8 | ||
4937 | MOVQ SI, R9 | ||
4938 | MOVQ SI, R10 | ||
4939 | SHRQ $0x08, R10 | ||
4940 | SHLQ $0x20, R9 | ||
4941 | IMULQ R8, R9 | ||
4942 | SHRQ $0x38, R9 | ||
4943 | SHLQ $0x20, R10 | ||
4944 | IMULQ R8, R10 | ||
4945 | SHRQ $0x38, R10 | ||
4946 | MOVL 24(SP)(R9*4), BX | ||
4947 | MOVL 24(SP)(R10*4), DI | ||
4948 | MOVL CX, 24(SP)(R9*4) | ||
4949 | LEAL 1(CX), R9 | ||
4950 | MOVL R9, 24(SP)(R10*4) | ||
4951 | MOVQ SI, R9 | ||
4952 | SHRQ $0x10, R9 | ||
4953 | SHLQ $0x20, R9 | ||
4954 | IMULQ R8, R9 | ||
4955 | SHRQ $0x38, R9 | ||
4956 | MOVL CX, R8 | ||
4957 | SUBL 16(SP), R8 | ||
4958 | MOVL 1(DX)(R8*1), R10 | ||
4959 | MOVQ SI, R8 | ||
4960 | SHRQ $0x08, R8 | ||
4961 | CMPL R8, R10 | ||
4962 | JNE no_repeat_found_encodeBlockAsm8B | ||
4963 | LEAL 1(CX), SI | ||
4964 | MOVL 12(SP), DI | ||
4965 | MOVL SI, BX | ||
4966 | SUBL 16(SP), BX | ||
4967 | JZ repeat_extend_back_end_encodeBlockAsm8B | ||
4968 | |||
4969 | repeat_extend_back_loop_encodeBlockAsm8B: | ||
4970 | CMPL SI, DI | ||
4971 | JBE repeat_extend_back_end_encodeBlockAsm8B | ||
4972 | MOVB -1(DX)(BX*1), R8 | ||
4973 | MOVB -1(DX)(SI*1), R9 | ||
4974 | CMPB R8, R9 | ||
4975 | JNE repeat_extend_back_end_encodeBlockAsm8B | ||
4976 | LEAL -1(SI), SI | ||
4977 | DECL BX | ||
4978 | JNZ repeat_extend_back_loop_encodeBlockAsm8B | ||
4979 | |||
4980 | repeat_extend_back_end_encodeBlockAsm8B: | ||
4981 | MOVL 12(SP), BX | ||
4982 | CMPL BX, SI | ||
4983 | JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B | ||
4984 | MOVL SI, R8 | ||
4985 | MOVL SI, 12(SP) | ||
4986 | LEAQ (DX)(BX*1), R9 | ||
4987 | SUBL BX, R8 | ||
4988 | LEAL -1(R8), BX | ||
4989 | CMPL BX, $0x3c | ||
4990 | JB one_byte_repeat_emit_encodeBlockAsm8B | ||
4991 | CMPL BX, $0x00000100 | ||
4992 | JB two_bytes_repeat_emit_encodeBlockAsm8B | ||
4993 | JB three_bytes_repeat_emit_encodeBlockAsm8B | ||
4994 | |||
4995 | three_bytes_repeat_emit_encodeBlockAsm8B: | ||
4996 | MOVB $0xf4, (AX) | ||
4997 | MOVW BX, 1(AX) | ||
4998 | ADDQ $0x03, AX | ||
4999 | JMP memmove_long_repeat_emit_encodeBlockAsm8B | ||
5000 | |||
5001 | two_bytes_repeat_emit_encodeBlockAsm8B: | ||
5002 | MOVB $0xf0, (AX) | ||
5003 | MOVB BL, 1(AX) | ||
5004 | ADDQ $0x02, AX | ||
5005 | CMPL BX, $0x40 | ||
5006 | JB memmove_repeat_emit_encodeBlockAsm8B | ||
5007 | JMP memmove_long_repeat_emit_encodeBlockAsm8B | ||
5008 | |||
5009 | one_byte_repeat_emit_encodeBlockAsm8B: | ||
5010 | SHLB $0x02, BL | ||
5011 | MOVB BL, (AX) | ||
5012 | ADDQ $0x01, AX | ||
5013 | |||
5014 | memmove_repeat_emit_encodeBlockAsm8B: | ||
5015 | LEAQ (AX)(R8*1), BX | ||
5016 | |||
5017 | // genMemMoveShort | ||
5018 | CMPQ R8, $0x08 | ||
5019 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 | ||
5020 | CMPQ R8, $0x10 | ||
5021 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 | ||
5022 | CMPQ R8, $0x20 | ||
5023 | JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 | ||
5024 | JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 | ||
5025 | |||
5026 | emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: | ||
5027 | MOVQ (R9), R10 | ||
5028 | MOVQ R10, (AX) | ||
5029 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B | ||
5030 | |||
5031 | emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: | ||
5032 | MOVQ (R9), R10 | ||
5033 | MOVQ -8(R9)(R8*1), R9 | ||
5034 | MOVQ R10, (AX) | ||
5035 | MOVQ R9, -8(AX)(R8*1) | ||
5036 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B | ||
5037 | |||
5038 | emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: | ||
5039 | MOVOU (R9), X0 | ||
5040 | MOVOU -16(R9)(R8*1), X1 | ||
5041 | MOVOU X0, (AX) | ||
5042 | MOVOU X1, -16(AX)(R8*1) | ||
5043 | JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B | ||
5044 | |||
5045 | emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: | ||
5046 | MOVOU (R9), X0 | ||
5047 | MOVOU 16(R9), X1 | ||
5048 | MOVOU -32(R9)(R8*1), X2 | ||
5049 | MOVOU -16(R9)(R8*1), X3 | ||
5050 | MOVOU X0, (AX) | ||
5051 | MOVOU X1, 16(AX) | ||
5052 | MOVOU X2, -32(AX)(R8*1) | ||
5053 | MOVOU X3, -16(AX)(R8*1) | ||
5054 | |||
5055 | memmove_end_copy_repeat_emit_encodeBlockAsm8B: | ||
5056 | MOVQ BX, AX | ||
5057 | JMP emit_literal_done_repeat_emit_encodeBlockAsm8B | ||
5058 | |||
5059 | memmove_long_repeat_emit_encodeBlockAsm8B: | ||
5060 | LEAQ (AX)(R8*1), BX | ||
5061 | |||
5062 | // genMemMoveLong | ||
5063 | MOVOU (R9), X0 | ||
5064 | MOVOU 16(R9), X1 | ||
5065 | MOVOU -32(R9)(R8*1), X2 | ||
5066 | MOVOU -16(R9)(R8*1), X3 | ||
5067 | MOVQ R8, R11 | ||
5068 | SHRQ $0x05, R11 | ||
5069 | MOVQ AX, R10 | ||
5070 | ANDL $0x0000001f, R10 | ||
5071 | MOVQ $0x00000040, R12 | ||
5072 | SUBQ R10, R12 | ||
5073 | DECQ R11 | ||
5074 | JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 | ||
5075 | LEAQ -32(R9)(R12*1), R10 | ||
5076 | LEAQ -32(AX)(R12*1), R13 | ||
5077 | |||
5078 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: | ||
5079 | MOVOU (R10), X4 | ||
5080 | MOVOU 16(R10), X5 | ||
5081 | MOVOA X4, (R13) | ||
5082 | MOVOA X5, 16(R13) | ||
5083 | ADDQ $0x20, R13 | ||
5084 | ADDQ $0x20, R10 | ||
5085 | ADDQ $0x20, R12 | ||
5086 | DECQ R11 | ||
5087 | JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back | ||
5088 | |||
5089 | emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: | ||
5090 | MOVOU -32(R9)(R12*1), X4 | ||
5091 | MOVOU -16(R9)(R12*1), X5 | ||
5092 | MOVOA X4, -32(AX)(R12*1) | ||
5093 | MOVOA X5, -16(AX)(R12*1) | ||
5094 | ADDQ $0x20, R12 | ||
5095 | CMPQ R8, R12 | ||
5096 | JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 | ||
5097 | MOVOU X0, (AX) | ||
5098 | MOVOU X1, 16(AX) | ||
5099 | MOVOU X2, -32(AX)(R8*1) | ||
5100 | MOVOU X3, -16(AX)(R8*1) | ||
5101 | MOVQ BX, AX | ||
5102 | |||
5103 | emit_literal_done_repeat_emit_encodeBlockAsm8B: | ||
5104 | ADDL $0x05, CX | ||
5105 | MOVL CX, BX | ||
5106 | SUBL 16(SP), BX | ||
5107 | MOVQ src_len+32(FP), R8 | ||
5108 | SUBL CX, R8 | ||
5109 | LEAQ (DX)(CX*1), R9 | ||
5110 | LEAQ (DX)(BX*1), BX | ||
5111 | |||
5112 | // matchLen | ||
5113 | XORL R11, R11 | ||
5114 | |||
5115 | matchlen_loopback_16_repeat_extend_encodeBlockAsm8B: | ||
5116 | CMPL R8, $0x10 | ||
5117 | JB matchlen_match8_repeat_extend_encodeBlockAsm8B | ||
5118 | MOVQ (R9)(R11*1), R10 | ||
5119 | MOVQ 8(R9)(R11*1), R12 | ||
5120 | XORQ (BX)(R11*1), R10 | ||
5121 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B | ||
5122 | XORQ 8(BX)(R11*1), R12 | ||
5123 | JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B | ||
5124 | LEAL -16(R8), R8 | ||
5125 | LEAL 16(R11), R11 | ||
5126 | JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B | ||
5127 | |||
5128 | matchlen_bsf_16repeat_extend_encodeBlockAsm8B: | ||
5129 | #ifdef GOAMD64_v3 | ||
5130 | TZCNTQ R12, R12 | ||
5131 | |||
5132 | #else | ||
5133 | BSFQ R12, R12 | ||
5134 | |||
5135 | #endif | ||
5136 | SARQ $0x03, R12 | ||
5137 | LEAL 8(R11)(R12*1), R11 | ||
5138 | JMP repeat_extend_forward_end_encodeBlockAsm8B | ||
5139 | |||
5140 | matchlen_match8_repeat_extend_encodeBlockAsm8B: | ||
5141 | CMPL R8, $0x08 | ||
5142 | JB matchlen_match4_repeat_extend_encodeBlockAsm8B | ||
5143 | MOVQ (R9)(R11*1), R10 | ||
5144 | XORQ (BX)(R11*1), R10 | ||
5145 | JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B | ||
5146 | LEAL -8(R8), R8 | ||
5147 | LEAL 8(R11), R11 | ||
5148 | JMP matchlen_match4_repeat_extend_encodeBlockAsm8B | ||
5149 | |||
5150 | matchlen_bsf_8_repeat_extend_encodeBlockAsm8B: | ||
5151 | #ifdef GOAMD64_v3 | ||
5152 | TZCNTQ R10, R10 | ||
5153 | |||
5154 | #else | ||
5155 | BSFQ R10, R10 | ||
5156 | |||
5157 | #endif | ||
5158 | SARQ $0x03, R10 | ||
5159 | LEAL (R11)(R10*1), R11 | ||
5160 | JMP repeat_extend_forward_end_encodeBlockAsm8B | ||
5161 | |||
5162 | matchlen_match4_repeat_extend_encodeBlockAsm8B: | ||
5163 | CMPL R8, $0x04 | ||
5164 | JB matchlen_match2_repeat_extend_encodeBlockAsm8B | ||
5165 | MOVL (R9)(R11*1), R10 | ||
5166 | CMPL (BX)(R11*1), R10 | ||
5167 | JNE matchlen_match2_repeat_extend_encodeBlockAsm8B | ||
5168 | LEAL -4(R8), R8 | ||
5169 | LEAL 4(R11), R11 | ||
5170 | |||
5171 | matchlen_match2_repeat_extend_encodeBlockAsm8B: | ||
5172 | CMPL R8, $0x01 | ||
5173 | JE matchlen_match1_repeat_extend_encodeBlockAsm8B | ||
5174 | JB repeat_extend_forward_end_encodeBlockAsm8B | ||
5175 | MOVW (R9)(R11*1), R10 | ||
5176 | CMPW (BX)(R11*1), R10 | ||
5177 | JNE matchlen_match1_repeat_extend_encodeBlockAsm8B | ||
5178 | LEAL 2(R11), R11 | ||
5179 | SUBL $0x02, R8 | ||
5180 | JZ repeat_extend_forward_end_encodeBlockAsm8B | ||
5181 | |||
5182 | matchlen_match1_repeat_extend_encodeBlockAsm8B: | ||
5183 | MOVB (R9)(R11*1), R10 | ||
5184 | CMPB (BX)(R11*1), R10 | ||
5185 | JNE repeat_extend_forward_end_encodeBlockAsm8B | ||
5186 | LEAL 1(R11), R11 | ||
5187 | |||
5188 | repeat_extend_forward_end_encodeBlockAsm8B: | ||
5189 | ADDL R11, CX | ||
5190 | MOVL CX, BX | ||
5191 | SUBL SI, BX | ||
5192 | MOVL 16(SP), SI | ||
5193 | TESTL DI, DI | ||
5194 | JZ repeat_as_copy_encodeBlockAsm8B | ||
5195 | |||
5196 | // emitRepeat | ||
5197 | MOVL BX, SI | ||
5198 | LEAL -4(BX), BX | ||
5199 | CMPL SI, $0x08 | ||
5200 | JBE repeat_two_match_repeat_encodeBlockAsm8B | ||
5201 | CMPL SI, $0x0c | ||
5202 | JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B | ||
5203 | |||
5204 | cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: | ||
5205 | CMPL BX, $0x00000104 | ||
5206 | JB repeat_three_match_repeat_encodeBlockAsm8B | ||
5207 | LEAL -256(BX), BX | ||
5208 | MOVW $0x0019, (AX) | ||
5209 | MOVW BX, 2(AX) | ||
5210 | ADDQ $0x04, AX | ||
5211 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5212 | |||
5213 | repeat_three_match_repeat_encodeBlockAsm8B: | ||
5214 | LEAL -4(BX), BX | ||
5215 | MOVW $0x0015, (AX) | ||
5216 | MOVB BL, 2(AX) | ||
5217 | ADDQ $0x03, AX | ||
5218 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5219 | |||
5220 | repeat_two_match_repeat_encodeBlockAsm8B: | ||
5221 | SHLL $0x02, BX | ||
5222 | ORL $0x01, BX | ||
5223 | MOVW BX, (AX) | ||
5224 | ADDQ $0x02, AX | ||
5225 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5226 | XORQ DI, DI | ||
5227 | LEAL 1(DI)(BX*4), BX | ||
5228 | MOVB SI, 1(AX) | ||
5229 | SARL $0x08, SI | ||
5230 | SHLL $0x05, SI | ||
5231 | ORL SI, BX | ||
5232 | MOVB BL, (AX) | ||
5233 | ADDQ $0x02, AX | ||
5234 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5235 | |||
5236 | repeat_as_copy_encodeBlockAsm8B: | ||
5237 | // emitCopy | ||
5238 | CMPL BX, $0x40 | ||
5239 | JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B | ||
5240 | CMPL SI, $0x00000800 | ||
5241 | JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B | ||
5242 | MOVL $0x00000001, DI | ||
5243 | LEAL 16(DI), DI | ||
5244 | MOVB SI, 1(AX) | ||
5245 | SHRL $0x08, SI | ||
5246 | SHLL $0x05, SI | ||
5247 | ORL SI, DI | ||
5248 | MOVB DI, (AX) | ||
5249 | ADDQ $0x02, AX | ||
5250 | SUBL $0x08, BX | ||
5251 | |||
5252 | // emitRepeat | ||
5253 | LEAL -4(BX), BX | ||
5254 | JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b | ||
5255 | MOVL BX, SI | ||
5256 | LEAL -4(BX), BX | ||
5257 | CMPL SI, $0x08 | ||
5258 | JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b | ||
5259 | CMPL SI, $0x0c | ||
5260 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b | ||
5261 | |||
5262 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: | ||
5263 | CMPL BX, $0x00000104 | ||
5264 | JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b | ||
5265 | LEAL -256(BX), BX | ||
5266 | MOVW $0x0019, (AX) | ||
5267 | MOVW BX, 2(AX) | ||
5268 | ADDQ $0x04, AX | ||
5269 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5270 | |||
5271 | repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: | ||
5272 | LEAL -4(BX), BX | ||
5273 | MOVW $0x0015, (AX) | ||
5274 | MOVB BL, 2(AX) | ||
5275 | ADDQ $0x03, AX | ||
5276 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5277 | |||
5278 | repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: | ||
5279 | SHLL $0x02, BX | ||
5280 | ORL $0x01, BX | ||
5281 | MOVW BX, (AX) | ||
5282 | ADDQ $0x02, AX | ||
5283 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5284 | XORQ DI, DI | ||
5285 | LEAL 1(DI)(BX*4), BX | ||
5286 | MOVB SI, 1(AX) | ||
5287 | SARL $0x08, SI | ||
5288 | SHLL $0x05, SI | ||
5289 | ORL SI, BX | ||
5290 | MOVB BL, (AX) | ||
5291 | ADDQ $0x02, AX | ||
5292 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5293 | |||
5294 | long_offset_short_repeat_as_copy_encodeBlockAsm8B: | ||
5295 | MOVB $0xee, (AX) | ||
5296 | MOVW SI, 1(AX) | ||
5297 | LEAL -60(BX), BX | ||
5298 | ADDQ $0x03, AX | ||
5299 | |||
5300 | // emitRepeat | ||
5301 | MOVL BX, SI | ||
5302 | LEAL -4(BX), BX | ||
5303 | CMPL SI, $0x08 | ||
5304 | JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short | ||
5305 | CMPL SI, $0x0c | ||
5306 | JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short | ||
5307 | |||
5308 | cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: | ||
5309 | CMPL BX, $0x00000104 | ||
5310 | JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short | ||
5311 | LEAL -256(BX), BX | ||
5312 | MOVW $0x0019, (AX) | ||
5313 | MOVW BX, 2(AX) | ||
5314 | ADDQ $0x04, AX | ||
5315 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5316 | |||
5317 | repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: | ||
5318 | LEAL -4(BX), BX | ||
5319 | MOVW $0x0015, (AX) | ||
5320 | MOVB BL, 2(AX) | ||
5321 | ADDQ $0x03, AX | ||
5322 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5323 | |||
5324 | repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: | ||
5325 | SHLL $0x02, BX | ||
5326 | ORL $0x01, BX | ||
5327 | MOVW BX, (AX) | ||
5328 | ADDQ $0x02, AX | ||
5329 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5330 | XORQ DI, DI | ||
5331 | LEAL 1(DI)(BX*4), BX | ||
5332 | MOVB SI, 1(AX) | ||
5333 | SARL $0x08, SI | ||
5334 | SHLL $0x05, SI | ||
5335 | ORL SI, BX | ||
5336 | MOVB BL, (AX) | ||
5337 | ADDQ $0x02, AX | ||
5338 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5339 | |||
5340 | two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: | ||
5341 | MOVL BX, DI | ||
5342 | SHLL $0x02, DI | ||
5343 | CMPL BX, $0x0c | ||
5344 | JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B | ||
5345 | LEAL -15(DI), DI | ||
5346 | MOVB SI, 1(AX) | ||
5347 | SHRL $0x08, SI | ||
5348 | SHLL $0x05, SI | ||
5349 | ORL SI, DI | ||
5350 | MOVB DI, (AX) | ||
5351 | ADDQ $0x02, AX | ||
5352 | JMP repeat_end_emit_encodeBlockAsm8B | ||
5353 | |||
5354 | emit_copy_three_repeat_as_copy_encodeBlockAsm8B: | ||
5355 | LEAL -2(DI), DI | ||
5356 | MOVB DI, (AX) | ||
5357 | MOVW SI, 1(AX) | ||
5358 | ADDQ $0x03, AX | ||
5359 | |||
5360 | repeat_end_emit_encodeBlockAsm8B: | ||
5361 | MOVL CX, 12(SP) | ||
5362 | JMP search_loop_encodeBlockAsm8B | ||
5363 | |||
5364 | no_repeat_found_encodeBlockAsm8B: | ||
5365 | CMPL (DX)(BX*1), SI | ||
5366 | JEQ candidate_match_encodeBlockAsm8B | ||
5367 | SHRQ $0x08, SI | ||
5368 | MOVL 24(SP)(R9*4), BX | ||
5369 | LEAL 2(CX), R8 | ||
5370 | CMPL (DX)(DI*1), SI | ||
5371 | JEQ candidate2_match_encodeBlockAsm8B | ||
5372 | MOVL R8, 24(SP)(R9*4) | ||
5373 | SHRQ $0x08, SI | ||
5374 | CMPL (DX)(BX*1), SI | ||
5375 | JEQ candidate3_match_encodeBlockAsm8B | ||
5376 | MOVL 20(SP), CX | ||
5377 | JMP search_loop_encodeBlockAsm8B | ||
5378 | |||
5379 | candidate3_match_encodeBlockAsm8B: | ||
5380 | ADDL $0x02, CX | ||
5381 | JMP candidate_match_encodeBlockAsm8B | ||
5382 | |||
5383 | candidate2_match_encodeBlockAsm8B: | ||
5384 | MOVL R8, 24(SP)(R9*4) | ||
5385 | INCL CX | ||
5386 | MOVL DI, BX | ||
5387 | |||
5388 | candidate_match_encodeBlockAsm8B: | ||
5389 | MOVL 12(SP), SI | ||
5390 | TESTL BX, BX | ||
5391 | JZ match_extend_back_end_encodeBlockAsm8B | ||
5392 | |||
5393 | match_extend_back_loop_encodeBlockAsm8B: | ||
5394 | CMPL CX, SI | ||
5395 | JBE match_extend_back_end_encodeBlockAsm8B | ||
5396 | MOVB -1(DX)(BX*1), DI | ||
5397 | MOVB -1(DX)(CX*1), R8 | ||
5398 | CMPB DI, R8 | ||
5399 | JNE match_extend_back_end_encodeBlockAsm8B | ||
5400 | LEAL -1(CX), CX | ||
5401 | DECL BX | ||
5402 | JZ match_extend_back_end_encodeBlockAsm8B | ||
5403 | JMP match_extend_back_loop_encodeBlockAsm8B | ||
5404 | |||
5405 | match_extend_back_end_encodeBlockAsm8B: | ||
5406 | MOVL CX, SI | ||
5407 | SUBL 12(SP), SI | ||
5408 | LEAQ 3(AX)(SI*1), SI | ||
5409 | CMPQ SI, (SP) | ||
5410 | JB match_dst_size_check_encodeBlockAsm8B | ||
5411 | MOVQ $0x00000000, ret+48(FP) | ||
5412 | RET | ||
5413 | |||
5414 | match_dst_size_check_encodeBlockAsm8B: | ||
5415 | MOVL CX, SI | ||
5416 | MOVL 12(SP), DI | ||
5417 | CMPL DI, SI | ||
5418 | JEQ emit_literal_done_match_emit_encodeBlockAsm8B | ||
5419 | MOVL SI, R8 | ||
5420 | MOVL SI, 12(SP) | ||
5421 | LEAQ (DX)(DI*1), SI | ||
5422 | SUBL DI, R8 | ||
5423 | LEAL -1(R8), DI | ||
5424 | CMPL DI, $0x3c | ||
5425 | JB one_byte_match_emit_encodeBlockAsm8B | ||
5426 | CMPL DI, $0x00000100 | ||
5427 | JB two_bytes_match_emit_encodeBlockAsm8B | ||
5428 | JB three_bytes_match_emit_encodeBlockAsm8B | ||
5429 | |||
5430 | three_bytes_match_emit_encodeBlockAsm8B: | ||
5431 | MOVB $0xf4, (AX) | ||
5432 | MOVW DI, 1(AX) | ||
5433 | ADDQ $0x03, AX | ||
5434 | JMP memmove_long_match_emit_encodeBlockAsm8B | ||
5435 | |||
5436 | two_bytes_match_emit_encodeBlockAsm8B: | ||
5437 | MOVB $0xf0, (AX) | ||
5438 | MOVB DI, 1(AX) | ||
5439 | ADDQ $0x02, AX | ||
5440 | CMPL DI, $0x40 | ||
5441 | JB memmove_match_emit_encodeBlockAsm8B | ||
5442 | JMP memmove_long_match_emit_encodeBlockAsm8B | ||
5443 | |||
5444 | one_byte_match_emit_encodeBlockAsm8B: | ||
5445 | SHLB $0x02, DI | ||
5446 | MOVB DI, (AX) | ||
5447 | ADDQ $0x01, AX | ||
5448 | |||
5449 | memmove_match_emit_encodeBlockAsm8B: | ||
5450 | LEAQ (AX)(R8*1), DI | ||
5451 | |||
5452 | // genMemMoveShort | ||
5453 | CMPQ R8, $0x08 | ||
5454 | JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 | ||
5455 | CMPQ R8, $0x10 | ||
5456 | JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 | ||
5457 | CMPQ R8, $0x20 | ||
5458 | JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 | ||
5459 | JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 | ||
5460 | |||
5461 | emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: | ||
5462 | MOVQ (SI), R9 | ||
5463 | MOVQ R9, (AX) | ||
5464 | JMP memmove_end_copy_match_emit_encodeBlockAsm8B | ||
5465 | |||
5466 | emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: | ||
5467 | MOVQ (SI), R9 | ||
5468 | MOVQ -8(SI)(R8*1), SI | ||
5469 | MOVQ R9, (AX) | ||
5470 | MOVQ SI, -8(AX)(R8*1) | ||
5471 | JMP memmove_end_copy_match_emit_encodeBlockAsm8B | ||
5472 | |||
5473 | emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: | ||
5474 | MOVOU (SI), X0 | ||
5475 | MOVOU -16(SI)(R8*1), X1 | ||
5476 | MOVOU X0, (AX) | ||
5477 | MOVOU X1, -16(AX)(R8*1) | ||
5478 | JMP memmove_end_copy_match_emit_encodeBlockAsm8B | ||
5479 | |||
5480 | emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: | ||
5481 | MOVOU (SI), X0 | ||
5482 | MOVOU 16(SI), X1 | ||
5483 | MOVOU -32(SI)(R8*1), X2 | ||
5484 | MOVOU -16(SI)(R8*1), X3 | ||
5485 | MOVOU X0, (AX) | ||
5486 | MOVOU X1, 16(AX) | ||
5487 | MOVOU X2, -32(AX)(R8*1) | ||
5488 | MOVOU X3, -16(AX)(R8*1) | ||
5489 | |||
5490 | memmove_end_copy_match_emit_encodeBlockAsm8B: | ||
5491 | MOVQ DI, AX | ||
5492 | JMP emit_literal_done_match_emit_encodeBlockAsm8B | ||
5493 | |||
5494 | memmove_long_match_emit_encodeBlockAsm8B: | ||
5495 | LEAQ (AX)(R8*1), DI | ||
5496 | |||
5497 | // genMemMoveLong | ||
5498 | MOVOU (SI), X0 | ||
5499 | MOVOU 16(SI), X1 | ||
5500 | MOVOU -32(SI)(R8*1), X2 | ||
5501 | MOVOU -16(SI)(R8*1), X3 | ||
5502 | MOVQ R8, R10 | ||
5503 | SHRQ $0x05, R10 | ||
5504 | MOVQ AX, R9 | ||
5505 | ANDL $0x0000001f, R9 | ||
5506 | MOVQ $0x00000040, R11 | ||
5507 | SUBQ R9, R11 | ||
5508 | DECQ R10 | ||
5509 | JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 | ||
5510 | LEAQ -32(SI)(R11*1), R9 | ||
5511 | LEAQ -32(AX)(R11*1), R12 | ||
5512 | |||
5513 | emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: | ||
5514 | MOVOU (R9), X4 | ||
5515 | MOVOU 16(R9), X5 | ||
5516 | MOVOA X4, (R12) | ||
5517 | MOVOA X5, 16(R12) | ||
5518 | ADDQ $0x20, R12 | ||
5519 | ADDQ $0x20, R9 | ||
5520 | ADDQ $0x20, R11 | ||
5521 | DECQ R10 | ||
5522 | JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back | ||
5523 | |||
5524 | emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: | ||
5525 | MOVOU -32(SI)(R11*1), X4 | ||
5526 | MOVOU -16(SI)(R11*1), X5 | ||
5527 | MOVOA X4, -32(AX)(R11*1) | ||
5528 | MOVOA X5, -16(AX)(R11*1) | ||
5529 | ADDQ $0x20, R11 | ||
5530 | CMPQ R8, R11 | ||
5531 | JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 | ||
5532 | MOVOU X0, (AX) | ||
5533 | MOVOU X1, 16(AX) | ||
5534 | MOVOU X2, -32(AX)(R8*1) | ||
5535 | MOVOU X3, -16(AX)(R8*1) | ||
5536 | MOVQ DI, AX | ||
5537 | |||
5538 | emit_literal_done_match_emit_encodeBlockAsm8B: | ||
5539 | match_nolit_loop_encodeBlockAsm8B: | ||
5540 | MOVL CX, SI | ||
5541 | SUBL BX, SI | ||
5542 | MOVL SI, 16(SP) | ||
5543 | ADDL $0x04, CX | ||
5544 | ADDL $0x04, BX | ||
5545 | MOVQ src_len+32(FP), SI | ||
5546 | SUBL CX, SI | ||
5547 | LEAQ (DX)(CX*1), DI | ||
5548 | LEAQ (DX)(BX*1), BX | ||
5549 | |||
5550 | // matchLen | ||
5551 | XORL R9, R9 | ||
5552 | |||
5553 | matchlen_loopback_16_match_nolit_encodeBlockAsm8B: | ||
5554 | CMPL SI, $0x10 | ||
5555 | JB matchlen_match8_match_nolit_encodeBlockAsm8B | ||
5556 | MOVQ (DI)(R9*1), R8 | ||
5557 | MOVQ 8(DI)(R9*1), R10 | ||
5558 | XORQ (BX)(R9*1), R8 | ||
5559 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B | ||
5560 | XORQ 8(BX)(R9*1), R10 | ||
5561 | JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B | ||
5562 | LEAL -16(SI), SI | ||
5563 | LEAL 16(R9), R9 | ||
5564 | JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B | ||
5565 | |||
5566 | matchlen_bsf_16match_nolit_encodeBlockAsm8B: | ||
5567 | #ifdef GOAMD64_v3 | ||
5568 | TZCNTQ R10, R10 | ||
5569 | |||
5570 | #else | ||
5571 | BSFQ R10, R10 | ||
5572 | |||
5573 | #endif | ||
5574 | SARQ $0x03, R10 | ||
5575 | LEAL 8(R9)(R10*1), R9 | ||
5576 | JMP match_nolit_end_encodeBlockAsm8B | ||
5577 | |||
5578 | matchlen_match8_match_nolit_encodeBlockAsm8B: | ||
5579 | CMPL SI, $0x08 | ||
5580 | JB matchlen_match4_match_nolit_encodeBlockAsm8B | ||
5581 | MOVQ (DI)(R9*1), R8 | ||
5582 | XORQ (BX)(R9*1), R8 | ||
5583 | JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B | ||
5584 | LEAL -8(SI), SI | ||
5585 | LEAL 8(R9), R9 | ||
5586 | JMP matchlen_match4_match_nolit_encodeBlockAsm8B | ||
5587 | |||
5588 | matchlen_bsf_8_match_nolit_encodeBlockAsm8B: | ||
5589 | #ifdef GOAMD64_v3 | ||
5590 | TZCNTQ R8, R8 | ||
5591 | |||
5592 | #else | ||
5593 | BSFQ R8, R8 | ||
5594 | |||
5595 | #endif | ||
5596 | SARQ $0x03, R8 | ||
5597 | LEAL (R9)(R8*1), R9 | ||
5598 | JMP match_nolit_end_encodeBlockAsm8B | ||
5599 | |||
5600 | matchlen_match4_match_nolit_encodeBlockAsm8B: | ||
5601 | CMPL SI, $0x04 | ||
5602 | JB matchlen_match2_match_nolit_encodeBlockAsm8B | ||
5603 | MOVL (DI)(R9*1), R8 | ||
5604 | CMPL (BX)(R9*1), R8 | ||
5605 | JNE matchlen_match2_match_nolit_encodeBlockAsm8B | ||
5606 | LEAL -4(SI), SI | ||
5607 | LEAL 4(R9), R9 | ||
5608 | |||
5609 | matchlen_match2_match_nolit_encodeBlockAsm8B: | ||
5610 | CMPL SI, $0x01 | ||
5611 | JE matchlen_match1_match_nolit_encodeBlockAsm8B | ||
5612 | JB match_nolit_end_encodeBlockAsm8B | ||
5613 | MOVW (DI)(R9*1), R8 | ||
5614 | CMPW (BX)(R9*1), R8 | ||
5615 | JNE matchlen_match1_match_nolit_encodeBlockAsm8B | ||
5616 | LEAL 2(R9), R9 | ||
5617 | SUBL $0x02, SI | ||
5618 | JZ match_nolit_end_encodeBlockAsm8B | ||
5619 | |||
5620 | matchlen_match1_match_nolit_encodeBlockAsm8B: | ||
5621 | MOVB (DI)(R9*1), R8 | ||
5622 | CMPB (BX)(R9*1), R8 | ||
5623 | JNE match_nolit_end_encodeBlockAsm8B | ||
5624 | LEAL 1(R9), R9 | ||
5625 | |||
5626 | match_nolit_end_encodeBlockAsm8B: | ||
5627 | ADDL R9, CX | ||
5628 | MOVL 16(SP), BX | ||
5629 | ADDL $0x04, R9 | ||
5630 | MOVL CX, 12(SP) | ||
5631 | |||
5632 | // emitCopy | ||
5633 | CMPL R9, $0x40 | ||
5634 | JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B | ||
5635 | CMPL BX, $0x00000800 | ||
5636 | JAE long_offset_short_match_nolit_encodeBlockAsm8B | ||
5637 | MOVL $0x00000001, SI | ||
5638 | LEAL 16(SI), SI | ||
5639 | MOVB BL, 1(AX) | ||
5640 | SHRL $0x08, BX | ||
5641 | SHLL $0x05, BX | ||
5642 | ORL BX, SI | ||
5643 | MOVB SI, (AX) | ||
5644 | ADDQ $0x02, AX | ||
5645 | SUBL $0x08, R9 | ||
5646 | |||
5647 | // emitRepeat | ||
5648 | LEAL -4(R9), R9 | ||
5649 | JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b | ||
5650 | MOVL R9, BX | ||
5651 | LEAL -4(R9), R9 | ||
5652 | CMPL BX, $0x08 | ||
5653 | JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b | ||
5654 | CMPL BX, $0x0c | ||
5655 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b | ||
5656 | |||
5657 | cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: | ||
5658 | CMPL R9, $0x00000104 | ||
5659 | JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b | ||
5660 | LEAL -256(R9), R9 | ||
5661 | MOVW $0x0019, (AX) | ||
5662 | MOVW R9, 2(AX) | ||
5663 | ADDQ $0x04, AX | ||
5664 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5665 | |||
5666 | repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: | ||
5667 | LEAL -4(R9), R9 | ||
5668 | MOVW $0x0015, (AX) | ||
5669 | MOVB R9, 2(AX) | ||
5670 | ADDQ $0x03, AX | ||
5671 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5672 | |||
5673 | repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: | ||
5674 | SHLL $0x02, R9 | ||
5675 | ORL $0x01, R9 | ||
5676 | MOVW R9, (AX) | ||
5677 | ADDQ $0x02, AX | ||
5678 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5679 | XORQ SI, SI | ||
5680 | LEAL 1(SI)(R9*4), R9 | ||
5681 | MOVB BL, 1(AX) | ||
5682 | SARL $0x08, BX | ||
5683 | SHLL $0x05, BX | ||
5684 | ORL BX, R9 | ||
5685 | MOVB R9, (AX) | ||
5686 | ADDQ $0x02, AX | ||
5687 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5688 | |||
5689 | long_offset_short_match_nolit_encodeBlockAsm8B: | ||
5690 | MOVB $0xee, (AX) | ||
5691 | MOVW BX, 1(AX) | ||
5692 | LEAL -60(R9), R9 | ||
5693 | ADDQ $0x03, AX | ||
5694 | |||
5695 | // emitRepeat | ||
5696 | MOVL R9, BX | ||
5697 | LEAL -4(R9), R9 | ||
5698 | CMPL BX, $0x08 | ||
5699 | JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short | ||
5700 | CMPL BX, $0x0c | ||
5701 | JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short | ||
5702 | |||
5703 | cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: | ||
5704 | CMPL R9, $0x00000104 | ||
5705 | JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short | ||
5706 | LEAL -256(R9), R9 | ||
5707 | MOVW $0x0019, (AX) | ||
5708 | MOVW R9, 2(AX) | ||
5709 | ADDQ $0x04, AX | ||
5710 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5711 | |||
5712 | repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: | ||
5713 | LEAL -4(R9), R9 | ||
5714 | MOVW $0x0015, (AX) | ||
5715 | MOVB R9, 2(AX) | ||
5716 | ADDQ $0x03, AX | ||
5717 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5718 | |||
5719 | repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: | ||
5720 | SHLL $0x02, R9 | ||
5721 | ORL $0x01, R9 | ||
5722 | MOVW R9, (AX) | ||
5723 | ADDQ $0x02, AX | ||
5724 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5725 | XORQ SI, SI | ||
5726 | LEAL 1(SI)(R9*4), R9 | ||
5727 | MOVB BL, 1(AX) | ||
5728 | SARL $0x08, BX | ||
5729 | SHLL $0x05, BX | ||
5730 | ORL BX, R9 | ||
5731 | MOVB R9, (AX) | ||
5732 | ADDQ $0x02, AX | ||
5733 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5734 | |||
5735 | two_byte_offset_short_match_nolit_encodeBlockAsm8B: | ||
5736 | MOVL R9, SI | ||
5737 | SHLL $0x02, SI | ||
5738 | CMPL R9, $0x0c | ||
5739 | JAE emit_copy_three_match_nolit_encodeBlockAsm8B | ||
5740 | LEAL -15(SI), SI | ||
5741 | MOVB BL, 1(AX) | ||
5742 | SHRL $0x08, BX | ||
5743 | SHLL $0x05, BX | ||
5744 | ORL BX, SI | ||
5745 | MOVB SI, (AX) | ||
5746 | ADDQ $0x02, AX | ||
5747 | JMP match_nolit_emitcopy_end_encodeBlockAsm8B | ||
5748 | |||
5749 | emit_copy_three_match_nolit_encodeBlockAsm8B: | ||
5750 | LEAL -2(SI), SI | ||
5751 | MOVB SI, (AX) | ||
5752 | MOVW BX, 1(AX) | ||
5753 | ADDQ $0x03, AX | ||
5754 | |||
5755 | match_nolit_emitcopy_end_encodeBlockAsm8B: | ||
5756 | CMPL CX, 8(SP) | ||
5757 | JAE emit_remainder_encodeBlockAsm8B | ||
5758 | MOVQ -2(DX)(CX*1), SI | ||
5759 | CMPQ AX, (SP) | ||
5760 | JB match_nolit_dst_ok_encodeBlockAsm8B | ||
5761 | MOVQ $0x00000000, ret+48(FP) | ||
5762 | RET | ||
5763 | |||
5764 | match_nolit_dst_ok_encodeBlockAsm8B: | ||
5765 | MOVQ $0x9e3779b1, R8 | ||
5766 | MOVQ SI, DI | ||
5767 | SHRQ $0x10, SI | ||
5768 | MOVQ SI, BX | ||
5769 | SHLQ $0x20, DI | ||
5770 | IMULQ R8, DI | ||
5771 | SHRQ $0x38, DI | ||
5772 | SHLQ $0x20, BX | ||
5773 | IMULQ R8, BX | ||
5774 | SHRQ $0x38, BX | ||
5775 | LEAL -2(CX), R8 | ||
5776 | LEAQ 24(SP)(BX*4), R9 | ||
5777 | MOVL (R9), BX | ||
5778 | MOVL R8, 24(SP)(DI*4) | ||
5779 | MOVL CX, (R9) | ||
5780 | CMPL (DX)(BX*1), SI | ||
5781 | JEQ match_nolit_loop_encodeBlockAsm8B | ||
5782 | INCL CX | ||
5783 | JMP search_loop_encodeBlockAsm8B | ||
5784 | |||
5785 | emit_remainder_encodeBlockAsm8B: | ||
5786 | MOVQ src_len+32(FP), CX | ||
5787 | SUBL 12(SP), CX | ||
5788 | LEAQ 3(AX)(CX*1), CX | ||
5789 | CMPQ CX, (SP) | ||
5790 | JB emit_remainder_ok_encodeBlockAsm8B | ||
5791 | MOVQ $0x00000000, ret+48(FP) | ||
5792 | RET | ||
5793 | |||
5794 | emit_remainder_ok_encodeBlockAsm8B: | ||
5795 | MOVQ src_len+32(FP), CX | ||
5796 | MOVL 12(SP), BX | ||
5797 | CMPL BX, CX | ||
5798 | JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B | ||
5799 | MOVL CX, SI | ||
5800 | MOVL CX, 12(SP) | ||
5801 | LEAQ (DX)(BX*1), CX | ||
5802 | SUBL BX, SI | ||
5803 | LEAL -1(SI), DX | ||
5804 | CMPL DX, $0x3c | ||
5805 | JB one_byte_emit_remainder_encodeBlockAsm8B | ||
5806 | CMPL DX, $0x00000100 | ||
5807 | JB two_bytes_emit_remainder_encodeBlockAsm8B | ||
5808 | JB three_bytes_emit_remainder_encodeBlockAsm8B | ||
5809 | |||
5810 | three_bytes_emit_remainder_encodeBlockAsm8B: | ||
5811 | MOVB $0xf4, (AX) | ||
5812 | MOVW DX, 1(AX) | ||
5813 | ADDQ $0x03, AX | ||
5814 | JMP memmove_long_emit_remainder_encodeBlockAsm8B | ||
5815 | |||
5816 | two_bytes_emit_remainder_encodeBlockAsm8B: | ||
5817 | MOVB $0xf0, (AX) | ||
5818 | MOVB DL, 1(AX) | ||
5819 | ADDQ $0x02, AX | ||
5820 | CMPL DX, $0x40 | ||
5821 | JB memmove_emit_remainder_encodeBlockAsm8B | ||
5822 | JMP memmove_long_emit_remainder_encodeBlockAsm8B | ||
5823 | |||
5824 | one_byte_emit_remainder_encodeBlockAsm8B: | ||
5825 | SHLB $0x02, DL | ||
5826 | MOVB DL, (AX) | ||
5827 | ADDQ $0x01, AX | ||
5828 | |||
5829 | memmove_emit_remainder_encodeBlockAsm8B: | ||
5830 | LEAQ (AX)(SI*1), DX | ||
5831 | MOVL SI, BX | ||
5832 | |||
5833 | // genMemMoveShort | ||
5834 | CMPQ BX, $0x03 | ||
5835 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 | ||
5836 | JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 | ||
5837 | CMPQ BX, $0x08 | ||
5838 | JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 | ||
5839 | CMPQ BX, $0x10 | ||
5840 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 | ||
5841 | CMPQ BX, $0x20 | ||
5842 | JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 | ||
5843 | JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 | ||
5844 | |||
5845 | emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: | ||
5846 | MOVB (CX), SI | ||
5847 | MOVB -1(CX)(BX*1), CL | ||
5848 | MOVB SI, (AX) | ||
5849 | MOVB CL, -1(AX)(BX*1) | ||
5850 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B | ||
5851 | |||
5852 | emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: | ||
5853 | MOVW (CX), SI | ||
5854 | MOVB 2(CX), CL | ||
5855 | MOVW SI, (AX) | ||
5856 | MOVB CL, 2(AX) | ||
5857 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B | ||
5858 | |||
5859 | emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: | ||
5860 | MOVL (CX), SI | ||
5861 | MOVL -4(CX)(BX*1), CX | ||
5862 | MOVL SI, (AX) | ||
5863 | MOVL CX, -4(AX)(BX*1) | ||
5864 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B | ||
5865 | |||
5866 | emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: | ||
5867 | MOVQ (CX), SI | ||
5868 | MOVQ -8(CX)(BX*1), CX | ||
5869 | MOVQ SI, (AX) | ||
5870 | MOVQ CX, -8(AX)(BX*1) | ||
5871 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B | ||
5872 | |||
5873 | emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: | ||
5874 | MOVOU (CX), X0 | ||
5875 | MOVOU -16(CX)(BX*1), X1 | ||
5876 | MOVOU X0, (AX) | ||
5877 | MOVOU X1, -16(AX)(BX*1) | ||
5878 | JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B | ||
5879 | |||
5880 | emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: | ||
5881 | MOVOU (CX), X0 | ||
5882 | MOVOU 16(CX), X1 | ||
5883 | MOVOU -32(CX)(BX*1), X2 | ||
5884 | MOVOU -16(CX)(BX*1), X3 | ||
5885 | MOVOU X0, (AX) | ||
5886 | MOVOU X1, 16(AX) | ||
5887 | MOVOU X2, -32(AX)(BX*1) | ||
5888 | MOVOU X3, -16(AX)(BX*1) | ||
5889 | |||
5890 | memmove_end_copy_emit_remainder_encodeBlockAsm8B: | ||
5891 | MOVQ DX, AX | ||
5892 | JMP emit_literal_done_emit_remainder_encodeBlockAsm8B | ||
5893 | |||
5894 | memmove_long_emit_remainder_encodeBlockAsm8B: | ||
5895 | LEAQ (AX)(SI*1), DX | ||
5896 | MOVL SI, BX | ||
5897 | |||
5898 | // genMemMoveLong | ||
5899 | MOVOU (CX), X0 | ||
5900 | MOVOU 16(CX), X1 | ||
5901 | MOVOU -32(CX)(BX*1), X2 | ||
5902 | MOVOU -16(CX)(BX*1), X3 | ||
5903 | MOVQ BX, DI | ||
5904 | SHRQ $0x05, DI | ||
5905 | MOVQ AX, SI | ||
5906 | ANDL $0x0000001f, SI | ||
5907 | MOVQ $0x00000040, R8 | ||
5908 | SUBQ SI, R8 | ||
5909 | DECQ DI | ||
5910 | JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 | ||
5911 | LEAQ -32(CX)(R8*1), SI | ||
5912 | LEAQ -32(AX)(R8*1), R9 | ||
5913 | |||
5914 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: | ||
5915 | MOVOU (SI), X4 | ||
5916 | MOVOU 16(SI), X5 | ||
5917 | MOVOA X4, (R9) | ||
5918 | MOVOA X5, 16(R9) | ||
5919 | ADDQ $0x20, R9 | ||
5920 | ADDQ $0x20, SI | ||
5921 | ADDQ $0x20, R8 | ||
5922 | DECQ DI | ||
5923 | JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back | ||
5924 | |||
5925 | emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: | ||
5926 | MOVOU -32(CX)(R8*1), X4 | ||
5927 | MOVOU -16(CX)(R8*1), X5 | ||
5928 | MOVOA X4, -32(AX)(R8*1) | ||
5929 | MOVOA X5, -16(AX)(R8*1) | ||
5930 | ADDQ $0x20, R8 | ||
5931 | CMPQ BX, R8 | ||
5932 | JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 | ||
5933 | MOVOU X0, (AX) | ||
5934 | MOVOU X1, 16(AX) | ||
5935 | MOVOU X2, -32(AX)(BX*1) | ||
5936 | MOVOU X3, -16(AX)(BX*1) | ||
5937 | MOVQ DX, AX | ||
5938 | |||
5939 | emit_literal_done_emit_remainder_encodeBlockAsm8B: | ||
5940 | MOVQ dst_base+0(FP), CX | ||
5941 | SUBQ CX, AX | ||
5942 | MOVQ AX, ret+48(FP) | ||
5943 | RET | ||
5944 | |||
5945 | // func encodeBetterBlockAsm(dst []byte, src []byte) int | ||
5946 | // Requires: BMI, SSE2 | ||
5947 | TEXT ·encodeBetterBlockAsm(SB), $589848-56 | ||
5948 | MOVQ dst_base+0(FP), AX | ||
5949 | MOVQ $0x00001200, CX | ||
5950 | LEAQ 24(SP), DX | ||
5951 | PXOR X0, X0 | ||
5952 | |||
5953 | zero_loop_encodeBetterBlockAsm: | ||
5954 | MOVOU X0, (DX) | ||
5955 | MOVOU X0, 16(DX) | ||
5956 | MOVOU X0, 32(DX) | ||
5957 | MOVOU X0, 48(DX) | ||
5958 | MOVOU X0, 64(DX) | ||
5959 | MOVOU X0, 80(DX) | ||
5960 | MOVOU X0, 96(DX) | ||
5961 | MOVOU X0, 112(DX) | ||
5962 | ADDQ $0x80, DX | ||
5963 | DECQ CX | ||
5964 | JNZ zero_loop_encodeBetterBlockAsm | ||
5965 | MOVL $0x00000000, 12(SP) | ||
5966 | MOVQ src_len+32(FP), CX | ||
5967 | LEAQ -6(CX), DX | ||
5968 | LEAQ -8(CX), BX | ||
5969 | MOVL BX, 8(SP) | ||
5970 | SHRQ $0x05, CX | ||
5971 | SUBL CX, DX | ||
5972 | LEAQ (AX)(DX*1), DX | ||
5973 | MOVQ DX, (SP) | ||
5974 | MOVL $0x00000001, CX | ||
5975 | MOVL $0x00000000, 16(SP) | ||
5976 | MOVQ src_base+24(FP), DX | ||
5977 | |||
5978 | search_loop_encodeBetterBlockAsm: | ||
5979 | MOVL CX, BX | ||
5980 | SUBL 12(SP), BX | ||
5981 | SHRL $0x07, BX | ||
5982 | CMPL BX, $0x63 | ||
5983 | JBE check_maxskip_ok_encodeBetterBlockAsm | ||
5984 | LEAL 100(CX), BX | ||
5985 | JMP check_maxskip_cont_encodeBetterBlockAsm | ||
5986 | |||
5987 | check_maxskip_ok_encodeBetterBlockAsm: | ||
5988 | LEAL 1(CX)(BX*1), BX | ||
5989 | |||
5990 | check_maxskip_cont_encodeBetterBlockAsm: | ||
5991 | CMPL BX, 8(SP) | ||
5992 | JAE emit_remainder_encodeBetterBlockAsm | ||
5993 | MOVQ (DX)(CX*1), SI | ||
5994 | MOVL BX, 20(SP) | ||
5995 | MOVQ $0x00cf1bbcdcbfa563, R8 | ||
5996 | MOVQ $0x9e3779b1, BX | ||
5997 | MOVQ SI, R9 | ||
5998 | MOVQ SI, R10 | ||
5999 | SHLQ $0x08, R9 | ||
6000 | IMULQ R8, R9 | ||
6001 | SHRQ $0x2f, R9 | ||
6002 | SHLQ $0x20, R10 | ||
6003 | IMULQ BX, R10 | ||
6004 | SHRQ $0x32, R10 | ||
6005 | MOVL 24(SP)(R9*4), BX | ||
6006 | MOVL 524312(SP)(R10*4), DI | ||
6007 | MOVL CX, 24(SP)(R9*4) | ||
6008 | MOVL CX, 524312(SP)(R10*4) | ||
6009 | MOVQ (DX)(BX*1), R9 | ||
6010 | MOVQ (DX)(DI*1), R10 | ||
6011 | CMPQ R9, SI | ||
6012 | JEQ candidate_match_encodeBetterBlockAsm | ||
6013 | CMPQ R10, SI | ||
6014 | JNE no_short_found_encodeBetterBlockAsm | ||
6015 | MOVL DI, BX | ||
6016 | JMP candidate_match_encodeBetterBlockAsm | ||
6017 | |||
6018 | no_short_found_encodeBetterBlockAsm: | ||
6019 | CMPL R9, SI | ||
6020 | JEQ candidate_match_encodeBetterBlockAsm | ||
6021 | CMPL R10, SI | ||
6022 | JEQ candidateS_match_encodeBetterBlockAsm | ||
6023 | MOVL 20(SP), CX | ||
6024 | JMP search_loop_encodeBetterBlockAsm | ||
6025 | |||
6026 | candidateS_match_encodeBetterBlockAsm: | ||
6027 | SHRQ $0x08, SI | ||
6028 | MOVQ SI, R9 | ||
6029 | SHLQ $0x08, R9 | ||
6030 | IMULQ R8, R9 | ||
6031 | SHRQ $0x2f, R9 | ||
6032 | MOVL 24(SP)(R9*4), BX | ||
6033 | INCL CX | ||
6034 | MOVL CX, 24(SP)(R9*4) | ||
6035 | CMPL (DX)(BX*1), SI | ||
6036 | JEQ candidate_match_encodeBetterBlockAsm | ||
6037 | DECL CX | ||
6038 | MOVL DI, BX | ||
6039 | |||
6040 | candidate_match_encodeBetterBlockAsm: | ||
6041 | MOVL 12(SP), SI | ||
6042 | TESTL BX, BX | ||
6043 | JZ match_extend_back_end_encodeBetterBlockAsm | ||
6044 | |||
6045 | match_extend_back_loop_encodeBetterBlockAsm: | ||
6046 | CMPL CX, SI | ||
6047 | JBE match_extend_back_end_encodeBetterBlockAsm | ||
6048 | MOVB -1(DX)(BX*1), DI | ||
6049 | MOVB -1(DX)(CX*1), R8 | ||
6050 | CMPB DI, R8 | ||
6051 | JNE match_extend_back_end_encodeBetterBlockAsm | ||
6052 | LEAL -1(CX), CX | ||
6053 | DECL BX | ||
6054 | JZ match_extend_back_end_encodeBetterBlockAsm | ||
6055 | JMP match_extend_back_loop_encodeBetterBlockAsm | ||
6056 | |||
6057 | match_extend_back_end_encodeBetterBlockAsm: | ||
6058 | MOVL CX, SI | ||
6059 | SUBL 12(SP), SI | ||
6060 | LEAQ 5(AX)(SI*1), SI | ||
6061 | CMPQ SI, (SP) | ||
6062 | JB match_dst_size_check_encodeBetterBlockAsm | ||
6063 | MOVQ $0x00000000, ret+48(FP) | ||
6064 | RET | ||
6065 | |||
6066 | match_dst_size_check_encodeBetterBlockAsm: | ||
6067 | MOVL CX, SI | ||
6068 | ADDL $0x04, CX | ||
6069 | ADDL $0x04, BX | ||
6070 | MOVQ src_len+32(FP), DI | ||
6071 | SUBL CX, DI | ||
6072 | LEAQ (DX)(CX*1), R8 | ||
6073 | LEAQ (DX)(BX*1), R9 | ||
6074 | |||
6075 | // matchLen | ||
6076 | XORL R11, R11 | ||
6077 | |||
6078 | matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: | ||
6079 | CMPL DI, $0x10 | ||
6080 | JB matchlen_match8_match_nolit_encodeBetterBlockAsm | ||
6081 | MOVQ (R8)(R11*1), R10 | ||
6082 | MOVQ 8(R8)(R11*1), R12 | ||
6083 | XORQ (R9)(R11*1), R10 | ||
6084 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm | ||
6085 | XORQ 8(R9)(R11*1), R12 | ||
6086 | JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm | ||
6087 | LEAL -16(DI), DI | ||
6088 | LEAL 16(R11), R11 | ||
6089 | JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm | ||
6090 | |||
6091 | matchlen_bsf_16match_nolit_encodeBetterBlockAsm: | ||
6092 | #ifdef GOAMD64_v3 | ||
6093 | TZCNTQ R12, R12 | ||
6094 | |||
6095 | #else | ||
6096 | BSFQ R12, R12 | ||
6097 | |||
6098 | #endif | ||
6099 | SARQ $0x03, R12 | ||
6100 | LEAL 8(R11)(R12*1), R11 | ||
6101 | JMP match_nolit_end_encodeBetterBlockAsm | ||
6102 | |||
6103 | matchlen_match8_match_nolit_encodeBetterBlockAsm: | ||
6104 | CMPL DI, $0x08 | ||
6105 | JB matchlen_match4_match_nolit_encodeBetterBlockAsm | ||
6106 | MOVQ (R8)(R11*1), R10 | ||
6107 | XORQ (R9)(R11*1), R10 | ||
6108 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm | ||
6109 | LEAL -8(DI), DI | ||
6110 | LEAL 8(R11), R11 | ||
6111 | JMP matchlen_match4_match_nolit_encodeBetterBlockAsm | ||
6112 | |||
6113 | matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: | ||
6114 | #ifdef GOAMD64_v3 | ||
6115 | TZCNTQ R10, R10 | ||
6116 | |||
6117 | #else | ||
6118 | BSFQ R10, R10 | ||
6119 | |||
6120 | #endif | ||
6121 | SARQ $0x03, R10 | ||
6122 | LEAL (R11)(R10*1), R11 | ||
6123 | JMP match_nolit_end_encodeBetterBlockAsm | ||
6124 | |||
6125 | matchlen_match4_match_nolit_encodeBetterBlockAsm: | ||
6126 | CMPL DI, $0x04 | ||
6127 | JB matchlen_match2_match_nolit_encodeBetterBlockAsm | ||
6128 | MOVL (R8)(R11*1), R10 | ||
6129 | CMPL (R9)(R11*1), R10 | ||
6130 | JNE matchlen_match2_match_nolit_encodeBetterBlockAsm | ||
6131 | LEAL -4(DI), DI | ||
6132 | LEAL 4(R11), R11 | ||
6133 | |||
6134 | matchlen_match2_match_nolit_encodeBetterBlockAsm: | ||
6135 | CMPL DI, $0x01 | ||
6136 | JE matchlen_match1_match_nolit_encodeBetterBlockAsm | ||
6137 | JB match_nolit_end_encodeBetterBlockAsm | ||
6138 | MOVW (R8)(R11*1), R10 | ||
6139 | CMPW (R9)(R11*1), R10 | ||
6140 | JNE matchlen_match1_match_nolit_encodeBetterBlockAsm | ||
6141 | LEAL 2(R11), R11 | ||
6142 | SUBL $0x02, DI | ||
6143 | JZ match_nolit_end_encodeBetterBlockAsm | ||
6144 | |||
6145 | matchlen_match1_match_nolit_encodeBetterBlockAsm: | ||
6146 | MOVB (R8)(R11*1), R10 | ||
6147 | CMPB (R9)(R11*1), R10 | ||
6148 | JNE match_nolit_end_encodeBetterBlockAsm | ||
6149 | LEAL 1(R11), R11 | ||
6150 | |||
6151 | match_nolit_end_encodeBetterBlockAsm: | ||
6152 | MOVL CX, DI | ||
6153 | SUBL BX, DI | ||
6154 | |||
6155 | // Check if repeat | ||
6156 | CMPL 16(SP), DI | ||
6157 | JEQ match_is_repeat_encodeBetterBlockAsm | ||
6158 | CMPL R11, $0x01 | ||
6159 | JA match_length_ok_encodeBetterBlockAsm | ||
6160 | CMPL DI, $0x0000ffff | ||
6161 | JBE match_length_ok_encodeBetterBlockAsm | ||
6162 | MOVL 20(SP), CX | ||
6163 | INCL CX | ||
6164 | JMP search_loop_encodeBetterBlockAsm | ||
6165 | |||
6166 | match_length_ok_encodeBetterBlockAsm: | ||
6167 | MOVL DI, 16(SP) | ||
6168 | MOVL 12(SP), BX | ||
6169 | CMPL BX, SI | ||
6170 | JEQ emit_literal_done_match_emit_encodeBetterBlockAsm | ||
6171 | MOVL SI, R8 | ||
6172 | MOVL SI, 12(SP) | ||
6173 | LEAQ (DX)(BX*1), R9 | ||
6174 | SUBL BX, R8 | ||
6175 | LEAL -1(R8), BX | ||
6176 | CMPL BX, $0x3c | ||
6177 | JB one_byte_match_emit_encodeBetterBlockAsm | ||
6178 | CMPL BX, $0x00000100 | ||
6179 | JB two_bytes_match_emit_encodeBetterBlockAsm | ||
6180 | CMPL BX, $0x00010000 | ||
6181 | JB three_bytes_match_emit_encodeBetterBlockAsm | ||
6182 | CMPL BX, $0x01000000 | ||
6183 | JB four_bytes_match_emit_encodeBetterBlockAsm | ||
6184 | MOVB $0xfc, (AX) | ||
6185 | MOVL BX, 1(AX) | ||
6186 | ADDQ $0x05, AX | ||
6187 | JMP memmove_long_match_emit_encodeBetterBlockAsm | ||
6188 | |||
6189 | four_bytes_match_emit_encodeBetterBlockAsm: | ||
6190 | MOVL BX, R10 | ||
6191 | SHRL $0x10, R10 | ||
6192 | MOVB $0xf8, (AX) | ||
6193 | MOVW BX, 1(AX) | ||
6194 | MOVB R10, 3(AX) | ||
6195 | ADDQ $0x04, AX | ||
6196 | JMP memmove_long_match_emit_encodeBetterBlockAsm | ||
6197 | |||
6198 | three_bytes_match_emit_encodeBetterBlockAsm: | ||
6199 | MOVB $0xf4, (AX) | ||
6200 | MOVW BX, 1(AX) | ||
6201 | ADDQ $0x03, AX | ||
6202 | JMP memmove_long_match_emit_encodeBetterBlockAsm | ||
6203 | |||
6204 | two_bytes_match_emit_encodeBetterBlockAsm: | ||
6205 | MOVB $0xf0, (AX) | ||
6206 | MOVB BL, 1(AX) | ||
6207 | ADDQ $0x02, AX | ||
6208 | CMPL BX, $0x40 | ||
6209 | JB memmove_match_emit_encodeBetterBlockAsm | ||
6210 | JMP memmove_long_match_emit_encodeBetterBlockAsm | ||
6211 | |||
6212 | one_byte_match_emit_encodeBetterBlockAsm: | ||
6213 | SHLB $0x02, BL | ||
6214 | MOVB BL, (AX) | ||
6215 | ADDQ $0x01, AX | ||
6216 | |||
6217 | memmove_match_emit_encodeBetterBlockAsm: | ||
6218 | LEAQ (AX)(R8*1), BX | ||
6219 | |||
6220 | // genMemMoveShort | ||
6221 | CMPQ R8, $0x04 | ||
6222 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 | ||
6223 | CMPQ R8, $0x08 | ||
6224 | JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 | ||
6225 | CMPQ R8, $0x10 | ||
6226 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 | ||
6227 | CMPQ R8, $0x20 | ||
6228 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 | ||
6229 | JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 | ||
6230 | |||
6231 | emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: | ||
6232 | MOVL (R9), R10 | ||
6233 | MOVL R10, (AX) | ||
6234 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm | ||
6235 | |||
6236 | emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: | ||
6237 | MOVL (R9), R10 | ||
6238 | MOVL -4(R9)(R8*1), R9 | ||
6239 | MOVL R10, (AX) | ||
6240 | MOVL R9, -4(AX)(R8*1) | ||
6241 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm | ||
6242 | |||
6243 | emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: | ||
6244 | MOVQ (R9), R10 | ||
6245 | MOVQ -8(R9)(R8*1), R9 | ||
6246 | MOVQ R10, (AX) | ||
6247 | MOVQ R9, -8(AX)(R8*1) | ||
6248 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm | ||
6249 | |||
6250 | emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: | ||
6251 | MOVOU (R9), X0 | ||
6252 | MOVOU -16(R9)(R8*1), X1 | ||
6253 | MOVOU X0, (AX) | ||
6254 | MOVOU X1, -16(AX)(R8*1) | ||
6255 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm | ||
6256 | |||
6257 | emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: | ||
6258 | MOVOU (R9), X0 | ||
6259 | MOVOU 16(R9), X1 | ||
6260 | MOVOU -32(R9)(R8*1), X2 | ||
6261 | MOVOU -16(R9)(R8*1), X3 | ||
6262 | MOVOU X0, (AX) | ||
6263 | MOVOU X1, 16(AX) | ||
6264 | MOVOU X2, -32(AX)(R8*1) | ||
6265 | MOVOU X3, -16(AX)(R8*1) | ||
6266 | |||
6267 | memmove_end_copy_match_emit_encodeBetterBlockAsm: | ||
6268 | MOVQ BX, AX | ||
6269 | JMP emit_literal_done_match_emit_encodeBetterBlockAsm | ||
6270 | |||
6271 | memmove_long_match_emit_encodeBetterBlockAsm: | ||
6272 | LEAQ (AX)(R8*1), BX | ||
6273 | |||
6274 | // genMemMoveLong | ||
6275 | MOVOU (R9), X0 | ||
6276 | MOVOU 16(R9), X1 | ||
6277 | MOVOU -32(R9)(R8*1), X2 | ||
6278 | MOVOU -16(R9)(R8*1), X3 | ||
6279 | MOVQ R8, R12 | ||
6280 | SHRQ $0x05, R12 | ||
6281 | MOVQ AX, R10 | ||
6282 | ANDL $0x0000001f, R10 | ||
6283 | MOVQ $0x00000040, R13 | ||
6284 | SUBQ R10, R13 | ||
6285 | DECQ R12 | ||
6286 | JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 | ||
6287 | LEAQ -32(R9)(R13*1), R10 | ||
6288 | LEAQ -32(AX)(R13*1), R14 | ||
6289 | |||
6290 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: | ||
6291 | MOVOU (R10), X4 | ||
6292 | MOVOU 16(R10), X5 | ||
6293 | MOVOA X4, (R14) | ||
6294 | MOVOA X5, 16(R14) | ||
6295 | ADDQ $0x20, R14 | ||
6296 | ADDQ $0x20, R10 | ||
6297 | ADDQ $0x20, R13 | ||
6298 | DECQ R12 | ||
6299 | JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back | ||
6300 | |||
6301 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: | ||
6302 | MOVOU -32(R9)(R13*1), X4 | ||
6303 | MOVOU -16(R9)(R13*1), X5 | ||
6304 | MOVOA X4, -32(AX)(R13*1) | ||
6305 | MOVOA X5, -16(AX)(R13*1) | ||
6306 | ADDQ $0x20, R13 | ||
6307 | CMPQ R8, R13 | ||
6308 | JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 | ||
6309 | MOVOU X0, (AX) | ||
6310 | MOVOU X1, 16(AX) | ||
6311 | MOVOU X2, -32(AX)(R8*1) | ||
6312 | MOVOU X3, -16(AX)(R8*1) | ||
6313 | MOVQ BX, AX | ||
6314 | |||
6315 | emit_literal_done_match_emit_encodeBetterBlockAsm: | ||
6316 | ADDL R11, CX | ||
6317 | ADDL $0x04, R11 | ||
6318 | MOVL CX, 12(SP) | ||
6319 | |||
6320 | // emitCopy | ||
6321 | CMPL DI, $0x00010000 | ||
6322 | JB two_byte_offset_match_nolit_encodeBetterBlockAsm | ||
6323 | CMPL R11, $0x40 | ||
6324 | JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm | ||
6325 | MOVB $0xff, (AX) | ||
6326 | MOVL DI, 1(AX) | ||
6327 | LEAL -64(R11), R11 | ||
6328 | ADDQ $0x05, AX | ||
6329 | CMPL R11, $0x04 | ||
6330 | JB four_bytes_remain_match_nolit_encodeBetterBlockAsm | ||
6331 | |||
6332 | // emitRepeat | ||
6333 | emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: | ||
6334 | MOVL R11, BX | ||
6335 | LEAL -4(R11), R11 | ||
6336 | CMPL BX, $0x08 | ||
6337 | JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy | ||
6338 | CMPL BX, $0x0c | ||
6339 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy | ||
6340 | CMPL DI, $0x00000800 | ||
6341 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy | ||
6342 | |||
6343 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: | ||
6344 | CMPL R11, $0x00000104 | ||
6345 | JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy | ||
6346 | CMPL R11, $0x00010100 | ||
6347 | JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy | ||
6348 | CMPL R11, $0x0100ffff | ||
6349 | JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy | ||
6350 | LEAL -16842747(R11), R11 | ||
6351 | MOVL $0xfffb001d, (AX) | ||
6352 | MOVB $0xff, 4(AX) | ||
6353 | ADDQ $0x05, AX | ||
6354 | JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy | ||
6355 | |||
6356 | repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: | ||
6357 | LEAL -65536(R11), R11 | ||
6358 | MOVL R11, DI | ||
6359 | MOVW $0x001d, (AX) | ||
6360 | MOVW R11, 2(AX) | ||
6361 | SARL $0x10, DI | ||
6362 | MOVB DI, 4(AX) | ||
6363 | ADDQ $0x05, AX | ||
6364 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6365 | |||
6366 | repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: | ||
6367 | LEAL -256(R11), R11 | ||
6368 | MOVW $0x0019, (AX) | ||
6369 | MOVW R11, 2(AX) | ||
6370 | ADDQ $0x04, AX | ||
6371 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6372 | |||
6373 | repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: | ||
6374 | LEAL -4(R11), R11 | ||
6375 | MOVW $0x0015, (AX) | ||
6376 | MOVB R11, 2(AX) | ||
6377 | ADDQ $0x03, AX | ||
6378 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6379 | |||
6380 | repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: | ||
6381 | SHLL $0x02, R11 | ||
6382 | ORL $0x01, R11 | ||
6383 | MOVW R11, (AX) | ||
6384 | ADDQ $0x02, AX | ||
6385 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6386 | |||
6387 | repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: | ||
6388 | XORQ BX, BX | ||
6389 | LEAL 1(BX)(R11*4), R11 | ||
6390 | MOVB DI, 1(AX) | ||
6391 | SARL $0x08, DI | ||
6392 | SHLL $0x05, DI | ||
6393 | ORL DI, R11 | ||
6394 | MOVB R11, (AX) | ||
6395 | ADDQ $0x02, AX | ||
6396 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6397 | |||
6398 | four_bytes_remain_match_nolit_encodeBetterBlockAsm: | ||
6399 | TESTL R11, R11 | ||
6400 | JZ match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6401 | XORL BX, BX | ||
6402 | LEAL -1(BX)(R11*4), R11 | ||
6403 | MOVB R11, (AX) | ||
6404 | MOVL DI, 1(AX) | ||
6405 | ADDQ $0x05, AX | ||
6406 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6407 | |||
6408 | two_byte_offset_match_nolit_encodeBetterBlockAsm: | ||
6409 | CMPL R11, $0x40 | ||
6410 | JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm | ||
6411 | CMPL DI, $0x00000800 | ||
6412 | JAE long_offset_short_match_nolit_encodeBetterBlockAsm | ||
6413 | MOVL $0x00000001, BX | ||
6414 | LEAL 16(BX), BX | ||
6415 | MOVB DI, 1(AX) | ||
6416 | MOVL DI, R8 | ||
6417 | SHRL $0x08, R8 | ||
6418 | SHLL $0x05, R8 | ||
6419 | ORL R8, BX | ||
6420 | MOVB BL, (AX) | ||
6421 | ADDQ $0x02, AX | ||
6422 | SUBL $0x08, R11 | ||
6423 | |||
6424 | // emitRepeat | ||
6425 | LEAL -4(R11), R11 | ||
6426 | JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6427 | |||
6428 | emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: | ||
6429 | MOVL R11, BX | ||
6430 | LEAL -4(R11), R11 | ||
6431 | CMPL BX, $0x08 | ||
6432 | JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6433 | CMPL BX, $0x0c | ||
6434 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6435 | CMPL DI, $0x00000800 | ||
6436 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6437 | |||
6438 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: | ||
6439 | CMPL R11, $0x00000104 | ||
6440 | JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6441 | CMPL R11, $0x00010100 | ||
6442 | JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6443 | CMPL R11, $0x0100ffff | ||
6444 | JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6445 | LEAL -16842747(R11), R11 | ||
6446 | MOVL $0xfffb001d, (AX) | ||
6447 | MOVB $0xff, 4(AX) | ||
6448 | ADDQ $0x05, AX | ||
6449 | JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b | ||
6450 | |||
6451 | repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: | ||
6452 | LEAL -65536(R11), R11 | ||
6453 | MOVL R11, DI | ||
6454 | MOVW $0x001d, (AX) | ||
6455 | MOVW R11, 2(AX) | ||
6456 | SARL $0x10, DI | ||
6457 | MOVB DI, 4(AX) | ||
6458 | ADDQ $0x05, AX | ||
6459 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6460 | |||
6461 | repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: | ||
6462 | LEAL -256(R11), R11 | ||
6463 | MOVW $0x0019, (AX) | ||
6464 | MOVW R11, 2(AX) | ||
6465 | ADDQ $0x04, AX | ||
6466 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6467 | |||
6468 | repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: | ||
6469 | LEAL -4(R11), R11 | ||
6470 | MOVW $0x0015, (AX) | ||
6471 | MOVB R11, 2(AX) | ||
6472 | ADDQ $0x03, AX | ||
6473 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6474 | |||
6475 | repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: | ||
6476 | SHLL $0x02, R11 | ||
6477 | ORL $0x01, R11 | ||
6478 | MOVW R11, (AX) | ||
6479 | ADDQ $0x02, AX | ||
6480 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6481 | |||
6482 | repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: | ||
6483 | XORQ BX, BX | ||
6484 | LEAL 1(BX)(R11*4), R11 | ||
6485 | MOVB DI, 1(AX) | ||
6486 | SARL $0x08, DI | ||
6487 | SHLL $0x05, DI | ||
6488 | ORL DI, R11 | ||
6489 | MOVB R11, (AX) | ||
6490 | ADDQ $0x02, AX | ||
6491 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6492 | |||
6493 | long_offset_short_match_nolit_encodeBetterBlockAsm: | ||
6494 | MOVB $0xee, (AX) | ||
6495 | MOVW DI, 1(AX) | ||
6496 | LEAL -60(R11), R11 | ||
6497 | ADDQ $0x03, AX | ||
6498 | |||
6499 | // emitRepeat | ||
6500 | emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: | ||
6501 | MOVL R11, BX | ||
6502 | LEAL -4(R11), R11 | ||
6503 | CMPL BX, $0x08 | ||
6504 | JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short | ||
6505 | CMPL BX, $0x0c | ||
6506 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short | ||
6507 | CMPL DI, $0x00000800 | ||
6508 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short | ||
6509 | |||
6510 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: | ||
6511 | CMPL R11, $0x00000104 | ||
6512 | JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short | ||
6513 | CMPL R11, $0x00010100 | ||
6514 | JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short | ||
6515 | CMPL R11, $0x0100ffff | ||
6516 | JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short | ||
6517 | LEAL -16842747(R11), R11 | ||
6518 | MOVL $0xfffb001d, (AX) | ||
6519 | MOVB $0xff, 4(AX) | ||
6520 | ADDQ $0x05, AX | ||
6521 | JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short | ||
6522 | |||
6523 | repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: | ||
6524 | LEAL -65536(R11), R11 | ||
6525 | MOVL R11, DI | ||
6526 | MOVW $0x001d, (AX) | ||
6527 | MOVW R11, 2(AX) | ||
6528 | SARL $0x10, DI | ||
6529 | MOVB DI, 4(AX) | ||
6530 | ADDQ $0x05, AX | ||
6531 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6532 | |||
6533 | repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: | ||
6534 | LEAL -256(R11), R11 | ||
6535 | MOVW $0x0019, (AX) | ||
6536 | MOVW R11, 2(AX) | ||
6537 | ADDQ $0x04, AX | ||
6538 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6539 | |||
6540 | repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: | ||
6541 | LEAL -4(R11), R11 | ||
6542 | MOVW $0x0015, (AX) | ||
6543 | MOVB R11, 2(AX) | ||
6544 | ADDQ $0x03, AX | ||
6545 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6546 | |||
6547 | repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: | ||
6548 | SHLL $0x02, R11 | ||
6549 | ORL $0x01, R11 | ||
6550 | MOVW R11, (AX) | ||
6551 | ADDQ $0x02, AX | ||
6552 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6553 | |||
6554 | repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: | ||
6555 | XORQ BX, BX | ||
6556 | LEAL 1(BX)(R11*4), R11 | ||
6557 | MOVB DI, 1(AX) | ||
6558 | SARL $0x08, DI | ||
6559 | SHLL $0x05, DI | ||
6560 | ORL DI, R11 | ||
6561 | MOVB R11, (AX) | ||
6562 | ADDQ $0x02, AX | ||
6563 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6564 | |||
6565 | two_byte_offset_short_match_nolit_encodeBetterBlockAsm: | ||
6566 | MOVL R11, BX | ||
6567 | SHLL $0x02, BX | ||
6568 | CMPL R11, $0x0c | ||
6569 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm | ||
6570 | CMPL DI, $0x00000800 | ||
6571 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm | ||
6572 | LEAL -15(BX), BX | ||
6573 | MOVB DI, 1(AX) | ||
6574 | SHRL $0x08, DI | ||
6575 | SHLL $0x05, DI | ||
6576 | ORL DI, BX | ||
6577 | MOVB BL, (AX) | ||
6578 | ADDQ $0x02, AX | ||
6579 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6580 | |||
6581 | emit_copy_three_match_nolit_encodeBetterBlockAsm: | ||
6582 | LEAL -2(BX), BX | ||
6583 | MOVB BL, (AX) | ||
6584 | MOVW DI, 1(AX) | ||
6585 | ADDQ $0x03, AX | ||
6586 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6587 | |||
6588 | match_is_repeat_encodeBetterBlockAsm: | ||
6589 | MOVL 12(SP), BX | ||
6590 | CMPL BX, SI | ||
6591 | JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm | ||
6592 | MOVL SI, R8 | ||
6593 | MOVL SI, 12(SP) | ||
6594 | LEAQ (DX)(BX*1), R9 | ||
6595 | SUBL BX, R8 | ||
6596 | LEAL -1(R8), BX | ||
6597 | CMPL BX, $0x3c | ||
6598 | JB one_byte_match_emit_repeat_encodeBetterBlockAsm | ||
6599 | CMPL BX, $0x00000100 | ||
6600 | JB two_bytes_match_emit_repeat_encodeBetterBlockAsm | ||
6601 | CMPL BX, $0x00010000 | ||
6602 | JB three_bytes_match_emit_repeat_encodeBetterBlockAsm | ||
6603 | CMPL BX, $0x01000000 | ||
6604 | JB four_bytes_match_emit_repeat_encodeBetterBlockAsm | ||
6605 | MOVB $0xfc, (AX) | ||
6606 | MOVL BX, 1(AX) | ||
6607 | ADDQ $0x05, AX | ||
6608 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm | ||
6609 | |||
6610 | four_bytes_match_emit_repeat_encodeBetterBlockAsm: | ||
6611 | MOVL BX, R10 | ||
6612 | SHRL $0x10, R10 | ||
6613 | MOVB $0xf8, (AX) | ||
6614 | MOVW BX, 1(AX) | ||
6615 | MOVB R10, 3(AX) | ||
6616 | ADDQ $0x04, AX | ||
6617 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm | ||
6618 | |||
6619 | three_bytes_match_emit_repeat_encodeBetterBlockAsm: | ||
6620 | MOVB $0xf4, (AX) | ||
6621 | MOVW BX, 1(AX) | ||
6622 | ADDQ $0x03, AX | ||
6623 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm | ||
6624 | |||
6625 | two_bytes_match_emit_repeat_encodeBetterBlockAsm: | ||
6626 | MOVB $0xf0, (AX) | ||
6627 | MOVB BL, 1(AX) | ||
6628 | ADDQ $0x02, AX | ||
6629 | CMPL BX, $0x40 | ||
6630 | JB memmove_match_emit_repeat_encodeBetterBlockAsm | ||
6631 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm | ||
6632 | |||
6633 | one_byte_match_emit_repeat_encodeBetterBlockAsm: | ||
6634 | SHLB $0x02, BL | ||
6635 | MOVB BL, (AX) | ||
6636 | ADDQ $0x01, AX | ||
6637 | |||
6638 | memmove_match_emit_repeat_encodeBetterBlockAsm: | ||
6639 | LEAQ (AX)(R8*1), BX | ||
6640 | |||
6641 | // genMemMoveShort | ||
6642 | CMPQ R8, $0x04 | ||
6643 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 | ||
6644 | CMPQ R8, $0x08 | ||
6645 | JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 | ||
6646 | CMPQ R8, $0x10 | ||
6647 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 | ||
6648 | CMPQ R8, $0x20 | ||
6649 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 | ||
6650 | JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 | ||
6651 | |||
6652 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: | ||
6653 | MOVL (R9), R10 | ||
6654 | MOVL R10, (AX) | ||
6655 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm | ||
6656 | |||
6657 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: | ||
6658 | MOVL (R9), R10 | ||
6659 | MOVL -4(R9)(R8*1), R9 | ||
6660 | MOVL R10, (AX) | ||
6661 | MOVL R9, -4(AX)(R8*1) | ||
6662 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm | ||
6663 | |||
6664 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: | ||
6665 | MOVQ (R9), R10 | ||
6666 | MOVQ -8(R9)(R8*1), R9 | ||
6667 | MOVQ R10, (AX) | ||
6668 | MOVQ R9, -8(AX)(R8*1) | ||
6669 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm | ||
6670 | |||
6671 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: | ||
6672 | MOVOU (R9), X0 | ||
6673 | MOVOU -16(R9)(R8*1), X1 | ||
6674 | MOVOU X0, (AX) | ||
6675 | MOVOU X1, -16(AX)(R8*1) | ||
6676 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm | ||
6677 | |||
6678 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: | ||
6679 | MOVOU (R9), X0 | ||
6680 | MOVOU 16(R9), X1 | ||
6681 | MOVOU -32(R9)(R8*1), X2 | ||
6682 | MOVOU -16(R9)(R8*1), X3 | ||
6683 | MOVOU X0, (AX) | ||
6684 | MOVOU X1, 16(AX) | ||
6685 | MOVOU X2, -32(AX)(R8*1) | ||
6686 | MOVOU X3, -16(AX)(R8*1) | ||
6687 | |||
6688 | memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: | ||
6689 | MOVQ BX, AX | ||
6690 | JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm | ||
6691 | |||
6692 | memmove_long_match_emit_repeat_encodeBetterBlockAsm: | ||
6693 | LEAQ (AX)(R8*1), BX | ||
6694 | |||
6695 | // genMemMoveLong | ||
6696 | MOVOU (R9), X0 | ||
6697 | MOVOU 16(R9), X1 | ||
6698 | MOVOU -32(R9)(R8*1), X2 | ||
6699 | MOVOU -16(R9)(R8*1), X3 | ||
6700 | MOVQ R8, R12 | ||
6701 | SHRQ $0x05, R12 | ||
6702 | MOVQ AX, R10 | ||
6703 | ANDL $0x0000001f, R10 | ||
6704 | MOVQ $0x00000040, R13 | ||
6705 | SUBQ R10, R13 | ||
6706 | DECQ R12 | ||
6707 | JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 | ||
6708 | LEAQ -32(R9)(R13*1), R10 | ||
6709 | LEAQ -32(AX)(R13*1), R14 | ||
6710 | |||
6711 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: | ||
6712 | MOVOU (R10), X4 | ||
6713 | MOVOU 16(R10), X5 | ||
6714 | MOVOA X4, (R14) | ||
6715 | MOVOA X5, 16(R14) | ||
6716 | ADDQ $0x20, R14 | ||
6717 | ADDQ $0x20, R10 | ||
6718 | ADDQ $0x20, R13 | ||
6719 | DECQ R12 | ||
6720 | JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back | ||
6721 | |||
6722 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: | ||
6723 | MOVOU -32(R9)(R13*1), X4 | ||
6724 | MOVOU -16(R9)(R13*1), X5 | ||
6725 | MOVOA X4, -32(AX)(R13*1) | ||
6726 | MOVOA X5, -16(AX)(R13*1) | ||
6727 | ADDQ $0x20, R13 | ||
6728 | CMPQ R8, R13 | ||
6729 | JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 | ||
6730 | MOVOU X0, (AX) | ||
6731 | MOVOU X1, 16(AX) | ||
6732 | MOVOU X2, -32(AX)(R8*1) | ||
6733 | MOVOU X3, -16(AX)(R8*1) | ||
6734 | MOVQ BX, AX | ||
6735 | |||
6736 | emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: | ||
6737 | ADDL R11, CX | ||
6738 | ADDL $0x04, R11 | ||
6739 | MOVL CX, 12(SP) | ||
6740 | |||
6741 | // emitRepeat | ||
6742 | emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: | ||
6743 | MOVL R11, BX | ||
6744 | LEAL -4(R11), R11 | ||
6745 | CMPL BX, $0x08 | ||
6746 | JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm | ||
6747 | CMPL BX, $0x0c | ||
6748 | JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm | ||
6749 | CMPL DI, $0x00000800 | ||
6750 | JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm | ||
6751 | |||
6752 | cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: | ||
6753 | CMPL R11, $0x00000104 | ||
6754 | JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm | ||
6755 | CMPL R11, $0x00010100 | ||
6756 | JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm | ||
6757 | CMPL R11, $0x0100ffff | ||
6758 | JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm | ||
6759 | LEAL -16842747(R11), R11 | ||
6760 | MOVL $0xfffb001d, (AX) | ||
6761 | MOVB $0xff, 4(AX) | ||
6762 | ADDQ $0x05, AX | ||
6763 | JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm | ||
6764 | |||
6765 | repeat_five_match_nolit_repeat_encodeBetterBlockAsm: | ||
6766 | LEAL -65536(R11), R11 | ||
6767 | MOVL R11, DI | ||
6768 | MOVW $0x001d, (AX) | ||
6769 | MOVW R11, 2(AX) | ||
6770 | SARL $0x10, DI | ||
6771 | MOVB DI, 4(AX) | ||
6772 | ADDQ $0x05, AX | ||
6773 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6774 | |||
6775 | repeat_four_match_nolit_repeat_encodeBetterBlockAsm: | ||
6776 | LEAL -256(R11), R11 | ||
6777 | MOVW $0x0019, (AX) | ||
6778 | MOVW R11, 2(AX) | ||
6779 | ADDQ $0x04, AX | ||
6780 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6781 | |||
6782 | repeat_three_match_nolit_repeat_encodeBetterBlockAsm: | ||
6783 | LEAL -4(R11), R11 | ||
6784 | MOVW $0x0015, (AX) | ||
6785 | MOVB R11, 2(AX) | ||
6786 | ADDQ $0x03, AX | ||
6787 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6788 | |||
6789 | repeat_two_match_nolit_repeat_encodeBetterBlockAsm: | ||
6790 | SHLL $0x02, R11 | ||
6791 | ORL $0x01, R11 | ||
6792 | MOVW R11, (AX) | ||
6793 | ADDQ $0x02, AX | ||
6794 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm | ||
6795 | |||
6796 | repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: | ||
6797 | XORQ BX, BX | ||
6798 | LEAL 1(BX)(R11*4), R11 | ||
6799 | MOVB DI, 1(AX) | ||
6800 | SARL $0x08, DI | ||
6801 | SHLL $0x05, DI | ||
6802 | ORL DI, R11 | ||
6803 | MOVB R11, (AX) | ||
6804 | ADDQ $0x02, AX | ||
6805 | |||
6806 | match_nolit_emitcopy_end_encodeBetterBlockAsm: | ||
6807 | CMPL CX, 8(SP) | ||
6808 | JAE emit_remainder_encodeBetterBlockAsm | ||
6809 | CMPQ AX, (SP) | ||
6810 | JB match_nolit_dst_ok_encodeBetterBlockAsm | ||
6811 | MOVQ $0x00000000, ret+48(FP) | ||
6812 | RET | ||
6813 | |||
6814 | match_nolit_dst_ok_encodeBetterBlockAsm: | ||
6815 | MOVQ $0x00cf1bbcdcbfa563, BX | ||
6816 | MOVQ $0x9e3779b1, DI | ||
6817 | LEAQ 1(SI), SI | ||
6818 | LEAQ -2(CX), R8 | ||
6819 | MOVQ (DX)(SI*1), R9 | ||
6820 | MOVQ 1(DX)(SI*1), R10 | ||
6821 | MOVQ (DX)(R8*1), R11 | ||
6822 | MOVQ 1(DX)(R8*1), R12 | ||
6823 | SHLQ $0x08, R9 | ||
6824 | IMULQ BX, R9 | ||
6825 | SHRQ $0x2f, R9 | ||
6826 | SHLQ $0x20, R10 | ||
6827 | IMULQ DI, R10 | ||
6828 | SHRQ $0x32, R10 | ||
6829 | SHLQ $0x08, R11 | ||
6830 | IMULQ BX, R11 | ||
6831 | SHRQ $0x2f, R11 | ||
6832 | SHLQ $0x20, R12 | ||
6833 | IMULQ DI, R12 | ||
6834 | SHRQ $0x32, R12 | ||
6835 | LEAQ 1(SI), DI | ||
6836 | LEAQ 1(R8), R13 | ||
6837 | MOVL SI, 24(SP)(R9*4) | ||
6838 | MOVL R8, 24(SP)(R11*4) | ||
6839 | MOVL DI, 524312(SP)(R10*4) | ||
6840 | MOVL R13, 524312(SP)(R12*4) | ||
6841 | LEAQ 1(R8)(SI*1), DI | ||
6842 | SHRQ $0x01, DI | ||
6843 | ADDQ $0x01, SI | ||
6844 | SUBQ $0x01, R8 | ||
6845 | |||
6846 | index_loop_encodeBetterBlockAsm: | ||
6847 | CMPQ DI, R8 | ||
6848 | JAE search_loop_encodeBetterBlockAsm | ||
6849 | MOVQ (DX)(SI*1), R9 | ||
6850 | MOVQ (DX)(DI*1), R10 | ||
6851 | SHLQ $0x08, R9 | ||
6852 | IMULQ BX, R9 | ||
6853 | SHRQ $0x2f, R9 | ||
6854 | SHLQ $0x08, R10 | ||
6855 | IMULQ BX, R10 | ||
6856 | SHRQ $0x2f, R10 | ||
6857 | MOVL SI, 24(SP)(R9*4) | ||
6858 | MOVL DI, 24(SP)(R10*4) | ||
6859 | ADDQ $0x02, SI | ||
6860 | ADDQ $0x02, DI | ||
6861 | JMP index_loop_encodeBetterBlockAsm | ||
6862 | |||
6863 | emit_remainder_encodeBetterBlockAsm: | ||
6864 | MOVQ src_len+32(FP), CX | ||
6865 | SUBL 12(SP), CX | ||
6866 | LEAQ 5(AX)(CX*1), CX | ||
6867 | CMPQ CX, (SP) | ||
6868 | JB emit_remainder_ok_encodeBetterBlockAsm | ||
6869 | MOVQ $0x00000000, ret+48(FP) | ||
6870 | RET | ||
6871 | |||
6872 | emit_remainder_ok_encodeBetterBlockAsm: | ||
6873 | MOVQ src_len+32(FP), CX | ||
6874 | MOVL 12(SP), BX | ||
6875 | CMPL BX, CX | ||
6876 | JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm | ||
6877 | MOVL CX, SI | ||
6878 | MOVL CX, 12(SP) | ||
6879 | LEAQ (DX)(BX*1), CX | ||
6880 | SUBL BX, SI | ||
6881 | LEAL -1(SI), DX | ||
6882 | CMPL DX, $0x3c | ||
6883 | JB one_byte_emit_remainder_encodeBetterBlockAsm | ||
6884 | CMPL DX, $0x00000100 | ||
6885 | JB two_bytes_emit_remainder_encodeBetterBlockAsm | ||
6886 | CMPL DX, $0x00010000 | ||
6887 | JB three_bytes_emit_remainder_encodeBetterBlockAsm | ||
6888 | CMPL DX, $0x01000000 | ||
6889 | JB four_bytes_emit_remainder_encodeBetterBlockAsm | ||
6890 | MOVB $0xfc, (AX) | ||
6891 | MOVL DX, 1(AX) | ||
6892 | ADDQ $0x05, AX | ||
6893 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm | ||
6894 | |||
6895 | four_bytes_emit_remainder_encodeBetterBlockAsm: | ||
6896 | MOVL DX, BX | ||
6897 | SHRL $0x10, BX | ||
6898 | MOVB $0xf8, (AX) | ||
6899 | MOVW DX, 1(AX) | ||
6900 | MOVB BL, 3(AX) | ||
6901 | ADDQ $0x04, AX | ||
6902 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm | ||
6903 | |||
6904 | three_bytes_emit_remainder_encodeBetterBlockAsm: | ||
6905 | MOVB $0xf4, (AX) | ||
6906 | MOVW DX, 1(AX) | ||
6907 | ADDQ $0x03, AX | ||
6908 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm | ||
6909 | |||
6910 | two_bytes_emit_remainder_encodeBetterBlockAsm: | ||
6911 | MOVB $0xf0, (AX) | ||
6912 | MOVB DL, 1(AX) | ||
6913 | ADDQ $0x02, AX | ||
6914 | CMPL DX, $0x40 | ||
6915 | JB memmove_emit_remainder_encodeBetterBlockAsm | ||
6916 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm | ||
6917 | |||
6918 | one_byte_emit_remainder_encodeBetterBlockAsm: | ||
6919 | SHLB $0x02, DL | ||
6920 | MOVB DL, (AX) | ||
6921 | ADDQ $0x01, AX | ||
6922 | |||
6923 | memmove_emit_remainder_encodeBetterBlockAsm: | ||
6924 | LEAQ (AX)(SI*1), DX | ||
6925 | MOVL SI, BX | ||
6926 | |||
6927 | // genMemMoveShort | ||
6928 | CMPQ BX, $0x03 | ||
6929 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 | ||
6930 | JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 | ||
6931 | CMPQ BX, $0x08 | ||
6932 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 | ||
6933 | CMPQ BX, $0x10 | ||
6934 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 | ||
6935 | CMPQ BX, $0x20 | ||
6936 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 | ||
6937 | JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 | ||
6938 | |||
6939 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: | ||
6940 | MOVB (CX), SI | ||
6941 | MOVB -1(CX)(BX*1), CL | ||
6942 | MOVB SI, (AX) | ||
6943 | MOVB CL, -1(AX)(BX*1) | ||
6944 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm | ||
6945 | |||
6946 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: | ||
6947 | MOVW (CX), SI | ||
6948 | MOVB 2(CX), CL | ||
6949 | MOVW SI, (AX) | ||
6950 | MOVB CL, 2(AX) | ||
6951 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm | ||
6952 | |||
6953 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: | ||
6954 | MOVL (CX), SI | ||
6955 | MOVL -4(CX)(BX*1), CX | ||
6956 | MOVL SI, (AX) | ||
6957 | MOVL CX, -4(AX)(BX*1) | ||
6958 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm | ||
6959 | |||
6960 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: | ||
6961 | MOVQ (CX), SI | ||
6962 | MOVQ -8(CX)(BX*1), CX | ||
6963 | MOVQ SI, (AX) | ||
6964 | MOVQ CX, -8(AX)(BX*1) | ||
6965 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm | ||
6966 | |||
6967 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: | ||
6968 | MOVOU (CX), X0 | ||
6969 | MOVOU -16(CX)(BX*1), X1 | ||
6970 | MOVOU X0, (AX) | ||
6971 | MOVOU X1, -16(AX)(BX*1) | ||
6972 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm | ||
6973 | |||
6974 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: | ||
6975 | MOVOU (CX), X0 | ||
6976 | MOVOU 16(CX), X1 | ||
6977 | MOVOU -32(CX)(BX*1), X2 | ||
6978 | MOVOU -16(CX)(BX*1), X3 | ||
6979 | MOVOU X0, (AX) | ||
6980 | MOVOU X1, 16(AX) | ||
6981 | MOVOU X2, -32(AX)(BX*1) | ||
6982 | MOVOU X3, -16(AX)(BX*1) | ||
6983 | |||
6984 | memmove_end_copy_emit_remainder_encodeBetterBlockAsm: | ||
6985 | MOVQ DX, AX | ||
6986 | JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm | ||
6987 | |||
6988 | memmove_long_emit_remainder_encodeBetterBlockAsm: | ||
6989 | LEAQ (AX)(SI*1), DX | ||
6990 | MOVL SI, BX | ||
6991 | |||
6992 | // genMemMoveLong | ||
6993 | MOVOU (CX), X0 | ||
6994 | MOVOU 16(CX), X1 | ||
6995 | MOVOU -32(CX)(BX*1), X2 | ||
6996 | MOVOU -16(CX)(BX*1), X3 | ||
6997 | MOVQ BX, DI | ||
6998 | SHRQ $0x05, DI | ||
6999 | MOVQ AX, SI | ||
7000 | ANDL $0x0000001f, SI | ||
7001 | MOVQ $0x00000040, R8 | ||
7002 | SUBQ SI, R8 | ||
7003 | DECQ DI | ||
7004 | JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 | ||
7005 | LEAQ -32(CX)(R8*1), SI | ||
7006 | LEAQ -32(AX)(R8*1), R9 | ||
7007 | |||
7008 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: | ||
7009 | MOVOU (SI), X4 | ||
7010 | MOVOU 16(SI), X5 | ||
7011 | MOVOA X4, (R9) | ||
7012 | MOVOA X5, 16(R9) | ||
7013 | ADDQ $0x20, R9 | ||
7014 | ADDQ $0x20, SI | ||
7015 | ADDQ $0x20, R8 | ||
7016 | DECQ DI | ||
7017 | JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back | ||
7018 | |||
7019 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: | ||
7020 | MOVOU -32(CX)(R8*1), X4 | ||
7021 | MOVOU -16(CX)(R8*1), X5 | ||
7022 | MOVOA X4, -32(AX)(R8*1) | ||
7023 | MOVOA X5, -16(AX)(R8*1) | ||
7024 | ADDQ $0x20, R8 | ||
7025 | CMPQ BX, R8 | ||
7026 | JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 | ||
7027 | MOVOU X0, (AX) | ||
7028 | MOVOU X1, 16(AX) | ||
7029 | MOVOU X2, -32(AX)(BX*1) | ||
7030 | MOVOU X3, -16(AX)(BX*1) | ||
7031 | MOVQ DX, AX | ||
7032 | |||
7033 | emit_literal_done_emit_remainder_encodeBetterBlockAsm: | ||
7034 | MOVQ dst_base+0(FP), CX | ||
7035 | SUBQ CX, AX | ||
7036 | MOVQ AX, ret+48(FP) | ||
7037 | RET | ||
7038 | |||
7039 | // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int | ||
7040 | // Requires: BMI, SSE2 | ||
7041 | TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 | ||
7042 | MOVQ dst_base+0(FP), AX | ||
7043 | MOVQ $0x00001200, CX | ||
7044 | LEAQ 24(SP), DX | ||
7045 | PXOR X0, X0 | ||
7046 | |||
7047 | zero_loop_encodeBetterBlockAsm4MB: | ||
7048 | MOVOU X0, (DX) | ||
7049 | MOVOU X0, 16(DX) | ||
7050 | MOVOU X0, 32(DX) | ||
7051 | MOVOU X0, 48(DX) | ||
7052 | MOVOU X0, 64(DX) | ||
7053 | MOVOU X0, 80(DX) | ||
7054 | MOVOU X0, 96(DX) | ||
7055 | MOVOU X0, 112(DX) | ||
7056 | ADDQ $0x80, DX | ||
7057 | DECQ CX | ||
7058 | JNZ zero_loop_encodeBetterBlockAsm4MB | ||
7059 | MOVL $0x00000000, 12(SP) | ||
7060 | MOVQ src_len+32(FP), CX | ||
7061 | LEAQ -6(CX), DX | ||
7062 | LEAQ -8(CX), BX | ||
7063 | MOVL BX, 8(SP) | ||
7064 | SHRQ $0x05, CX | ||
7065 | SUBL CX, DX | ||
7066 | LEAQ (AX)(DX*1), DX | ||
7067 | MOVQ DX, (SP) | ||
7068 | MOVL $0x00000001, CX | ||
7069 | MOVL $0x00000000, 16(SP) | ||
7070 | MOVQ src_base+24(FP), DX | ||
7071 | |||
7072 | search_loop_encodeBetterBlockAsm4MB: | ||
7073 | MOVL CX, BX | ||
7074 | SUBL 12(SP), BX | ||
7075 | SHRL $0x07, BX | ||
7076 | CMPL BX, $0x63 | ||
7077 | JBE check_maxskip_ok_encodeBetterBlockAsm4MB | ||
7078 | LEAL 100(CX), BX | ||
7079 | JMP check_maxskip_cont_encodeBetterBlockAsm4MB | ||
7080 | |||
7081 | check_maxskip_ok_encodeBetterBlockAsm4MB: | ||
7082 | LEAL 1(CX)(BX*1), BX | ||
7083 | |||
7084 | check_maxskip_cont_encodeBetterBlockAsm4MB: | ||
7085 | CMPL BX, 8(SP) | ||
7086 | JAE emit_remainder_encodeBetterBlockAsm4MB | ||
7087 | MOVQ (DX)(CX*1), SI | ||
7088 | MOVL BX, 20(SP) | ||
7089 | MOVQ $0x00cf1bbcdcbfa563, R8 | ||
7090 | MOVQ $0x9e3779b1, BX | ||
7091 | MOVQ SI, R9 | ||
7092 | MOVQ SI, R10 | ||
7093 | SHLQ $0x08, R9 | ||
7094 | IMULQ R8, R9 | ||
7095 | SHRQ $0x2f, R9 | ||
7096 | SHLQ $0x20, R10 | ||
7097 | IMULQ BX, R10 | ||
7098 | SHRQ $0x32, R10 | ||
7099 | MOVL 24(SP)(R9*4), BX | ||
7100 | MOVL 524312(SP)(R10*4), DI | ||
7101 | MOVL CX, 24(SP)(R9*4) | ||
7102 | MOVL CX, 524312(SP)(R10*4) | ||
7103 | MOVQ (DX)(BX*1), R9 | ||
7104 | MOVQ (DX)(DI*1), R10 | ||
7105 | CMPQ R9, SI | ||
7106 | JEQ candidate_match_encodeBetterBlockAsm4MB | ||
7107 | CMPQ R10, SI | ||
7108 | JNE no_short_found_encodeBetterBlockAsm4MB | ||
7109 | MOVL DI, BX | ||
7110 | JMP candidate_match_encodeBetterBlockAsm4MB | ||
7111 | |||
7112 | no_short_found_encodeBetterBlockAsm4MB: | ||
7113 | CMPL R9, SI | ||
7114 | JEQ candidate_match_encodeBetterBlockAsm4MB | ||
7115 | CMPL R10, SI | ||
7116 | JEQ candidateS_match_encodeBetterBlockAsm4MB | ||
7117 | MOVL 20(SP), CX | ||
7118 | JMP search_loop_encodeBetterBlockAsm4MB | ||
7119 | |||
7120 | candidateS_match_encodeBetterBlockAsm4MB: | ||
7121 | SHRQ $0x08, SI | ||
7122 | MOVQ SI, R9 | ||
7123 | SHLQ $0x08, R9 | ||
7124 | IMULQ R8, R9 | ||
7125 | SHRQ $0x2f, R9 | ||
7126 | MOVL 24(SP)(R9*4), BX | ||
7127 | INCL CX | ||
7128 | MOVL CX, 24(SP)(R9*4) | ||
7129 | CMPL (DX)(BX*1), SI | ||
7130 | JEQ candidate_match_encodeBetterBlockAsm4MB | ||
7131 | DECL CX | ||
7132 | MOVL DI, BX | ||
7133 | |||
7134 | candidate_match_encodeBetterBlockAsm4MB: | ||
7135 | MOVL 12(SP), SI | ||
7136 | TESTL BX, BX | ||
7137 | JZ match_extend_back_end_encodeBetterBlockAsm4MB | ||
7138 | |||
7139 | match_extend_back_loop_encodeBetterBlockAsm4MB: | ||
7140 | CMPL CX, SI | ||
7141 | JBE match_extend_back_end_encodeBetterBlockAsm4MB | ||
7142 | MOVB -1(DX)(BX*1), DI | ||
7143 | MOVB -1(DX)(CX*1), R8 | ||
7144 | CMPB DI, R8 | ||
7145 | JNE match_extend_back_end_encodeBetterBlockAsm4MB | ||
7146 | LEAL -1(CX), CX | ||
7147 | DECL BX | ||
7148 | JZ match_extend_back_end_encodeBetterBlockAsm4MB | ||
7149 | JMP match_extend_back_loop_encodeBetterBlockAsm4MB | ||
7150 | |||
7151 | match_extend_back_end_encodeBetterBlockAsm4MB: | ||
7152 | MOVL CX, SI | ||
7153 | SUBL 12(SP), SI | ||
7154 | LEAQ 4(AX)(SI*1), SI | ||
7155 | CMPQ SI, (SP) | ||
7156 | JB match_dst_size_check_encodeBetterBlockAsm4MB | ||
7157 | MOVQ $0x00000000, ret+48(FP) | ||
7158 | RET | ||
7159 | |||
7160 | match_dst_size_check_encodeBetterBlockAsm4MB: | ||
7161 | MOVL CX, SI | ||
7162 | ADDL $0x04, CX | ||
7163 | ADDL $0x04, BX | ||
7164 | MOVQ src_len+32(FP), DI | ||
7165 | SUBL CX, DI | ||
7166 | LEAQ (DX)(CX*1), R8 | ||
7167 | LEAQ (DX)(BX*1), R9 | ||
7168 | |||
7169 | // matchLen | ||
7170 | XORL R11, R11 | ||
7171 | |||
7172 | matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB: | ||
7173 | CMPL DI, $0x10 | ||
7174 | JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB | ||
7175 | MOVQ (R8)(R11*1), R10 | ||
7176 | MOVQ 8(R8)(R11*1), R12 | ||
7177 | XORQ (R9)(R11*1), R10 | ||
7178 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB | ||
7179 | XORQ 8(R9)(R11*1), R12 | ||
7180 | JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB | ||
7181 | LEAL -16(DI), DI | ||
7182 | LEAL 16(R11), R11 | ||
7183 | JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB | ||
7184 | |||
7185 | matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB: | ||
7186 | #ifdef GOAMD64_v3 | ||
7187 | TZCNTQ R12, R12 | ||
7188 | |||
7189 | #else | ||
7190 | BSFQ R12, R12 | ||
7191 | |||
7192 | #endif | ||
7193 | SARQ $0x03, R12 | ||
7194 | LEAL 8(R11)(R12*1), R11 | ||
7195 | JMP match_nolit_end_encodeBetterBlockAsm4MB | ||
7196 | |||
7197 | matchlen_match8_match_nolit_encodeBetterBlockAsm4MB: | ||
7198 | CMPL DI, $0x08 | ||
7199 | JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB | ||
7200 | MOVQ (R8)(R11*1), R10 | ||
7201 | XORQ (R9)(R11*1), R10 | ||
7202 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB | ||
7203 | LEAL -8(DI), DI | ||
7204 | LEAL 8(R11), R11 | ||
7205 | JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB | ||
7206 | |||
7207 | matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB: | ||
7208 | #ifdef GOAMD64_v3 | ||
7209 | TZCNTQ R10, R10 | ||
7210 | |||
7211 | #else | ||
7212 | BSFQ R10, R10 | ||
7213 | |||
7214 | #endif | ||
7215 | SARQ $0x03, R10 | ||
7216 | LEAL (R11)(R10*1), R11 | ||
7217 | JMP match_nolit_end_encodeBetterBlockAsm4MB | ||
7218 | |||
7219 | matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: | ||
7220 | CMPL DI, $0x04 | ||
7221 | JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB | ||
7222 | MOVL (R8)(R11*1), R10 | ||
7223 | CMPL (R9)(R11*1), R10 | ||
7224 | JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB | ||
7225 | LEAL -4(DI), DI | ||
7226 | LEAL 4(R11), R11 | ||
7227 | |||
7228 | matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: | ||
7229 | CMPL DI, $0x01 | ||
7230 | JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB | ||
7231 | JB match_nolit_end_encodeBetterBlockAsm4MB | ||
7232 | MOVW (R8)(R11*1), R10 | ||
7233 | CMPW (R9)(R11*1), R10 | ||
7234 | JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB | ||
7235 | LEAL 2(R11), R11 | ||
7236 | SUBL $0x02, DI | ||
7237 | JZ match_nolit_end_encodeBetterBlockAsm4MB | ||
7238 | |||
7239 | matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: | ||
7240 | MOVB (R8)(R11*1), R10 | ||
7241 | CMPB (R9)(R11*1), R10 | ||
7242 | JNE match_nolit_end_encodeBetterBlockAsm4MB | ||
7243 | LEAL 1(R11), R11 | ||
7244 | |||
7245 | match_nolit_end_encodeBetterBlockAsm4MB: | ||
7246 | MOVL CX, DI | ||
7247 | SUBL BX, DI | ||
7248 | |||
7249 | // Check if repeat | ||
7250 | CMPL 16(SP), DI | ||
7251 | JEQ match_is_repeat_encodeBetterBlockAsm4MB | ||
7252 | CMPL R11, $0x01 | ||
7253 | JA match_length_ok_encodeBetterBlockAsm4MB | ||
7254 | CMPL DI, $0x0000ffff | ||
7255 | JBE match_length_ok_encodeBetterBlockAsm4MB | ||
7256 | MOVL 20(SP), CX | ||
7257 | INCL CX | ||
7258 | JMP search_loop_encodeBetterBlockAsm4MB | ||
7259 | |||
7260 | match_length_ok_encodeBetterBlockAsm4MB: | ||
7261 | MOVL DI, 16(SP) | ||
7262 | MOVL 12(SP), BX | ||
7263 | CMPL BX, SI | ||
7264 | JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB | ||
7265 | MOVL SI, R8 | ||
7266 | MOVL SI, 12(SP) | ||
7267 | LEAQ (DX)(BX*1), R9 | ||
7268 | SUBL BX, R8 | ||
7269 | LEAL -1(R8), BX | ||
7270 | CMPL BX, $0x3c | ||
7271 | JB one_byte_match_emit_encodeBetterBlockAsm4MB | ||
7272 | CMPL BX, $0x00000100 | ||
7273 | JB two_bytes_match_emit_encodeBetterBlockAsm4MB | ||
7274 | CMPL BX, $0x00010000 | ||
7275 | JB three_bytes_match_emit_encodeBetterBlockAsm4MB | ||
7276 | MOVL BX, R10 | ||
7277 | SHRL $0x10, R10 | ||
7278 | MOVB $0xf8, (AX) | ||
7279 | MOVW BX, 1(AX) | ||
7280 | MOVB R10, 3(AX) | ||
7281 | ADDQ $0x04, AX | ||
7282 | JMP memmove_long_match_emit_encodeBetterBlockAsm4MB | ||
7283 | |||
7284 | three_bytes_match_emit_encodeBetterBlockAsm4MB: | ||
7285 | MOVB $0xf4, (AX) | ||
7286 | MOVW BX, 1(AX) | ||
7287 | ADDQ $0x03, AX | ||
7288 | JMP memmove_long_match_emit_encodeBetterBlockAsm4MB | ||
7289 | |||
7290 | two_bytes_match_emit_encodeBetterBlockAsm4MB: | ||
7291 | MOVB $0xf0, (AX) | ||
7292 | MOVB BL, 1(AX) | ||
7293 | ADDQ $0x02, AX | ||
7294 | CMPL BX, $0x40 | ||
7295 | JB memmove_match_emit_encodeBetterBlockAsm4MB | ||
7296 | JMP memmove_long_match_emit_encodeBetterBlockAsm4MB | ||
7297 | |||
7298 | one_byte_match_emit_encodeBetterBlockAsm4MB: | ||
7299 | SHLB $0x02, BL | ||
7300 | MOVB BL, (AX) | ||
7301 | ADDQ $0x01, AX | ||
7302 | |||
7303 | memmove_match_emit_encodeBetterBlockAsm4MB: | ||
7304 | LEAQ (AX)(R8*1), BX | ||
7305 | |||
7306 | // genMemMoveShort | ||
7307 | CMPQ R8, $0x04 | ||
7308 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 | ||
7309 | CMPQ R8, $0x08 | ||
7310 | JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 | ||
7311 | CMPQ R8, $0x10 | ||
7312 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 | ||
7313 | CMPQ R8, $0x20 | ||
7314 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 | ||
7315 | JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 | ||
7316 | |||
7317 | emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: | ||
7318 | MOVL (R9), R10 | ||
7319 | MOVL R10, (AX) | ||
7320 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB | ||
7321 | |||
7322 | emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: | ||
7323 | MOVL (R9), R10 | ||
7324 | MOVL -4(R9)(R8*1), R9 | ||
7325 | MOVL R10, (AX) | ||
7326 | MOVL R9, -4(AX)(R8*1) | ||
7327 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB | ||
7328 | |||
7329 | emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: | ||
7330 | MOVQ (R9), R10 | ||
7331 | MOVQ -8(R9)(R8*1), R9 | ||
7332 | MOVQ R10, (AX) | ||
7333 | MOVQ R9, -8(AX)(R8*1) | ||
7334 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB | ||
7335 | |||
7336 | emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: | ||
7337 | MOVOU (R9), X0 | ||
7338 | MOVOU -16(R9)(R8*1), X1 | ||
7339 | MOVOU X0, (AX) | ||
7340 | MOVOU X1, -16(AX)(R8*1) | ||
7341 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB | ||
7342 | |||
7343 | emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: | ||
7344 | MOVOU (R9), X0 | ||
7345 | MOVOU 16(R9), X1 | ||
7346 | MOVOU -32(R9)(R8*1), X2 | ||
7347 | MOVOU -16(R9)(R8*1), X3 | ||
7348 | MOVOU X0, (AX) | ||
7349 | MOVOU X1, 16(AX) | ||
7350 | MOVOU X2, -32(AX)(R8*1) | ||
7351 | MOVOU X3, -16(AX)(R8*1) | ||
7352 | |||
7353 | memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: | ||
7354 | MOVQ BX, AX | ||
7355 | JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB | ||
7356 | |||
7357 | memmove_long_match_emit_encodeBetterBlockAsm4MB: | ||
7358 | LEAQ (AX)(R8*1), BX | ||
7359 | |||
7360 | // genMemMoveLong | ||
7361 | MOVOU (R9), X0 | ||
7362 | MOVOU 16(R9), X1 | ||
7363 | MOVOU -32(R9)(R8*1), X2 | ||
7364 | MOVOU -16(R9)(R8*1), X3 | ||
7365 | MOVQ R8, R12 | ||
7366 | SHRQ $0x05, R12 | ||
7367 | MOVQ AX, R10 | ||
7368 | ANDL $0x0000001f, R10 | ||
7369 | MOVQ $0x00000040, R13 | ||
7370 | SUBQ R10, R13 | ||
7371 | DECQ R12 | ||
7372 | JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 | ||
7373 | LEAQ -32(R9)(R13*1), R10 | ||
7374 | LEAQ -32(AX)(R13*1), R14 | ||
7375 | |||
7376 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: | ||
7377 | MOVOU (R10), X4 | ||
7378 | MOVOU 16(R10), X5 | ||
7379 | MOVOA X4, (R14) | ||
7380 | MOVOA X5, 16(R14) | ||
7381 | ADDQ $0x20, R14 | ||
7382 | ADDQ $0x20, R10 | ||
7383 | ADDQ $0x20, R13 | ||
7384 | DECQ R12 | ||
7385 | JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back | ||
7386 | |||
7387 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: | ||
7388 | MOVOU -32(R9)(R13*1), X4 | ||
7389 | MOVOU -16(R9)(R13*1), X5 | ||
7390 | MOVOA X4, -32(AX)(R13*1) | ||
7391 | MOVOA X5, -16(AX)(R13*1) | ||
7392 | ADDQ $0x20, R13 | ||
7393 | CMPQ R8, R13 | ||
7394 | JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 | ||
7395 | MOVOU X0, (AX) | ||
7396 | MOVOU X1, 16(AX) | ||
7397 | MOVOU X2, -32(AX)(R8*1) | ||
7398 | MOVOU X3, -16(AX)(R8*1) | ||
7399 | MOVQ BX, AX | ||
7400 | |||
7401 | emit_literal_done_match_emit_encodeBetterBlockAsm4MB: | ||
7402 | ADDL R11, CX | ||
7403 | ADDL $0x04, R11 | ||
7404 | MOVL CX, 12(SP) | ||
7405 | |||
7406 | // emitCopy | ||
7407 | CMPL DI, $0x00010000 | ||
7408 | JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB | ||
7409 | CMPL R11, $0x40 | ||
7410 | JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB | ||
7411 | MOVB $0xff, (AX) | ||
7412 | MOVL DI, 1(AX) | ||
7413 | LEAL -64(R11), R11 | ||
7414 | ADDQ $0x05, AX | ||
7415 | CMPL R11, $0x04 | ||
7416 | JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB | ||
7417 | |||
7418 | // emitRepeat | ||
7419 | MOVL R11, BX | ||
7420 | LEAL -4(R11), R11 | ||
7421 | CMPL BX, $0x08 | ||
7422 | JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy | ||
7423 | CMPL BX, $0x0c | ||
7424 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy | ||
7425 | CMPL DI, $0x00000800 | ||
7426 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy | ||
7427 | |||
7428 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: | ||
7429 | CMPL R11, $0x00000104 | ||
7430 | JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy | ||
7431 | CMPL R11, $0x00010100 | ||
7432 | JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy | ||
7433 | LEAL -65536(R11), R11 | ||
7434 | MOVL R11, DI | ||
7435 | MOVW $0x001d, (AX) | ||
7436 | MOVW R11, 2(AX) | ||
7437 | SARL $0x10, DI | ||
7438 | MOVB DI, 4(AX) | ||
7439 | ADDQ $0x05, AX | ||
7440 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7441 | |||
7442 | repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: | ||
7443 | LEAL -256(R11), R11 | ||
7444 | MOVW $0x0019, (AX) | ||
7445 | MOVW R11, 2(AX) | ||
7446 | ADDQ $0x04, AX | ||
7447 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7448 | |||
7449 | repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: | ||
7450 | LEAL -4(R11), R11 | ||
7451 | MOVW $0x0015, (AX) | ||
7452 | MOVB R11, 2(AX) | ||
7453 | ADDQ $0x03, AX | ||
7454 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7455 | |||
7456 | repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: | ||
7457 | SHLL $0x02, R11 | ||
7458 | ORL $0x01, R11 | ||
7459 | MOVW R11, (AX) | ||
7460 | ADDQ $0x02, AX | ||
7461 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7462 | |||
7463 | repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: | ||
7464 | XORQ BX, BX | ||
7465 | LEAL 1(BX)(R11*4), R11 | ||
7466 | MOVB DI, 1(AX) | ||
7467 | SARL $0x08, DI | ||
7468 | SHLL $0x05, DI | ||
7469 | ORL DI, R11 | ||
7470 | MOVB R11, (AX) | ||
7471 | ADDQ $0x02, AX | ||
7472 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7473 | |||
7474 | four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: | ||
7475 | TESTL R11, R11 | ||
7476 | JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7477 | XORL BX, BX | ||
7478 | LEAL -1(BX)(R11*4), R11 | ||
7479 | MOVB R11, (AX) | ||
7480 | MOVL DI, 1(AX) | ||
7481 | ADDQ $0x05, AX | ||
7482 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7483 | |||
7484 | two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: | ||
7485 | CMPL R11, $0x40 | ||
7486 | JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB | ||
7487 | CMPL DI, $0x00000800 | ||
7488 | JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB | ||
7489 | MOVL $0x00000001, BX | ||
7490 | LEAL 16(BX), BX | ||
7491 | MOVB DI, 1(AX) | ||
7492 | SHRL $0x08, DI | ||
7493 | SHLL $0x05, DI | ||
7494 | ORL DI, BX | ||
7495 | MOVB BL, (AX) | ||
7496 | ADDQ $0x02, AX | ||
7497 | SUBL $0x08, R11 | ||
7498 | |||
7499 | // emitRepeat | ||
7500 | LEAL -4(R11), R11 | ||
7501 | JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b | ||
7502 | MOVL R11, BX | ||
7503 | LEAL -4(R11), R11 | ||
7504 | CMPL BX, $0x08 | ||
7505 | JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b | ||
7506 | CMPL BX, $0x0c | ||
7507 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b | ||
7508 | CMPL DI, $0x00000800 | ||
7509 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b | ||
7510 | |||
7511 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: | ||
7512 | CMPL R11, $0x00000104 | ||
7513 | JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b | ||
7514 | CMPL R11, $0x00010100 | ||
7515 | JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b | ||
7516 | LEAL -65536(R11), R11 | ||
7517 | MOVL R11, DI | ||
7518 | MOVW $0x001d, (AX) | ||
7519 | MOVW R11, 2(AX) | ||
7520 | SARL $0x10, DI | ||
7521 | MOVB DI, 4(AX) | ||
7522 | ADDQ $0x05, AX | ||
7523 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7524 | |||
7525 | repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: | ||
7526 | LEAL -256(R11), R11 | ||
7527 | MOVW $0x0019, (AX) | ||
7528 | MOVW R11, 2(AX) | ||
7529 | ADDQ $0x04, AX | ||
7530 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7531 | |||
7532 | repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: | ||
7533 | LEAL -4(R11), R11 | ||
7534 | MOVW $0x0015, (AX) | ||
7535 | MOVB R11, 2(AX) | ||
7536 | ADDQ $0x03, AX | ||
7537 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7538 | |||
7539 | repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: | ||
7540 | SHLL $0x02, R11 | ||
7541 | ORL $0x01, R11 | ||
7542 | MOVW R11, (AX) | ||
7543 | ADDQ $0x02, AX | ||
7544 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7545 | |||
7546 | repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: | ||
7547 | XORQ BX, BX | ||
7548 | LEAL 1(BX)(R11*4), R11 | ||
7549 | MOVB DI, 1(AX) | ||
7550 | SARL $0x08, DI | ||
7551 | SHLL $0x05, DI | ||
7552 | ORL DI, R11 | ||
7553 | MOVB R11, (AX) | ||
7554 | ADDQ $0x02, AX | ||
7555 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7556 | |||
7557 | long_offset_short_match_nolit_encodeBetterBlockAsm4MB: | ||
7558 | MOVB $0xee, (AX) | ||
7559 | MOVW DI, 1(AX) | ||
7560 | LEAL -60(R11), R11 | ||
7561 | ADDQ $0x03, AX | ||
7562 | |||
7563 | // emitRepeat | ||
7564 | MOVL R11, BX | ||
7565 | LEAL -4(R11), R11 | ||
7566 | CMPL BX, $0x08 | ||
7567 | JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short | ||
7568 | CMPL BX, $0x0c | ||
7569 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short | ||
7570 | CMPL DI, $0x00000800 | ||
7571 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short | ||
7572 | |||
7573 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: | ||
7574 | CMPL R11, $0x00000104 | ||
7575 | JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short | ||
7576 | CMPL R11, $0x00010100 | ||
7577 | JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short | ||
7578 | LEAL -65536(R11), R11 | ||
7579 | MOVL R11, DI | ||
7580 | MOVW $0x001d, (AX) | ||
7581 | MOVW R11, 2(AX) | ||
7582 | SARL $0x10, DI | ||
7583 | MOVB DI, 4(AX) | ||
7584 | ADDQ $0x05, AX | ||
7585 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7586 | |||
7587 | repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: | ||
7588 | LEAL -256(R11), R11 | ||
7589 | MOVW $0x0019, (AX) | ||
7590 | MOVW R11, 2(AX) | ||
7591 | ADDQ $0x04, AX | ||
7592 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7593 | |||
7594 | repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: | ||
7595 | LEAL -4(R11), R11 | ||
7596 | MOVW $0x0015, (AX) | ||
7597 | MOVB R11, 2(AX) | ||
7598 | ADDQ $0x03, AX | ||
7599 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7600 | |||
7601 | repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: | ||
7602 | SHLL $0x02, R11 | ||
7603 | ORL $0x01, R11 | ||
7604 | MOVW R11, (AX) | ||
7605 | ADDQ $0x02, AX | ||
7606 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7607 | |||
7608 | repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: | ||
7609 | XORQ BX, BX | ||
7610 | LEAL 1(BX)(R11*4), R11 | ||
7611 | MOVB DI, 1(AX) | ||
7612 | SARL $0x08, DI | ||
7613 | SHLL $0x05, DI | ||
7614 | ORL DI, R11 | ||
7615 | MOVB R11, (AX) | ||
7616 | ADDQ $0x02, AX | ||
7617 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7618 | |||
7619 | two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: | ||
7620 | MOVL R11, BX | ||
7621 | SHLL $0x02, BX | ||
7622 | CMPL R11, $0x0c | ||
7623 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB | ||
7624 | CMPL DI, $0x00000800 | ||
7625 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB | ||
7626 | LEAL -15(BX), BX | ||
7627 | MOVB DI, 1(AX) | ||
7628 | SHRL $0x08, DI | ||
7629 | SHLL $0x05, DI | ||
7630 | ORL DI, BX | ||
7631 | MOVB BL, (AX) | ||
7632 | ADDQ $0x02, AX | ||
7633 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7634 | |||
7635 | emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: | ||
7636 | LEAL -2(BX), BX | ||
7637 | MOVB BL, (AX) | ||
7638 | MOVW DI, 1(AX) | ||
7639 | ADDQ $0x03, AX | ||
7640 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7641 | |||
7642 | match_is_repeat_encodeBetterBlockAsm4MB: | ||
7643 | MOVL 12(SP), BX | ||
7644 | CMPL BX, SI | ||
7645 | JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7646 | MOVL SI, R8 | ||
7647 | MOVL SI, 12(SP) | ||
7648 | LEAQ (DX)(BX*1), R9 | ||
7649 | SUBL BX, R8 | ||
7650 | LEAL -1(R8), BX | ||
7651 | CMPL BX, $0x3c | ||
7652 | JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7653 | CMPL BX, $0x00000100 | ||
7654 | JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7655 | CMPL BX, $0x00010000 | ||
7656 | JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7657 | MOVL BX, R10 | ||
7658 | SHRL $0x10, R10 | ||
7659 | MOVB $0xf8, (AX) | ||
7660 | MOVW BX, 1(AX) | ||
7661 | MOVB R10, 3(AX) | ||
7662 | ADDQ $0x04, AX | ||
7663 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7664 | |||
7665 | three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: | ||
7666 | MOVB $0xf4, (AX) | ||
7667 | MOVW BX, 1(AX) | ||
7668 | ADDQ $0x03, AX | ||
7669 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7670 | |||
7671 | two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: | ||
7672 | MOVB $0xf0, (AX) | ||
7673 | MOVB BL, 1(AX) | ||
7674 | ADDQ $0x02, AX | ||
7675 | CMPL BX, $0x40 | ||
7676 | JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7677 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7678 | |||
7679 | one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: | ||
7680 | SHLB $0x02, BL | ||
7681 | MOVB BL, (AX) | ||
7682 | ADDQ $0x01, AX | ||
7683 | |||
7684 | memmove_match_emit_repeat_encodeBetterBlockAsm4MB: | ||
7685 | LEAQ (AX)(R8*1), BX | ||
7686 | |||
7687 | // genMemMoveShort | ||
7688 | CMPQ R8, $0x04 | ||
7689 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 | ||
7690 | CMPQ R8, $0x08 | ||
7691 | JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 | ||
7692 | CMPQ R8, $0x10 | ||
7693 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 | ||
7694 | CMPQ R8, $0x20 | ||
7695 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 | ||
7696 | JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 | ||
7697 | |||
7698 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: | ||
7699 | MOVL (R9), R10 | ||
7700 | MOVL R10, (AX) | ||
7701 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7702 | |||
7703 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: | ||
7704 | MOVL (R9), R10 | ||
7705 | MOVL -4(R9)(R8*1), R9 | ||
7706 | MOVL R10, (AX) | ||
7707 | MOVL R9, -4(AX)(R8*1) | ||
7708 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7709 | |||
7710 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: | ||
7711 | MOVQ (R9), R10 | ||
7712 | MOVQ -8(R9)(R8*1), R9 | ||
7713 | MOVQ R10, (AX) | ||
7714 | MOVQ R9, -8(AX)(R8*1) | ||
7715 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7716 | |||
7717 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: | ||
7718 | MOVOU (R9), X0 | ||
7719 | MOVOU -16(R9)(R8*1), X1 | ||
7720 | MOVOU X0, (AX) | ||
7721 | MOVOU X1, -16(AX)(R8*1) | ||
7722 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7723 | |||
7724 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: | ||
7725 | MOVOU (R9), X0 | ||
7726 | MOVOU 16(R9), X1 | ||
7727 | MOVOU -32(R9)(R8*1), X2 | ||
7728 | MOVOU -16(R9)(R8*1), X3 | ||
7729 | MOVOU X0, (AX) | ||
7730 | MOVOU X1, 16(AX) | ||
7731 | MOVOU X2, -32(AX)(R8*1) | ||
7732 | MOVOU X3, -16(AX)(R8*1) | ||
7733 | |||
7734 | memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: | ||
7735 | MOVQ BX, AX | ||
7736 | JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB | ||
7737 | |||
7738 | memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: | ||
7739 | LEAQ (AX)(R8*1), BX | ||
7740 | |||
7741 | // genMemMoveLong | ||
7742 | MOVOU (R9), X0 | ||
7743 | MOVOU 16(R9), X1 | ||
7744 | MOVOU -32(R9)(R8*1), X2 | ||
7745 | MOVOU -16(R9)(R8*1), X3 | ||
7746 | MOVQ R8, R12 | ||
7747 | SHRQ $0x05, R12 | ||
7748 | MOVQ AX, R10 | ||
7749 | ANDL $0x0000001f, R10 | ||
7750 | MOVQ $0x00000040, R13 | ||
7751 | SUBQ R10, R13 | ||
7752 | DECQ R12 | ||
7753 | JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 | ||
7754 | LEAQ -32(R9)(R13*1), R10 | ||
7755 | LEAQ -32(AX)(R13*1), R14 | ||
7756 | |||
7757 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: | ||
7758 | MOVOU (R10), X4 | ||
7759 | MOVOU 16(R10), X5 | ||
7760 | MOVOA X4, (R14) | ||
7761 | MOVOA X5, 16(R14) | ||
7762 | ADDQ $0x20, R14 | ||
7763 | ADDQ $0x20, R10 | ||
7764 | ADDQ $0x20, R13 | ||
7765 | DECQ R12 | ||
7766 | JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back | ||
7767 | |||
7768 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: | ||
7769 | MOVOU -32(R9)(R13*1), X4 | ||
7770 | MOVOU -16(R9)(R13*1), X5 | ||
7771 | MOVOA X4, -32(AX)(R13*1) | ||
7772 | MOVOA X5, -16(AX)(R13*1) | ||
7773 | ADDQ $0x20, R13 | ||
7774 | CMPQ R8, R13 | ||
7775 | JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 | ||
7776 | MOVOU X0, (AX) | ||
7777 | MOVOU X1, 16(AX) | ||
7778 | MOVOU X2, -32(AX)(R8*1) | ||
7779 | MOVOU X3, -16(AX)(R8*1) | ||
7780 | MOVQ BX, AX | ||
7781 | |||
7782 | emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: | ||
7783 | ADDL R11, CX | ||
7784 | ADDL $0x04, R11 | ||
7785 | MOVL CX, 12(SP) | ||
7786 | |||
7787 | // emitRepeat | ||
7788 | MOVL R11, BX | ||
7789 | LEAL -4(R11), R11 | ||
7790 | CMPL BX, $0x08 | ||
7791 | JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB | ||
7792 | CMPL BX, $0x0c | ||
7793 | JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB | ||
7794 | CMPL DI, $0x00000800 | ||
7795 | JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB | ||
7796 | |||
7797 | cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: | ||
7798 | CMPL R11, $0x00000104 | ||
7799 | JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB | ||
7800 | CMPL R11, $0x00010100 | ||
7801 | JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB | ||
7802 | LEAL -65536(R11), R11 | ||
7803 | MOVL R11, DI | ||
7804 | MOVW $0x001d, (AX) | ||
7805 | MOVW R11, 2(AX) | ||
7806 | SARL $0x10, DI | ||
7807 | MOVB DI, 4(AX) | ||
7808 | ADDQ $0x05, AX | ||
7809 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7810 | |||
7811 | repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: | ||
7812 | LEAL -256(R11), R11 | ||
7813 | MOVW $0x0019, (AX) | ||
7814 | MOVW R11, 2(AX) | ||
7815 | ADDQ $0x04, AX | ||
7816 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7817 | |||
7818 | repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: | ||
7819 | LEAL -4(R11), R11 | ||
7820 | MOVW $0x0015, (AX) | ||
7821 | MOVB R11, 2(AX) | ||
7822 | ADDQ $0x03, AX | ||
7823 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7824 | |||
7825 | repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: | ||
7826 | SHLL $0x02, R11 | ||
7827 | ORL $0x01, R11 | ||
7828 | MOVW R11, (AX) | ||
7829 | ADDQ $0x02, AX | ||
7830 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB | ||
7831 | |||
7832 | repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: | ||
7833 | XORQ BX, BX | ||
7834 | LEAL 1(BX)(R11*4), R11 | ||
7835 | MOVB DI, 1(AX) | ||
7836 | SARL $0x08, DI | ||
7837 | SHLL $0x05, DI | ||
7838 | ORL DI, R11 | ||
7839 | MOVB R11, (AX) | ||
7840 | ADDQ $0x02, AX | ||
7841 | |||
7842 | match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: | ||
7843 | CMPL CX, 8(SP) | ||
7844 | JAE emit_remainder_encodeBetterBlockAsm4MB | ||
7845 | CMPQ AX, (SP) | ||
7846 | JB match_nolit_dst_ok_encodeBetterBlockAsm4MB | ||
7847 | MOVQ $0x00000000, ret+48(FP) | ||
7848 | RET | ||
7849 | |||
7850 | match_nolit_dst_ok_encodeBetterBlockAsm4MB: | ||
7851 | MOVQ $0x00cf1bbcdcbfa563, BX | ||
7852 | MOVQ $0x9e3779b1, DI | ||
7853 | LEAQ 1(SI), SI | ||
7854 | LEAQ -2(CX), R8 | ||
7855 | MOVQ (DX)(SI*1), R9 | ||
7856 | MOVQ 1(DX)(SI*1), R10 | ||
7857 | MOVQ (DX)(R8*1), R11 | ||
7858 | MOVQ 1(DX)(R8*1), R12 | ||
7859 | SHLQ $0x08, R9 | ||
7860 | IMULQ BX, R9 | ||
7861 | SHRQ $0x2f, R9 | ||
7862 | SHLQ $0x20, R10 | ||
7863 | IMULQ DI, R10 | ||
7864 | SHRQ $0x32, R10 | ||
7865 | SHLQ $0x08, R11 | ||
7866 | IMULQ BX, R11 | ||
7867 | SHRQ $0x2f, R11 | ||
7868 | SHLQ $0x20, R12 | ||
7869 | IMULQ DI, R12 | ||
7870 | SHRQ $0x32, R12 | ||
7871 | LEAQ 1(SI), DI | ||
7872 | LEAQ 1(R8), R13 | ||
7873 | MOVL SI, 24(SP)(R9*4) | ||
7874 | MOVL R8, 24(SP)(R11*4) | ||
7875 | MOVL DI, 524312(SP)(R10*4) | ||
7876 | MOVL R13, 524312(SP)(R12*4) | ||
7877 | LEAQ 1(R8)(SI*1), DI | ||
7878 | SHRQ $0x01, DI | ||
7879 | ADDQ $0x01, SI | ||
7880 | SUBQ $0x01, R8 | ||
7881 | |||
7882 | index_loop_encodeBetterBlockAsm4MB: | ||
7883 | CMPQ DI, R8 | ||
7884 | JAE search_loop_encodeBetterBlockAsm4MB | ||
7885 | MOVQ (DX)(SI*1), R9 | ||
7886 | MOVQ (DX)(DI*1), R10 | ||
7887 | SHLQ $0x08, R9 | ||
7888 | IMULQ BX, R9 | ||
7889 | SHRQ $0x2f, R9 | ||
7890 | SHLQ $0x08, R10 | ||
7891 | IMULQ BX, R10 | ||
7892 | SHRQ $0x2f, R10 | ||
7893 | MOVL SI, 24(SP)(R9*4) | ||
7894 | MOVL DI, 24(SP)(R10*4) | ||
7895 | ADDQ $0x02, SI | ||
7896 | ADDQ $0x02, DI | ||
7897 | JMP index_loop_encodeBetterBlockAsm4MB | ||
7898 | |||
7899 | emit_remainder_encodeBetterBlockAsm4MB: | ||
7900 | MOVQ src_len+32(FP), CX | ||
7901 | SUBL 12(SP), CX | ||
7902 | LEAQ 4(AX)(CX*1), CX | ||
7903 | CMPQ CX, (SP) | ||
7904 | JB emit_remainder_ok_encodeBetterBlockAsm4MB | ||
7905 | MOVQ $0x00000000, ret+48(FP) | ||
7906 | RET | ||
7907 | |||
7908 | emit_remainder_ok_encodeBetterBlockAsm4MB: | ||
7909 | MOVQ src_len+32(FP), CX | ||
7910 | MOVL 12(SP), BX | ||
7911 | CMPL BX, CX | ||
7912 | JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB | ||
7913 | MOVL CX, SI | ||
7914 | MOVL CX, 12(SP) | ||
7915 | LEAQ (DX)(BX*1), CX | ||
7916 | SUBL BX, SI | ||
7917 | LEAL -1(SI), DX | ||
7918 | CMPL DX, $0x3c | ||
7919 | JB one_byte_emit_remainder_encodeBetterBlockAsm4MB | ||
7920 | CMPL DX, $0x00000100 | ||
7921 | JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB | ||
7922 | CMPL DX, $0x00010000 | ||
7923 | JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB | ||
7924 | MOVL DX, BX | ||
7925 | SHRL $0x10, BX | ||
7926 | MOVB $0xf8, (AX) | ||
7927 | MOVW DX, 1(AX) | ||
7928 | MOVB BL, 3(AX) | ||
7929 | ADDQ $0x04, AX | ||
7930 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB | ||
7931 | |||
7932 | three_bytes_emit_remainder_encodeBetterBlockAsm4MB: | ||
7933 | MOVB $0xf4, (AX) | ||
7934 | MOVW DX, 1(AX) | ||
7935 | ADDQ $0x03, AX | ||
7936 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB | ||
7937 | |||
7938 | two_bytes_emit_remainder_encodeBetterBlockAsm4MB: | ||
7939 | MOVB $0xf0, (AX) | ||
7940 | MOVB DL, 1(AX) | ||
7941 | ADDQ $0x02, AX | ||
7942 | CMPL DX, $0x40 | ||
7943 | JB memmove_emit_remainder_encodeBetterBlockAsm4MB | ||
7944 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB | ||
7945 | |||
7946 | one_byte_emit_remainder_encodeBetterBlockAsm4MB: | ||
7947 | SHLB $0x02, DL | ||
7948 | MOVB DL, (AX) | ||
7949 | ADDQ $0x01, AX | ||
7950 | |||
7951 | memmove_emit_remainder_encodeBetterBlockAsm4MB: | ||
7952 | LEAQ (AX)(SI*1), DX | ||
7953 | MOVL SI, BX | ||
7954 | |||
7955 | // genMemMoveShort | ||
7956 | CMPQ BX, $0x03 | ||
7957 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 | ||
7958 | JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 | ||
7959 | CMPQ BX, $0x08 | ||
7960 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 | ||
7961 | CMPQ BX, $0x10 | ||
7962 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 | ||
7963 | CMPQ BX, $0x20 | ||
7964 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 | ||
7965 | JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 | ||
7966 | |||
7967 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: | ||
7968 | MOVB (CX), SI | ||
7969 | MOVB -1(CX)(BX*1), CL | ||
7970 | MOVB SI, (AX) | ||
7971 | MOVB CL, -1(AX)(BX*1) | ||
7972 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB | ||
7973 | |||
7974 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: | ||
7975 | MOVW (CX), SI | ||
7976 | MOVB 2(CX), CL | ||
7977 | MOVW SI, (AX) | ||
7978 | MOVB CL, 2(AX) | ||
7979 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB | ||
7980 | |||
7981 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: | ||
7982 | MOVL (CX), SI | ||
7983 | MOVL -4(CX)(BX*1), CX | ||
7984 | MOVL SI, (AX) | ||
7985 | MOVL CX, -4(AX)(BX*1) | ||
7986 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB | ||
7987 | |||
7988 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: | ||
7989 | MOVQ (CX), SI | ||
7990 | MOVQ -8(CX)(BX*1), CX | ||
7991 | MOVQ SI, (AX) | ||
7992 | MOVQ CX, -8(AX)(BX*1) | ||
7993 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB | ||
7994 | |||
7995 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: | ||
7996 | MOVOU (CX), X0 | ||
7997 | MOVOU -16(CX)(BX*1), X1 | ||
7998 | MOVOU X0, (AX) | ||
7999 | MOVOU X1, -16(AX)(BX*1) | ||
8000 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB | ||
8001 | |||
8002 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: | ||
8003 | MOVOU (CX), X0 | ||
8004 | MOVOU 16(CX), X1 | ||
8005 | MOVOU -32(CX)(BX*1), X2 | ||
8006 | MOVOU -16(CX)(BX*1), X3 | ||
8007 | MOVOU X0, (AX) | ||
8008 | MOVOU X1, 16(AX) | ||
8009 | MOVOU X2, -32(AX)(BX*1) | ||
8010 | MOVOU X3, -16(AX)(BX*1) | ||
8011 | |||
8012 | memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: | ||
8013 | MOVQ DX, AX | ||
8014 | JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB | ||
8015 | |||
8016 | memmove_long_emit_remainder_encodeBetterBlockAsm4MB: | ||
8017 | LEAQ (AX)(SI*1), DX | ||
8018 | MOVL SI, BX | ||
8019 | |||
8020 | // genMemMoveLong | ||
8021 | MOVOU (CX), X0 | ||
8022 | MOVOU 16(CX), X1 | ||
8023 | MOVOU -32(CX)(BX*1), X2 | ||
8024 | MOVOU -16(CX)(BX*1), X3 | ||
8025 | MOVQ BX, DI | ||
8026 | SHRQ $0x05, DI | ||
8027 | MOVQ AX, SI | ||
8028 | ANDL $0x0000001f, SI | ||
8029 | MOVQ $0x00000040, R8 | ||
8030 | SUBQ SI, R8 | ||
8031 | DECQ DI | ||
8032 | JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 | ||
8033 | LEAQ -32(CX)(R8*1), SI | ||
8034 | LEAQ -32(AX)(R8*1), R9 | ||
8035 | |||
8036 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: | ||
8037 | MOVOU (SI), X4 | ||
8038 | MOVOU 16(SI), X5 | ||
8039 | MOVOA X4, (R9) | ||
8040 | MOVOA X5, 16(R9) | ||
8041 | ADDQ $0x20, R9 | ||
8042 | ADDQ $0x20, SI | ||
8043 | ADDQ $0x20, R8 | ||
8044 | DECQ DI | ||
8045 | JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back | ||
8046 | |||
8047 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: | ||
8048 | MOVOU -32(CX)(R8*1), X4 | ||
8049 | MOVOU -16(CX)(R8*1), X5 | ||
8050 | MOVOA X4, -32(AX)(R8*1) | ||
8051 | MOVOA X5, -16(AX)(R8*1) | ||
8052 | ADDQ $0x20, R8 | ||
8053 | CMPQ BX, R8 | ||
8054 | JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 | ||
8055 | MOVOU X0, (AX) | ||
8056 | MOVOU X1, 16(AX) | ||
8057 | MOVOU X2, -32(AX)(BX*1) | ||
8058 | MOVOU X3, -16(AX)(BX*1) | ||
8059 | MOVQ DX, AX | ||
8060 | |||
8061 | emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: | ||
8062 | MOVQ dst_base+0(FP), CX | ||
8063 | SUBQ CX, AX | ||
8064 | MOVQ AX, ret+48(FP) | ||
8065 | RET | ||
8066 | |||
8067 | // func encodeBetterBlockAsm12B(dst []byte, src []byte) int | ||
8068 | // Requires: BMI, SSE2 | ||
// Prologue. AX is loaded with dst_base and serves as the output write pointer
// for the rest of the function. The $81944-byte frame is laid out as:
//   (SP)      output limit pointer (reaching it makes the function return 0)
//   8(SP)     src_len - 8
//   12(SP)    index of the first source byte not yet written out as a literal
//   16(SP)    offset of the previous match
//   20(SP)    next search position
//   24(SP)    hash table indexed by a 14-bit hash, 4-byte entries (64 KiB)
//   65560(SP) hash table indexed by a 12-bit hash, 4-byte entries (16 KiB)
8069 | TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 | ||
8070 | MOVQ dst_base+0(FP), AX | ||
// 0x280 iterations x 0x80 bytes = 81920 bytes: clears both tables from 24(SP).
8071 | MOVQ $0x00000280, CX | ||
8072 | LEAQ 24(SP), DX | ||
8073 | PXOR X0, X0 | ||
8074 | |||
8075 | zero_loop_encodeBetterBlockAsm12B: | ||
8076 | MOVOU X0, (DX) | ||
8077 | MOVOU X0, 16(DX) | ||
8078 | MOVOU X0, 32(DX) | ||
8079 | MOVOU X0, 48(DX) | ||
8080 | MOVOU X0, 64(DX) | ||
8081 | MOVOU X0, 80(DX) | ||
8082 | MOVOU X0, 96(DX) | ||
8083 | MOVOU X0, 112(DX) | ||
8084 | ADDQ $0x80, DX | ||
8085 | DECQ CX | ||
8086 | JNZ zero_loop_encodeBetterBlockAsm12B | ||
// No literal bytes have been emitted yet.
8087 | MOVL $0x00000000, 12(SP) | ||
// 8(SP) = src_len - 8; (SP) = dst_base + (src_len - 6 - src_len/32).
8088 | MOVQ src_len+32(FP), CX | ||
8089 | LEAQ -6(CX), DX | ||
8090 | LEAQ -8(CX), BX | ||
8091 | MOVL BX, 8(SP) | ||
8092 | SHRQ $0x05, CX | ||
8093 | SUBL CX, DX | ||
8094 | LEAQ (AX)(DX*1), DX | ||
8095 | MOVQ DX, (SP) | ||
// CX = current source index (starts at 1), previous offset = 0, DX = src base.
8096 | MOVL $0x00000001, CX | ||
8097 | MOVL $0x00000000, 16(SP) | ||
8098 | MOVQ src_base+24(FP), DX | ||
8099 | |||
// Search loop: hash the bytes at source index CX, look up one candidate in
// each table, record CX in both tables, and test the candidates.
// Falls through / jumps to candidate_match with BX = candidate index and
// CX = match position; otherwise advances CX and repeats.
8100 | search_loop_encodeBetterBlockAsm12B: | ||
// Next position = CX + 1 + ((CX - 12(SP)) >> 6): the step grows with the
// distance from the last emitted byte. Stop searching once it reaches 8(SP).
8101 | MOVL CX, BX | ||
8102 | SUBL 12(SP), BX | ||
8103 | SHRL $0x06, BX | ||
8104 | LEAL 1(CX)(BX*1), BX | ||
8105 | CMPL BX, 8(SP) | ||
8106 | JAE emit_remainder_encodeBetterBlockAsm12B | ||
// SI = 8 source bytes at CX; remember the next position in 20(SP).
8107 | MOVQ (DX)(CX*1), SI | ||
8108 | MOVL BX, 20(SP) | ||
// R9  = 14-bit hash of the low 6 bytes of SI (shift out the top 2 bytes,
//       multiply, keep the top 14 bits).
// R10 = 12-bit hash of the low 4 bytes of SI.
8109 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
8110 | MOVQ $0x9e3779b1, BX | ||
8111 | MOVQ SI, R9 | ||
8112 | MOVQ SI, R10 | ||
8113 | SHLQ $0x10, R9 | ||
8114 | IMULQ R8, R9 | ||
8115 | SHRQ $0x32, R9 | ||
8116 | SHLQ $0x20, R10 | ||
8117 | IMULQ BX, R10 | ||
8118 | SHRQ $0x34, R10 | ||
// BX = candidate from the 6-byte-hash table, DI = candidate from the
// 4-byte-hash table; then store the current position in both slots.
8119 | MOVL 24(SP)(R9*4), BX | ||
8120 | MOVL 65560(SP)(R10*4), DI | ||
8121 | MOVL CX, 24(SP)(R9*4) | ||
8122 | MOVL CX, 65560(SP)(R10*4) | ||
// Compare all 8 bytes first: BX candidate, then DI candidate.
8123 | MOVQ (DX)(BX*1), R9 | ||
8124 | MOVQ (DX)(DI*1), R10 | ||
8125 | CMPQ R9, SI | ||
8126 | JEQ candidate_match_encodeBetterBlockAsm12B | ||
8127 | CMPQ R10, SI | ||
8128 | JNE no_short_found_encodeBetterBlockAsm12B | ||
8129 | MOVL DI, BX | ||
8130 | JMP candidate_match_encodeBetterBlockAsm12B | ||
8131 | |||
// No 8-byte match: fall back to comparing only the low 4 bytes.
8132 | no_short_found_encodeBetterBlockAsm12B: | ||
8133 | CMPL R9, SI | ||
8134 | JEQ candidate_match_encodeBetterBlockAsm12B | ||
8135 | CMPL R10, SI | ||
8136 | JEQ candidateS_match_encodeBetterBlockAsm12B | ||
// Neither candidate matches: continue at the position saved in 20(SP).
8137 | MOVL 20(SP), CX | ||
8138 | JMP search_loop_encodeBetterBlockAsm12B | ||
8139 | |||
// Only the DI candidate matched 4 bytes. Before using it, probe the
// 6-byte-hash table for position CX+1 (SI shifted down one byte); if that
// candidate matches 4 bytes, use it with CX+1, else restore CX and use DI.
8140 | candidateS_match_encodeBetterBlockAsm12B: | ||
8141 | SHRQ $0x08, SI | ||
8142 | MOVQ SI, R9 | ||
8143 | SHLQ $0x10, R9 | ||
8144 | IMULQ R8, R9 | ||
8145 | SHRQ $0x32, R9 | ||
8146 | MOVL 24(SP)(R9*4), BX | ||
8147 | INCL CX | ||
8148 | MOVL CX, 24(SP)(R9*4) | ||
8149 | CMPL (DX)(BX*1), SI | ||
8150 | JEQ candidate_match_encodeBetterBlockAsm12B | ||
8151 | DECL CX | ||
8152 | MOVL DI, BX | ||
8153 | |||
// A candidate (BX) matches the bytes at CX. Extend the match backwards one
// byte at a time while the preceding bytes are equal, without moving CX below
// 12(SP) (first unemitted byte) or BX below 0.
8154 | candidate_match_encodeBetterBlockAsm12B: | ||
8155 | MOVL 12(SP), SI | ||
8156 | TESTL BX, BX | ||
8157 | JZ match_extend_back_end_encodeBetterBlockAsm12B | ||
8158 | |||
8159 | match_extend_back_loop_encodeBetterBlockAsm12B: | ||
8160 | CMPL CX, SI | ||
8161 | JBE match_extend_back_end_encodeBetterBlockAsm12B | ||
8162 | MOVB -1(DX)(BX*1), DI | ||
8163 | MOVB -1(DX)(CX*1), R8 | ||
8164 | CMPB DI, R8 | ||
8165 | JNE match_extend_back_end_encodeBetterBlockAsm12B | ||
8166 | LEAL -1(CX), CX | ||
8167 | DECL BX | ||
8168 | JZ match_extend_back_end_encodeBetterBlockAsm12B | ||
8169 | JMP match_extend_back_loop_encodeBetterBlockAsm12B | ||
8170 | |||
// Output-space check: AX + (pending literal length) + 3 must stay below the
// limit pointer in (SP); otherwise give up and return 0.
8171 | match_extend_back_end_encodeBetterBlockAsm12B: | ||
8172 | MOVL CX, SI | ||
8173 | SUBL 12(SP), SI | ||
8174 | LEAQ 3(AX)(SI*1), SI | ||
8175 | CMPQ SI, (SP) | ||
8176 | JB match_dst_size_check_encodeBetterBlockAsm12B | ||
8177 | MOVQ $0x00000000, ret+48(FP) | ||
8178 | RET | ||
8179 | |||
// Forward match extension. SI keeps the match start; CX and BX are advanced
// by 4 (the bytes already known equal). On exit R11 = number of additional
// equal bytes found at src[CX:] vs src[BX:], bounded by DI = src_len - CX.
8180 | match_dst_size_check_encodeBetterBlockAsm12B: | ||
8181 | MOVL CX, SI | ||
8182 | ADDL $0x04, CX | ||
8183 | ADDL $0x04, BX | ||
8184 | MOVQ src_len+32(FP), DI | ||
8185 | SUBL CX, DI | ||
8186 | LEAQ (DX)(CX*1), R8 | ||
8187 | LEAQ (DX)(BX*1), R9 | ||
8188 | |||
8189 | // matchLen | ||
8190 | XORL R11, R11 | ||
8191 | |||
// Compare 16 bytes per iteration as two 8-byte XORs; a non-zero XOR result
// marks the first differing byte.
8192 | matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B: | ||
8193 | CMPL DI, $0x10 | ||
8194 | JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B | ||
8195 | MOVQ (R8)(R11*1), R10 | ||
8196 | MOVQ 8(R8)(R11*1), R12 | ||
8197 | XORQ (R9)(R11*1), R10 | ||
8198 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B | ||
8199 | XORQ 8(R9)(R11*1), R12 | ||
8200 | JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B | ||
8201 | LEAL -16(DI), DI | ||
8202 | LEAL 16(R11), R11 | ||
8203 | JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B | ||
8204 | |||
// Mismatch in the second 8 bytes: index of lowest set bit / 8 = number of
// equal bytes within that word; add 8 for the first word.
8205 | matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B: | ||
8206 | #ifdef GOAMD64_v3 | ||
8207 | TZCNTQ R12, R12 | ||
8208 | |||
8209 | #else | ||
8210 | BSFQ R12, R12 | ||
8211 | |||
8212 | #endif | ||
8213 | SARQ $0x03, R12 | ||
8214 | LEAL 8(R11)(R12*1), R11 | ||
8215 | JMP match_nolit_end_encodeBetterBlockAsm12B | ||
8216 | |||
// Fewer than 16 bytes remain: try one 8-byte compare.
8217 | matchlen_match8_match_nolit_encodeBetterBlockAsm12B: | ||
8218 | CMPL DI, $0x08 | ||
8219 | JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B | ||
8220 | MOVQ (R8)(R11*1), R10 | ||
8221 | XORQ (R9)(R11*1), R10 | ||
8222 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B | ||
8223 | LEAL -8(DI), DI | ||
8224 | LEAL 8(R11), R11 | ||
8225 | JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B | ||
8226 | |||
// Mismatch within an 8-byte word: lowest set bit / 8 = equal bytes.
8227 | matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B: | ||
8228 | #ifdef GOAMD64_v3 | ||
8229 | TZCNTQ R10, R10 | ||
8230 | |||
8231 | #else | ||
8232 | BSFQ R10, R10 | ||
8233 | |||
8234 | #endif | ||
8235 | SARQ $0x03, R10 | ||
8236 | LEAL (R11)(R10*1), R11 | ||
8237 | JMP match_nolit_end_encodeBetterBlockAsm12B | ||
8238 | |||
// Tail handling: 4-byte, then 2-byte, then 1-byte compares.
8239 | matchlen_match4_match_nolit_encodeBetterBlockAsm12B: | ||
8240 | CMPL DI, $0x04 | ||
8241 | JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B | ||
8242 | MOVL (R8)(R11*1), R10 | ||
8243 | CMPL (R9)(R11*1), R10 | ||
8244 | JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B | ||
8245 | LEAL -4(DI), DI | ||
8246 | LEAL 4(R11), R11 | ||
8247 | |||
8248 | matchlen_match2_match_nolit_encodeBetterBlockAsm12B: | ||
// Exactly one byte left -> byte compare; none left -> done.
8249 | CMPL DI, $0x01 | ||
8250 | JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B | ||
8251 | JB match_nolit_end_encodeBetterBlockAsm12B | ||
8252 | MOVW (R8)(R11*1), R10 | ||
8253 | CMPW (R9)(R11*1), R10 | ||
8254 | JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B | ||
8255 | LEAL 2(R11), R11 | ||
8256 | SUBL $0x02, DI | ||
8257 | JZ match_nolit_end_encodeBetterBlockAsm12B | ||
8258 | |||
8259 | matchlen_match1_match_nolit_encodeBetterBlockAsm12B: | ||
8260 | MOVB (R8)(R11*1), R10 | ||
8261 | CMPB (R9)(R11*1), R10 | ||
8262 | JNE match_nolit_end_encodeBetterBlockAsm12B | ||
8263 | LEAL 1(R11), R11 | ||
8264 | |||
// Match length is known (R11 extra bytes). DI = CX - BX is the match offset.
// If it equals the previous offset in 16(SP) take the repeat path; otherwise
// record it and write the pending literal bytes src[12(SP):SI] to the output.
8265 | match_nolit_end_encodeBetterBlockAsm12B: | ||
8266 | MOVL CX, DI | ||
8267 | SUBL BX, DI | ||
8268 | |||
8269 | // Check if repeat | ||
8270 | CMPL 16(SP), DI | ||
8271 | JEQ match_is_repeat_encodeBetterBlockAsm12B | ||
8272 | MOVL DI, 16(SP) | ||
// Nothing to emit when the match starts exactly at the first pending byte.
8273 | MOVL 12(SP), BX | ||
8274 | CMPL BX, SI | ||
8275 | JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B | ||
// R8 = literal length, R9 = source pointer of the literals, BX = length - 1.
8276 | MOVL SI, R8 | ||
8277 | MOVL SI, 12(SP) | ||
8278 | LEAQ (DX)(BX*1), R9 | ||
8279 | SUBL BX, R8 | ||
8280 | LEAL -1(R8), BX | ||
// Header size depends on length-1: < 60 -> 1 byte, < 256 -> 2 bytes,
// otherwise 3 bytes.
8281 | CMPL BX, $0x3c | ||
8282 | JB one_byte_match_emit_encodeBetterBlockAsm12B | ||
8283 | CMPL BX, $0x00000100 | ||
8284 | JB two_bytes_match_emit_encodeBetterBlockAsm12B | ||
// This second JB tests the same flags and targets the next label, so control
// reaches three_bytes_* either way.
8285 | JB three_bytes_match_emit_encodeBetterBlockAsm12B | ||
8286 | |||
// 3-byte header: 0xf4 followed by a 16-bit length-1.
8287 | three_bytes_match_emit_encodeBetterBlockAsm12B: | ||
8288 | MOVB $0xf4, (AX) | ||
8289 | MOVW BX, 1(AX) | ||
8290 | ADDQ $0x03, AX | ||
8291 | JMP memmove_long_match_emit_encodeBetterBlockAsm12B | ||
8292 | |||
// 2-byte header: 0xf0 followed by an 8-bit length-1. Lengths up to 64 use
// the short copy, longer ones the long copy.
8293 | two_bytes_match_emit_encodeBetterBlockAsm12B: | ||
8294 | MOVB $0xf0, (AX) | ||
8295 | MOVB BL, 1(AX) | ||
8296 | ADDQ $0x02, AX | ||
8297 | CMPL BX, $0x40 | ||
8298 | JB memmove_match_emit_encodeBetterBlockAsm12B | ||
8299 | JMP memmove_long_match_emit_encodeBetterBlockAsm12B | ||
8300 | |||
// 1-byte header: (length-1) << 2.
8301 | one_byte_match_emit_encodeBetterBlockAsm12B: | ||
8302 | SHLB $0x02, BL | ||
8303 | MOVB BL, (AX) | ||
8304 | ADDQ $0x01, AX | ||
8305 | |||
// Short copy of R8 (<= 64) bytes from R9 to AX. BX = exact end of the copy;
// each size class uses overlapping head/tail moves.
8306 | memmove_match_emit_encodeBetterBlockAsm12B: | ||
8307 | LEAQ (AX)(R8*1), BX | ||
8308 | |||
8309 | // genMemMoveShort | ||
8310 | CMPQ R8, $0x04 | ||
8311 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 | ||
8312 | CMPQ R8, $0x08 | ||
8313 | JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 | ||
8314 | CMPQ R8, $0x10 | ||
8315 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 | ||
8316 | CMPQ R8, $0x20 | ||
8317 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 | ||
8318 | JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 | ||
8319 | |||
// Lengths <= 4: always moves a full 4 bytes; AX is set to the exact end (BX)
// afterwards.
8320 | emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: | ||
8321 | MOVL (R9), R10 | ||
8322 | MOVL R10, (AX) | ||
8323 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B | ||
8324 | |||
8325 | emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: | ||
8326 | MOVL (R9), R10 | ||
8327 | MOVL -4(R9)(R8*1), R9 | ||
8328 | MOVL R10, (AX) | ||
8329 | MOVL R9, -4(AX)(R8*1) | ||
8330 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B | ||
8331 | |||
8332 | emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: | ||
8333 | MOVQ (R9), R10 | ||
8334 | MOVQ -8(R9)(R8*1), R9 | ||
8335 | MOVQ R10, (AX) | ||
8336 | MOVQ R9, -8(AX)(R8*1) | ||
8337 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B | ||
8338 | |||
8339 | emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: | ||
8340 | MOVOU (R9), X0 | ||
8341 | MOVOU -16(R9)(R8*1), X1 | ||
8342 | MOVOU X0, (AX) | ||
8343 | MOVOU X1, -16(AX)(R8*1) | ||
8344 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B | ||
8345 | |||
8346 | emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: | ||
8347 | MOVOU (R9), X0 | ||
8348 | MOVOU 16(R9), X1 | ||
8349 | MOVOU -32(R9)(R8*1), X2 | ||
8350 | MOVOU -16(R9)(R8*1), X3 | ||
8351 | MOVOU X0, (AX) | ||
8352 | MOVOU X1, 16(AX) | ||
8353 | MOVOU X2, -32(AX)(R8*1) | ||
8354 | MOVOU X3, -16(AX)(R8*1) | ||
8355 | |||
8356 | memmove_end_copy_match_emit_encodeBetterBlockAsm12B: | ||
8357 | MOVQ BX, AX | ||
8358 | JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B | ||
8359 | |||
// Long copy of R8 bytes from R9 to AX. The first and last 32 bytes are
// loaded up front (X0-X3) and stored last with unaligned moves; the middle is
// copied 32 bytes at a time with stores at AX rounded to a 32-byte boundary
// (R13 = 64 - (AX & 31) is the running offset).
8360 | memmove_long_match_emit_encodeBetterBlockAsm12B: | ||
8361 | LEAQ (AX)(R8*1), BX | ||
8362 | |||
8363 | // genMemMoveLong | ||
8364 | MOVOU (R9), X0 | ||
8365 | MOVOU 16(R9), X1 | ||
8366 | MOVOU -32(R9)(R8*1), X2 | ||
8367 | MOVOU -16(R9)(R8*1), X3 | ||
8368 | MOVQ R8, R12 | ||
8369 | SHRQ $0x05, R12 | ||
8370 | MOVQ AX, R10 | ||
8371 | ANDL $0x0000001f, R10 | ||
8372 | MOVQ $0x00000040, R13 | ||
8373 | SUBQ R10, R13 | ||
8374 | DECQ R12 | ||
8375 | JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
8376 | LEAQ -32(R9)(R13*1), R10 | ||
8377 | LEAQ -32(AX)(R13*1), R14 | ||
8378 | |||
8379 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: | ||
8380 | MOVOU (R10), X4 | ||
8381 | MOVOU 16(R10), X5 | ||
8382 | MOVOA X4, (R14) | ||
8383 | MOVOA X5, 16(R14) | ||
8384 | ADDQ $0x20, R14 | ||
8385 | ADDQ $0x20, R10 | ||
8386 | ADDQ $0x20, R13 | ||
8387 | DECQ R12 | ||
8388 | JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back | ||
8389 | |||
// Main loop: continue while the length R8 is >= the running offset R13.
8390 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: | ||
8391 | MOVOU -32(R9)(R13*1), X4 | ||
8392 | MOVOU -16(R9)(R13*1), X5 | ||
8393 | MOVOA X4, -32(AX)(R13*1) | ||
8394 | MOVOA X5, -16(AX)(R13*1) | ||
8395 | ADDQ $0x20, R13 | ||
8396 | CMPQ R8, R13 | ||
8397 | JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
// Finish with the saved head and tail, then advance AX to the end of the copy.
8398 | MOVOU X0, (AX) | ||
8399 | MOVOU X1, 16(AX) | ||
8400 | MOVOU X2, -32(AX)(R8*1) | ||
8401 | MOVOU X3, -16(AX)(R8*1) | ||
8402 | MOVQ BX, AX | ||
8403 | |||
// Literals are written. Advance CX past the match, make R11 the full match
// length (extra bytes + the initial 4), and mark everything up to CX as
// emitted. Then encode the copy with offset DI and length R11.
8404 | emit_literal_done_match_emit_encodeBetterBlockAsm12B: | ||
8405 | ADDL R11, CX | ||
8406 | ADDL $0x04, R11 | ||
8407 | MOVL CX, 12(SP) | ||
8408 | |||
8409 | // emitCopy | ||
// Length <= 64 is encoded as a single copy element.
8410 | CMPL R11, $0x40 | ||
8411 | JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B | ||
8412 | CMPL DI, $0x00000800 | ||
8413 | JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B | ||
// Length > 64 and offset < 2048: write a 2-byte copy element with the length
// field set for 8 bytes (tag 1 + 16, offset bits 8..10 in tag bits 5..7, low
// offset byte second), then encode the remaining length as a repeat.
8414 | MOVL $0x00000001, BX | ||
8415 | LEAL 16(BX), BX | ||
8416 | MOVB DI, 1(AX) | ||
8417 | SHRL $0x08, DI | ||
8418 | SHLL $0x05, DI | ||
8419 | ORL DI, BX | ||
8420 | MOVB BL, (AX) | ||
8421 | ADDQ $0x02, AX | ||
8422 | SUBL $0x08, R11 | ||
8423 | |||
8424 | // emitRepeat | ||
8425 | LEAL -4(R11), R11 | ||
8426 | JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b | ||
// The instructions from here to the next label follow an unconditional JMP
// and carry no label of their own, so they are not reached.
8427 | MOVL R11, BX | ||
8428 | LEAL -4(R11), R11 | ||
8429 | CMPL BX, $0x08 | ||
8430 | JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b | ||
8431 | CMPL BX, $0x0c | ||
8432 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b | ||
8433 | CMPL DI, $0x00000800 | ||
8434 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b | ||
8435 | |||
// R11 = length - 4 here. Below 260 use the 3-byte repeat form, otherwise the
// 4-byte form: bytes 0x19 0x00 followed by a 16-bit (R11 - 256).
8436 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: | ||
8437 | CMPL R11, $0x00000104 | ||
8438 | JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b | ||
8439 | LEAL -256(R11), R11 | ||
8440 | MOVW $0x0019, (AX) | ||
8441 | MOVW R11, 2(AX) | ||
8442 | ADDQ $0x04, AX | ||
8443 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8444 | |||
// 3-byte repeat: bytes 0x15 0x00 followed by an 8-bit (R11 - 4).
8445 | repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: | ||
8446 | LEAL -4(R11), R11 | ||
8447 | MOVW $0x0015, (AX) | ||
8448 | MOVB R11, 2(AX) | ||
8449 | ADDQ $0x03, AX | ||
8450 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8451 | |||
// 2-byte repeat: 16-bit word (R11 << 2) | 1.
8452 | repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: | ||
8453 | SHLL $0x02, R11 | ||
8454 | ORL $0x01, R11 | ||
8455 | MOVW R11, (AX) | ||
8456 | ADDQ $0x02, AX | ||
8457 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8458 | |||
// 2-byte repeat carrying the offset: tag = 1 | R11<<2 | (offset>>8)<<5,
// second byte = low 8 bits of the offset.
8459 | repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: | ||
8460 | XORQ BX, BX | ||
8461 | LEAL 1(BX)(R11*4), R11 | ||
8462 | MOVB DI, 1(AX) | ||
8463 | SARL $0x08, DI | ||
8464 | SHLL $0x05, DI | ||
8465 | ORL DI, R11 | ||
8466 | MOVB R11, (AX) | ||
8467 | ADDQ $0x02, AX | ||
8468 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8469 | |||
// Length > 64 and offset >= 2048: write a 3-byte copy element (tag 0xee,
// 16-bit offset) that accounts for 60 bytes, then encode the rest as a repeat.
8470 | long_offset_short_match_nolit_encodeBetterBlockAsm12B: | ||
8471 | MOVB $0xee, (AX) | ||
8472 | MOVW DI, 1(AX) | ||
8473 | LEAL -60(R11), R11 | ||
8474 | ADDQ $0x03, AX | ||
8475 | |||
8476 | // emitRepeat | ||
// BX = remaining length, R11 = remaining length - 4. Pick the shortest
// repeat form: length <= 8 -> 2 bytes; length < 12 with offset < 2048 ->
// 2 bytes with offset; otherwise the 3- or 4-byte form.
8477 | MOVL R11, BX | ||
8478 | LEAL -4(R11), R11 | ||
8479 | CMPL BX, $0x08 | ||
8480 | JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short | ||
8481 | CMPL BX, $0x0c | ||
8482 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short | ||
8483 | CMPL DI, $0x00000800 | ||
8484 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short | ||
8485 | |||
8486 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: | ||
8487 | CMPL R11, $0x00000104 | ||
8488 | JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short | ||
8489 | LEAL -256(R11), R11 | ||
8490 | MOVW $0x0019, (AX) | ||
8491 | MOVW R11, 2(AX) | ||
8492 | ADDQ $0x04, AX | ||
8493 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8494 | |||
8495 | repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: | ||
8496 | LEAL -4(R11), R11 | ||
8497 | MOVW $0x0015, (AX) | ||
8498 | MOVB R11, 2(AX) | ||
8499 | ADDQ $0x03, AX | ||
8500 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8501 | |||
8502 | repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: | ||
8503 | SHLL $0x02, R11 | ||
8504 | ORL $0x01, R11 | ||
8505 | MOVW R11, (AX) | ||
8506 | ADDQ $0x02, AX | ||
8507 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8508 | |||
8509 | repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: | ||
8510 | XORQ BX, BX | ||
8511 | LEAL 1(BX)(R11*4), R11 | ||
8512 | MOVB DI, 1(AX) | ||
8513 | SARL $0x08, DI | ||
8514 | SHLL $0x05, DI | ||
8515 | ORL DI, R11 | ||
8516 | MOVB R11, (AX) | ||
8517 | ADDQ $0x02, AX | ||
8518 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8519 | |||
// Length <= 64: one copy element. With length < 12 and offset < 2048 it fits
// in 2 bytes: tag = (length<<2) - 15, i.e. ((length-4)<<2) | 1, with offset
// bits 8..10 in tag bits 5..7 and the low offset byte second.
8520 | two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: | ||
8521 | MOVL R11, BX | ||
8522 | SHLL $0x02, BX | ||
8523 | CMPL R11, $0x0c | ||
8524 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B | ||
8525 | CMPL DI, $0x00000800 | ||
8526 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B | ||
8527 | LEAL -15(BX), BX | ||
8528 | MOVB DI, 1(AX) | ||
8529 | SHRL $0x08, DI | ||
8530 | SHLL $0x05, DI | ||
8531 | ORL DI, BX | ||
8532 | MOVB BL, (AX) | ||
8533 | ADDQ $0x02, AX | ||
8534 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8535 | |||
// 3-byte copy element: tag = (length<<2) - 2, i.e. ((length-1)<<2) | 2,
// followed by the 16-bit offset.
8536 | emit_copy_three_match_nolit_encodeBetterBlockAsm12B: | ||
8537 | LEAL -2(BX), BX | ||
8538 | MOVB BL, (AX) | ||
8539 | MOVW DI, 1(AX) | ||
8540 | ADDQ $0x03, AX | ||
8541 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8542 | |||
// Repeat path: the match offset equals the previous one in 16(SP). Write the
// pending literal bytes src[12(SP):SI] exactly as on the non-repeat path
// (header of 1, 2 or 3 bytes, then short or long copy); the match itself is
// encoded afterwards as a repeat.
8543 | match_is_repeat_encodeBetterBlockAsm12B: | ||
8544 | MOVL 12(SP), BX | ||
8545 | CMPL BX, SI | ||
8546 | JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B | ||
// R8 = literal length, R9 = source pointer of the literals, BX = length - 1.
8547 | MOVL SI, R8 | ||
8548 | MOVL SI, 12(SP) | ||
8549 | LEAQ (DX)(BX*1), R9 | ||
8550 | SUBL BX, R8 | ||
8551 | LEAL -1(R8), BX | ||
8552 | CMPL BX, $0x3c | ||
8553 | JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B | ||
8554 | CMPL BX, $0x00000100 | ||
8555 | JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B | ||
// Same flags, next label: control reaches three_bytes_* either way.
8556 | JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B | ||
8557 | |||
// 3-byte header: 0xf4 followed by a 16-bit length-1.
8558 | three_bytes_match_emit_repeat_encodeBetterBlockAsm12B: | ||
8559 | MOVB $0xf4, (AX) | ||
8560 | MOVW BX, 1(AX) | ||
8561 | ADDQ $0x03, AX | ||
8562 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B | ||
8563 | |||
// 2-byte header: 0xf0 followed by an 8-bit length-1.
8564 | two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: | ||
8565 | MOVB $0xf0, (AX) | ||
8566 | MOVB BL, 1(AX) | ||
8567 | ADDQ $0x02, AX | ||
8568 | CMPL BX, $0x40 | ||
8569 | JB memmove_match_emit_repeat_encodeBetterBlockAsm12B | ||
8570 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B | ||
8571 | |||
// 1-byte header: (length-1) << 2.
8572 | one_byte_match_emit_repeat_encodeBetterBlockAsm12B: | ||
8573 | SHLB $0x02, BL | ||
8574 | MOVB BL, (AX) | ||
8575 | ADDQ $0x01, AX | ||
8576 | |||
// Short copy of R8 (<= 64) bytes from R9 to AX; BX = exact end of the copy.
8577 | memmove_match_emit_repeat_encodeBetterBlockAsm12B: | ||
8578 | LEAQ (AX)(R8*1), BX | ||
8579 | |||
8580 | // genMemMoveShort | ||
8581 | CMPQ R8, $0x04 | ||
8582 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 | ||
8583 | CMPQ R8, $0x08 | ||
8584 | JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 | ||
8585 | CMPQ R8, $0x10 | ||
8586 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 | ||
8587 | CMPQ R8, $0x20 | ||
8588 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 | ||
8589 | JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 | ||
8590 | |||
// Lengths <= 4: always moves a full 4 bytes; AX is set to BX afterwards.
8591 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: | ||
8592 | MOVL (R9), R10 | ||
8593 | MOVL R10, (AX) | ||
8594 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B | ||
8595 | |||
8596 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: | ||
8597 | MOVL (R9), R10 | ||
8598 | MOVL -4(R9)(R8*1), R9 | ||
8599 | MOVL R10, (AX) | ||
8600 | MOVL R9, -4(AX)(R8*1) | ||
8601 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B | ||
8602 | |||
8603 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: | ||
8604 | MOVQ (R9), R10 | ||
8605 | MOVQ -8(R9)(R8*1), R9 | ||
8606 | MOVQ R10, (AX) | ||
8607 | MOVQ R9, -8(AX)(R8*1) | ||
8608 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B | ||
8609 | |||
8610 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: | ||
8611 | MOVOU (R9), X0 | ||
8612 | MOVOU -16(R9)(R8*1), X1 | ||
8613 | MOVOU X0, (AX) | ||
8614 | MOVOU X1, -16(AX)(R8*1) | ||
8615 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B | ||
8616 | |||
8617 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: | ||
8618 | MOVOU (R9), X0 | ||
8619 | MOVOU 16(R9), X1 | ||
8620 | MOVOU -32(R9)(R8*1), X2 | ||
8621 | MOVOU -16(R9)(R8*1), X3 | ||
8622 | MOVOU X0, (AX) | ||
8623 | MOVOU X1, 16(AX) | ||
8624 | MOVOU X2, -32(AX)(R8*1) | ||
8625 | MOVOU X3, -16(AX)(R8*1) | ||
8626 | |||
8627 | memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: | ||
8628 | MOVQ BX, AX | ||
8629 | JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B | ||
8630 | |||
// Long copy: head and tail (32 bytes each) are loaded first and stored last;
// the middle is copied 32 bytes per iteration with stores at AX rounded to a
// 32-byte boundary (R13 = 64 - (AX & 31) is the running offset).
8631 | memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: | ||
8632 | LEAQ (AX)(R8*1), BX | ||
8633 | |||
8634 | // genMemMoveLong | ||
8635 | MOVOU (R9), X0 | ||
8636 | MOVOU 16(R9), X1 | ||
8637 | MOVOU -32(R9)(R8*1), X2 | ||
8638 | MOVOU -16(R9)(R8*1), X3 | ||
8639 | MOVQ R8, R12 | ||
8640 | SHRQ $0x05, R12 | ||
8641 | MOVQ AX, R10 | ||
8642 | ANDL $0x0000001f, R10 | ||
8643 | MOVQ $0x00000040, R13 | ||
8644 | SUBQ R10, R13 | ||
8645 | DECQ R12 | ||
8646 | JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
8647 | LEAQ -32(R9)(R13*1), R10 | ||
8648 | LEAQ -32(AX)(R13*1), R14 | ||
8649 | |||
8650 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: | ||
8651 | MOVOU (R10), X4 | ||
8652 | MOVOU 16(R10), X5 | ||
8653 | MOVOA X4, (R14) | ||
8654 | MOVOA X5, 16(R14) | ||
8655 | ADDQ $0x20, R14 | ||
8656 | ADDQ $0x20, R10 | ||
8657 | ADDQ $0x20, R13 | ||
8658 | DECQ R12 | ||
8659 | JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back | ||
8660 | |||
// Main loop: continue while the length R8 is >= the running offset R13.
8661 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: | ||
8662 | MOVOU -32(R9)(R13*1), X4 | ||
8663 | MOVOU -16(R9)(R13*1), X5 | ||
8664 | MOVOA X4, -32(AX)(R13*1) | ||
8665 | MOVOA X5, -16(AX)(R13*1) | ||
8666 | ADDQ $0x20, R13 | ||
8667 | CMPQ R8, R13 | ||
8668 | JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
// Finish with the saved head and tail, then advance AX to the end of the copy.
8669 | MOVOU X0, (AX) | ||
8670 | MOVOU X1, 16(AX) | ||
8671 | MOVOU X2, -32(AX)(R8*1) | ||
8672 | MOVOU X3, -16(AX)(R8*1) | ||
8673 | MOVQ BX, AX | ||
8674 | |||
// Repeat path, after the literals: advance CX past the match, make R11 the
// full match length (extra bytes + the initial 4), mark everything up to CX
// as emitted, then encode the match as a repeat of the previous offset.
8675 | emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: | ||
8676 | ADDL R11, CX | ||
8677 | ADDL $0x04, R11 | ||
8678 | MOVL CX, 12(SP) | ||
8679 | |||
8680 | // emitRepeat | ||
// BX = length, R11 = length - 4. Form selection:
//   length <= 8                    -> 2-byte repeat
//   length < 12 and offset < 2048  -> 2-byte repeat carrying the offset
//   length - 4 < 260               -> 3-byte repeat
//   otherwise                      -> 4-byte repeat
8681 | MOVL R11, BX | ||
8682 | LEAL -4(R11), R11 | ||
8683 | CMPL BX, $0x08 | ||
8684 | JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B | ||
8685 | CMPL BX, $0x0c | ||
8686 | JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B | ||
8687 | CMPL DI, $0x00000800 | ||
8688 | JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B | ||
8689 | |||
// 4-byte form: bytes 0x19 0x00 followed by a 16-bit (R11 - 256).
8690 | cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: | ||
8691 | CMPL R11, $0x00000104 | ||
8692 | JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B | ||
8693 | LEAL -256(R11), R11 | ||
8694 | MOVW $0x0019, (AX) | ||
8695 | MOVW R11, 2(AX) | ||
8696 | ADDQ $0x04, AX | ||
8697 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8698 | |||
// 3-byte form: bytes 0x15 0x00 followed by an 8-bit (R11 - 4).
8699 | repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: | ||
8700 | LEAL -4(R11), R11 | ||
8701 | MOVW $0x0015, (AX) | ||
8702 | MOVB R11, 2(AX) | ||
8703 | ADDQ $0x03, AX | ||
8704 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8705 | |||
// 2-byte form: 16-bit word (R11 << 2) | 1.
8706 | repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: | ||
8707 | SHLL $0x02, R11 | ||
8708 | ORL $0x01, R11 | ||
8709 | MOVW R11, (AX) | ||
8710 | ADDQ $0x02, AX | ||
8711 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B | ||
8712 | |||
// 2-byte form with offset: tag = 1 | R11<<2 | (offset>>8)<<5, second byte =
// low 8 bits of the offset. Falls through to match_nolit_emitcopy_end.
8713 | repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: | ||
8714 | XORQ BX, BX | ||
8715 | LEAL 1(BX)(R11*4), R11 | ||
8716 | MOVB DI, 1(AX) | ||
8717 | SARL $0x08, DI | ||
8718 | SHLL $0x05, DI | ||
8719 | ORL DI, R11 | ||
8720 | MOVB R11, (AX) | ||
8721 | ADDQ $0x02, AX | ||
8722 | |||
// The match has been encoded. If CX reached the search limit in 8(SP), emit
// the remaining bytes; if the output pointer reached the limit in (SP),
// return 0. Otherwise add positions inside the match to the hash tables and
// resume searching.
8723 | match_nolit_emitcopy_end_encodeBetterBlockAsm12B: | ||
8724 | CMPL CX, 8(SP) | ||
8725 | JAE emit_remainder_encodeBetterBlockAsm12B | ||
8726 | CMPQ AX, (SP) | ||
8727 | JB match_nolit_dst_ok_encodeBetterBlockAsm12B | ||
8728 | MOVQ $0x00000000, ret+48(FP) | ||
8729 | RET | ||
8730 | |||
8731 | match_nolit_dst_ok_encodeBetterBlockAsm12B: | ||
// BX / DI = multipliers of the 6-byte and 4-byte hashes.
// SI = match start + 1, R8 = CX - 2 (two bytes before the match end).
8732 | MOVQ $0x0000cf1bbcdcbf9b, BX | ||
8733 | MOVQ $0x9e3779b1, DI | ||
8734 | LEAQ 1(SI), SI | ||
8735 | LEAQ -2(CX), R8 | ||
// Load the bytes at SI, SI+1, R8 and R8+1 and hash them: the 6-byte hash for
// SI and R8, the 4-byte hash for SI+1 and R8+1.
8736 | MOVQ (DX)(SI*1), R9 | ||
8737 | MOVQ 1(DX)(SI*1), R10 | ||
8738 | MOVQ (DX)(R8*1), R11 | ||
8739 | MOVQ 1(DX)(R8*1), R12 | ||
8740 | SHLQ $0x10, R9 | ||
8741 | IMULQ BX, R9 | ||
8742 | SHRQ $0x32, R9 | ||
8743 | SHLQ $0x20, R10 | ||
8744 | IMULQ DI, R10 | ||
8745 | SHRQ $0x34, R10 | ||
8746 | SHLQ $0x10, R11 | ||
8747 | IMULQ BX, R11 | ||
8748 | SHRQ $0x32, R11 | ||
8749 | SHLQ $0x20, R12 | ||
8750 | IMULQ DI, R12 | ||
8751 | SHRQ $0x34, R12 | ||
// Store the four positions in their respective tables.
8752 | LEAQ 1(SI), DI | ||
8753 | LEAQ 1(R8), R13 | ||
8754 | MOVL SI, 24(SP)(R9*4) | ||
8755 | MOVL R8, 24(SP)(R11*4) | ||
8756 | MOVL DI, 65560(SP)(R10*4) | ||
8757 | MOVL R13, 65560(SP)(R12*4) | ||
// Set up two cursors for the loop below: SI one past the first indexed
// position, DI at the midpoint (SI + R8 + 1) / 2; R8 becomes the upper bound.
8758 | LEAQ 1(R8)(SI*1), DI | ||
8759 | SHRQ $0x01, DI | ||
8760 | ADDQ $0x01, SI | ||
8761 | SUBQ $0x01, R8 | ||
8762 | |||
// Index every second position from both cursors into the 6-byte-hash table
// until the upper cursor DI reaches R8, then go back to searching.
8763 | index_loop_encodeBetterBlockAsm12B: | ||
8764 | CMPQ DI, R8 | ||
8765 | JAE search_loop_encodeBetterBlockAsm12B | ||
8766 | MOVQ (DX)(SI*1), R9 | ||
8767 | MOVQ (DX)(DI*1), R10 | ||
8768 | SHLQ $0x10, R9 | ||
8769 | IMULQ BX, R9 | ||
8770 | SHRQ $0x32, R9 | ||
8771 | SHLQ $0x10, R10 | ||
8772 | IMULQ BX, R10 | ||
8773 | SHRQ $0x32, R10 | ||
8774 | MOVL SI, 24(SP)(R9*4) | ||
8775 | MOVL DI, 24(SP)(R10*4) | ||
8776 | ADDQ $0x02, SI | ||
8777 | ADDQ $0x02, DI | ||
8778 | JMP index_loop_encodeBetterBlockAsm12B | ||
8779 | |||
// Searching is finished. Write the bytes from 12(SP) to the end of src as one
// final literal run and return the number of output bytes.
// First check space: AX + (src_len - 12(SP)) + 3 must be below the limit in
// (SP), otherwise return 0.
8780 | emit_remainder_encodeBetterBlockAsm12B: | ||
8781 | MOVQ src_len+32(FP), CX | ||
8782 | SUBL 12(SP), CX | ||
8783 | LEAQ 3(AX)(CX*1), CX | ||
8784 | CMPQ CX, (SP) | ||
8785 | JB emit_remainder_ok_encodeBetterBlockAsm12B | ||
8786 | MOVQ $0x00000000, ret+48(FP) | ||
8787 | RET | ||
8788 | |||
8789 | emit_remainder_ok_encodeBetterBlockAsm12B: | ||
// Nothing pending when 12(SP) already equals src_len.
8790 | MOVQ src_len+32(FP), CX | ||
8791 | MOVL 12(SP), BX | ||
8792 | CMPL BX, CX | ||
8793 | JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B | ||
// From here CX = source pointer of the literals, SI = their length and
// DX = length - 1 (DX no longer holds the src base).
8794 | MOVL CX, SI | ||
8795 | MOVL CX, 12(SP) | ||
8796 | LEAQ (DX)(BX*1), CX | ||
8797 | SUBL BX, SI | ||
8798 | LEAL -1(SI), DX | ||
// Header size depends on length-1: < 60 -> 1 byte, < 256 -> 2 bytes,
// otherwise 3 bytes.
8799 | CMPL DX, $0x3c | ||
8800 | JB one_byte_emit_remainder_encodeBetterBlockAsm12B | ||
8801 | CMPL DX, $0x00000100 | ||
8802 | JB two_bytes_emit_remainder_encodeBetterBlockAsm12B | ||
// Same flags, next label: control reaches three_bytes_* either way.
8803 | JB three_bytes_emit_remainder_encodeBetterBlockAsm12B | ||
8804 | |||
// 3-byte header: 0xf4 followed by a 16-bit length-1.
8805 | three_bytes_emit_remainder_encodeBetterBlockAsm12B: | ||
8806 | MOVB $0xf4, (AX) | ||
8807 | MOVW DX, 1(AX) | ||
8808 | ADDQ $0x03, AX | ||
8809 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B | ||
8810 | |||
// 2-byte header: 0xf0 followed by an 8-bit length-1.
8811 | two_bytes_emit_remainder_encodeBetterBlockAsm12B: | ||
8812 | MOVB $0xf0, (AX) | ||
8813 | MOVB DL, 1(AX) | ||
8814 | ADDQ $0x02, AX | ||
8815 | CMPL DX, $0x40 | ||
8816 | JB memmove_emit_remainder_encodeBetterBlockAsm12B | ||
8817 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B | ||
8818 | |||
// 1-byte header: (length-1) << 2.
8819 | one_byte_emit_remainder_encodeBetterBlockAsm12B: | ||
8820 | SHLB $0x02, DL | ||
8821 | MOVB DL, (AX) | ||
8822 | ADDQ $0x01, AX | ||
8823 | |||
// Short copy of BX (<= 64) bytes from CX to AX; DX = exact end of the copy.
// Unlike the in-loop copies, this one has exact 1-2 and 3 byte cases.
8824 | memmove_emit_remainder_encodeBetterBlockAsm12B: | ||
8825 | LEAQ (AX)(SI*1), DX | ||
8826 | MOVL SI, BX | ||
8827 | |||
8828 | // genMemMoveShort | ||
8829 | CMPQ BX, $0x03 | ||
8830 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 | ||
8831 | JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 | ||
8832 | CMPQ BX, $0x08 | ||
8833 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 | ||
8834 | CMPQ BX, $0x10 | ||
8835 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 | ||
8836 | CMPQ BX, $0x20 | ||
8837 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 | ||
8838 | JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 | ||
8839 | |||
8840 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: | ||
8841 | MOVB (CX), SI | ||
8842 | MOVB -1(CX)(BX*1), CL | ||
8843 | MOVB SI, (AX) | ||
8844 | MOVB CL, -1(AX)(BX*1) | ||
8845 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B | ||
8846 | |||
8847 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: | ||
8848 | MOVW (CX), SI | ||
8849 | MOVB 2(CX), CL | ||
8850 | MOVW SI, (AX) | ||
8851 | MOVB CL, 2(AX) | ||
8852 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B | ||
8853 | |||
8854 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: | ||
8855 | MOVL (CX), SI | ||
8856 | MOVL -4(CX)(BX*1), CX | ||
8857 | MOVL SI, (AX) | ||
8858 | MOVL CX, -4(AX)(BX*1) | ||
8859 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B | ||
8860 | |||
8861 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: | ||
8862 | MOVQ (CX), SI | ||
8863 | MOVQ -8(CX)(BX*1), CX | ||
8864 | MOVQ SI, (AX) | ||
8865 | MOVQ CX, -8(AX)(BX*1) | ||
8866 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B | ||
8867 | |||
8868 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: | ||
8869 | MOVOU (CX), X0 | ||
8870 | MOVOU -16(CX)(BX*1), X1 | ||
8871 | MOVOU X0, (AX) | ||
8872 | MOVOU X1, -16(AX)(BX*1) | ||
8873 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B | ||
8874 | |||
8875 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: | ||
8876 | MOVOU (CX), X0 | ||
8877 | MOVOU 16(CX), X1 | ||
8878 | MOVOU -32(CX)(BX*1), X2 | ||
8879 | MOVOU -16(CX)(BX*1), X3 | ||
8880 | MOVOU X0, (AX) | ||
8881 | MOVOU X1, 16(AX) | ||
8882 | MOVOU X2, -32(AX)(BX*1) | ||
8883 | MOVOU X3, -16(AX)(BX*1) | ||
8884 | |||
8885 | memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: | ||
8886 | MOVQ DX, AX | ||
8887 | JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B | ||
8888 | |||
// Long copy: head and tail (32 bytes each) are loaded first and stored last;
// the middle is copied 32 bytes per iteration with stores at AX rounded to a
// 32-byte boundary (R8 = 64 - (AX & 31) is the running offset).
8889 | memmove_long_emit_remainder_encodeBetterBlockAsm12B: | ||
8890 | LEAQ (AX)(SI*1), DX | ||
8891 | MOVL SI, BX | ||
8892 | |||
8893 | // genMemMoveLong | ||
8894 | MOVOU (CX), X0 | ||
8895 | MOVOU 16(CX), X1 | ||
8896 | MOVOU -32(CX)(BX*1), X2 | ||
8897 | MOVOU -16(CX)(BX*1), X3 | ||
8898 | MOVQ BX, DI | ||
8899 | SHRQ $0x05, DI | ||
8900 | MOVQ AX, SI | ||
8901 | ANDL $0x0000001f, SI | ||
8902 | MOVQ $0x00000040, R8 | ||
8903 | SUBQ SI, R8 | ||
8904 | DECQ DI | ||
8905 | JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
8906 | LEAQ -32(CX)(R8*1), SI | ||
8907 | LEAQ -32(AX)(R8*1), R9 | ||
8908 | |||
8909 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: | ||
8910 | MOVOU (SI), X4 | ||
8911 | MOVOU 16(SI), X5 | ||
8912 | MOVOA X4, (R9) | ||
8913 | MOVOA X5, 16(R9) | ||
8914 | ADDQ $0x20, R9 | ||
8915 | ADDQ $0x20, SI | ||
8916 | ADDQ $0x20, R8 | ||
8917 | DECQ DI | ||
8918 | JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back | ||
8919 | |||
// Main loop: continue while the length BX is >= the running offset R8.
8920 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: | ||
8921 | MOVOU -32(CX)(R8*1), X4 | ||
8922 | MOVOU -16(CX)(R8*1), X5 | ||
8923 | MOVOA X4, -32(AX)(R8*1) | ||
8924 | MOVOA X5, -16(AX)(R8*1) | ||
8925 | ADDQ $0x20, R8 | ||
8926 | CMPQ BX, R8 | ||
8927 | JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
// Finish with the saved head and tail, then advance AX to the end of the copy.
8928 | MOVOU X0, (AX) | ||
8929 | MOVOU X1, 16(AX) | ||
8930 | MOVOU X2, -32(AX)(BX*1) | ||
8931 | MOVOU X3, -16(AX)(BX*1) | ||
8932 | MOVQ DX, AX | ||
8933 | |||
// Return value = output pointer - dst_base, i.e. the number of bytes written.
8934 | emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: | ||
8935 | MOVQ dst_base+0(FP), CX | ||
8936 | SUBQ CX, AX | ||
8937 | MOVQ AX, ret+48(FP) | ||
8938 | RET | ||
8939 | |||
8940 | // func encodeBetterBlockAsm10B(dst []byte, src []byte) int | ||
8941 | // Requires: BMI, SSE2 | ||
8942 | TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 | ||
8943 | MOVQ dst_base+0(FP), AX | ||
8944 | MOVQ $0x000000a0, CX | ||
8945 | LEAQ 24(SP), DX | ||
8946 | PXOR X0, X0 | ||
8947 | |||
8948 | zero_loop_encodeBetterBlockAsm10B: | ||
8949 | MOVOU X0, (DX) | ||
8950 | MOVOU X0, 16(DX) | ||
8951 | MOVOU X0, 32(DX) | ||
8952 | MOVOU X0, 48(DX) | ||
8953 | MOVOU X0, 64(DX) | ||
8954 | MOVOU X0, 80(DX) | ||
8955 | MOVOU X0, 96(DX) | ||
8956 | MOVOU X0, 112(DX) | ||
8957 | ADDQ $0x80, DX | ||
8958 | DECQ CX | ||
8959 | JNZ zero_loop_encodeBetterBlockAsm10B | ||
8960 | MOVL $0x00000000, 12(SP) | ||
8961 | MOVQ src_len+32(FP), CX | ||
8962 | LEAQ -6(CX), DX | ||
8963 | LEAQ -8(CX), BX | ||
8964 | MOVL BX, 8(SP) | ||
8965 | SHRQ $0x05, CX | ||
8966 | SUBL CX, DX | ||
8967 | LEAQ (AX)(DX*1), DX | ||
8968 | MOVQ DX, (SP) | ||
8969 | MOVL $0x00000001, CX | ||
8970 | MOVL $0x00000000, 16(SP) | ||
8971 | MOVQ src_base+24(FP), DX | ||
8972 | |||
8973 | search_loop_encodeBetterBlockAsm10B: | ||
8974 | MOVL CX, BX | ||
8975 | SUBL 12(SP), BX | ||
8976 | SHRL $0x05, BX | ||
8977 | LEAL 1(CX)(BX*1), BX | ||
8978 | CMPL BX, 8(SP) | ||
8979 | JAE emit_remainder_encodeBetterBlockAsm10B | ||
8980 | MOVQ (DX)(CX*1), SI | ||
8981 | MOVL BX, 20(SP) | ||
8982 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
8983 | MOVQ $0x9e3779b1, BX | ||
8984 | MOVQ SI, R9 | ||
8985 | MOVQ SI, R10 | ||
8986 | SHLQ $0x10, R9 | ||
8987 | IMULQ R8, R9 | ||
8988 | SHRQ $0x34, R9 | ||
8989 | SHLQ $0x20, R10 | ||
8990 | IMULQ BX, R10 | ||
8991 | SHRQ $0x36, R10 | ||
8992 | MOVL 24(SP)(R9*4), BX | ||
8993 | MOVL 16408(SP)(R10*4), DI | ||
8994 | MOVL CX, 24(SP)(R9*4) | ||
8995 | MOVL CX, 16408(SP)(R10*4) | ||
8996 | MOVQ (DX)(BX*1), R9 | ||
8997 | MOVQ (DX)(DI*1), R10 | ||
8998 | CMPQ R9, SI | ||
8999 | JEQ candidate_match_encodeBetterBlockAsm10B | ||
9000 | CMPQ R10, SI | ||
9001 | JNE no_short_found_encodeBetterBlockAsm10B | ||
9002 | MOVL DI, BX | ||
9003 | JMP candidate_match_encodeBetterBlockAsm10B | ||
9004 | |||
9005 | no_short_found_encodeBetterBlockAsm10B: | ||
9006 | CMPL R9, SI | ||
9007 | JEQ candidate_match_encodeBetterBlockAsm10B | ||
9008 | CMPL R10, SI | ||
9009 | JEQ candidateS_match_encodeBetterBlockAsm10B | ||
9010 | MOVL 20(SP), CX | ||
9011 | JMP search_loop_encodeBetterBlockAsm10B | ||
9012 | |||
9013 | candidateS_match_encodeBetterBlockAsm10B: | ||
9014 | SHRQ $0x08, SI | ||
9015 | MOVQ SI, R9 | ||
9016 | SHLQ $0x10, R9 | ||
9017 | IMULQ R8, R9 | ||
9018 | SHRQ $0x34, R9 | ||
9019 | MOVL 24(SP)(R9*4), BX | ||
9020 | INCL CX | ||
9021 | MOVL CX, 24(SP)(R9*4) | ||
9022 | CMPL (DX)(BX*1), SI | ||
9023 | JEQ candidate_match_encodeBetterBlockAsm10B | ||
9024 | DECL CX | ||
9025 | MOVL DI, BX | ||
9026 | |||
9027 | candidate_match_encodeBetterBlockAsm10B: | ||
9028 | MOVL 12(SP), SI | ||
9029 | TESTL BX, BX | ||
9030 | JZ match_extend_back_end_encodeBetterBlockAsm10B | ||
9031 | |||
9032 | match_extend_back_loop_encodeBetterBlockAsm10B: | ||
9033 | CMPL CX, SI | ||
9034 | JBE match_extend_back_end_encodeBetterBlockAsm10B | ||
9035 | MOVB -1(DX)(BX*1), DI | ||
9036 | MOVB -1(DX)(CX*1), R8 | ||
9037 | CMPB DI, R8 | ||
9038 | JNE match_extend_back_end_encodeBetterBlockAsm10B | ||
9039 | LEAL -1(CX), CX | ||
9040 | DECL BX | ||
9041 | JZ match_extend_back_end_encodeBetterBlockAsm10B | ||
9042 | JMP match_extend_back_loop_encodeBetterBlockAsm10B | ||
9043 | |||
9044 | match_extend_back_end_encodeBetterBlockAsm10B: | ||
9045 | MOVL CX, SI | ||
9046 | SUBL 12(SP), SI | ||
9047 | LEAQ 3(AX)(SI*1), SI | ||
9048 | CMPQ SI, (SP) | ||
9049 | JB match_dst_size_check_encodeBetterBlockAsm10B | ||
9050 | MOVQ $0x00000000, ret+48(FP) | ||
9051 | RET | ||
9052 | |||
9053 | match_dst_size_check_encodeBetterBlockAsm10B: | ||
9054 | MOVL CX, SI | ||
9055 | ADDL $0x04, CX | ||
9056 | ADDL $0x04, BX | ||
9057 | MOVQ src_len+32(FP), DI | ||
9058 | SUBL CX, DI | ||
9059 | LEAQ (DX)(CX*1), R8 | ||
9060 | LEAQ (DX)(BX*1), R9 | ||
9061 | |||
9062 | // matchLen | ||
9063 | XORL R11, R11 | ||
9064 | |||
9065 | matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B: | ||
9066 | CMPL DI, $0x10 | ||
9067 | JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B | ||
9068 | MOVQ (R8)(R11*1), R10 | ||
9069 | MOVQ 8(R8)(R11*1), R12 | ||
9070 | XORQ (R9)(R11*1), R10 | ||
9071 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B | ||
9072 | XORQ 8(R9)(R11*1), R12 | ||
9073 | JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B | ||
9074 | LEAL -16(DI), DI | ||
9075 | LEAL 16(R11), R11 | ||
9076 | JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B | ||
9077 | |||
9078 | matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B: | ||
9079 | #ifdef GOAMD64_v3 | ||
9080 | TZCNTQ R12, R12 | ||
9081 | |||
9082 | #else | ||
9083 | BSFQ R12, R12 | ||
9084 | |||
9085 | #endif | ||
9086 | SARQ $0x03, R12 | ||
9087 | LEAL 8(R11)(R12*1), R11 | ||
9088 | JMP match_nolit_end_encodeBetterBlockAsm10B | ||
9089 | |||
9090 | matchlen_match8_match_nolit_encodeBetterBlockAsm10B: | ||
9091 | CMPL DI, $0x08 | ||
9092 | JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B | ||
9093 | MOVQ (R8)(R11*1), R10 | ||
9094 | XORQ (R9)(R11*1), R10 | ||
9095 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B | ||
9096 | LEAL -8(DI), DI | ||
9097 | LEAL 8(R11), R11 | ||
9098 | JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B | ||
9099 | |||
9100 | matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B: | ||
9101 | #ifdef GOAMD64_v3 | ||
9102 | TZCNTQ R10, R10 | ||
9103 | |||
9104 | #else | ||
9105 | BSFQ R10, R10 | ||
9106 | |||
9107 | #endif | ||
9108 | SARQ $0x03, R10 | ||
9109 | LEAL (R11)(R10*1), R11 | ||
9110 | JMP match_nolit_end_encodeBetterBlockAsm10B | ||
9111 | |||
9112 | matchlen_match4_match_nolit_encodeBetterBlockAsm10B: | ||
9113 | CMPL DI, $0x04 | ||
9114 | JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B | ||
9115 | MOVL (R8)(R11*1), R10 | ||
9116 | CMPL (R9)(R11*1), R10 | ||
9117 | JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B | ||
9118 | LEAL -4(DI), DI | ||
9119 | LEAL 4(R11), R11 | ||
9120 | |||
9121 | matchlen_match2_match_nolit_encodeBetterBlockAsm10B: | ||
9122 | CMPL DI, $0x01 | ||
9123 | JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B | ||
9124 | JB match_nolit_end_encodeBetterBlockAsm10B | ||
9125 | MOVW (R8)(R11*1), R10 | ||
9126 | CMPW (R9)(R11*1), R10 | ||
9127 | JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B | ||
9128 | LEAL 2(R11), R11 | ||
9129 | SUBL $0x02, DI | ||
9130 | JZ match_nolit_end_encodeBetterBlockAsm10B | ||
9131 | |||
9132 | matchlen_match1_match_nolit_encodeBetterBlockAsm10B: | ||
9133 | MOVB (R8)(R11*1), R10 | ||
9134 | CMPB (R9)(R11*1), R10 | ||
9135 | JNE match_nolit_end_encodeBetterBlockAsm10B | ||
9136 | LEAL 1(R11), R11 | ||
9137 | |||
9138 | match_nolit_end_encodeBetterBlockAsm10B: | ||
9139 | MOVL CX, DI | ||
9140 | SUBL BX, DI | ||
9141 | |||
9142 | // Check if repeat | ||
9143 | CMPL 16(SP), DI | ||
9144 | JEQ match_is_repeat_encodeBetterBlockAsm10B | ||
9145 | MOVL DI, 16(SP) | ||
9146 | MOVL 12(SP), BX | ||
9147 | CMPL BX, SI | ||
9148 | JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B | ||
9149 | MOVL SI, R8 | ||
9150 | MOVL SI, 12(SP) | ||
9151 | LEAQ (DX)(BX*1), R9 | ||
9152 | SUBL BX, R8 | ||
9153 | LEAL -1(R8), BX | ||
9154 | CMPL BX, $0x3c | ||
9155 | JB one_byte_match_emit_encodeBetterBlockAsm10B | ||
9156 | CMPL BX, $0x00000100 | ||
9157 | JB two_bytes_match_emit_encodeBetterBlockAsm10B | ||
9158 | JB three_bytes_match_emit_encodeBetterBlockAsm10B | ||
9159 | |||
9160 | three_bytes_match_emit_encodeBetterBlockAsm10B: | ||
9161 | MOVB $0xf4, (AX) | ||
9162 | MOVW BX, 1(AX) | ||
9163 | ADDQ $0x03, AX | ||
9164 | JMP memmove_long_match_emit_encodeBetterBlockAsm10B | ||
9165 | |||
9166 | two_bytes_match_emit_encodeBetterBlockAsm10B: | ||
9167 | MOVB $0xf0, (AX) | ||
9168 | MOVB BL, 1(AX) | ||
9169 | ADDQ $0x02, AX | ||
9170 | CMPL BX, $0x40 | ||
9171 | JB memmove_match_emit_encodeBetterBlockAsm10B | ||
9172 | JMP memmove_long_match_emit_encodeBetterBlockAsm10B | ||
9173 | |||
9174 | one_byte_match_emit_encodeBetterBlockAsm10B: | ||
9175 | SHLB $0x02, BL | ||
9176 | MOVB BL, (AX) | ||
9177 | ADDQ $0x01, AX | ||
9178 | |||
9179 | memmove_match_emit_encodeBetterBlockAsm10B: | ||
9180 | LEAQ (AX)(R8*1), BX | ||
9181 | |||
9182 | // genMemMoveShort | ||
9183 | CMPQ R8, $0x04 | ||
9184 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 | ||
9185 | CMPQ R8, $0x08 | ||
9186 | JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 | ||
9187 | CMPQ R8, $0x10 | ||
9188 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 | ||
9189 | CMPQ R8, $0x20 | ||
9190 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 | ||
9191 | JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 | ||
9192 | |||
9193 | emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: | ||
9194 | MOVL (R9), R10 | ||
9195 | MOVL R10, (AX) | ||
9196 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B | ||
9197 | |||
9198 | emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: | ||
9199 | MOVL (R9), R10 | ||
9200 | MOVL -4(R9)(R8*1), R9 | ||
9201 | MOVL R10, (AX) | ||
9202 | MOVL R9, -4(AX)(R8*1) | ||
9203 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B | ||
9204 | |||
9205 | emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: | ||
9206 | MOVQ (R9), R10 | ||
9207 | MOVQ -8(R9)(R8*1), R9 | ||
9208 | MOVQ R10, (AX) | ||
9209 | MOVQ R9, -8(AX)(R8*1) | ||
9210 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B | ||
9211 | |||
9212 | emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: | ||
9213 | MOVOU (R9), X0 | ||
9214 | MOVOU -16(R9)(R8*1), X1 | ||
9215 | MOVOU X0, (AX) | ||
9216 | MOVOU X1, -16(AX)(R8*1) | ||
9217 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B | ||
9218 | |||
9219 | emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: | ||
9220 | MOVOU (R9), X0 | ||
9221 | MOVOU 16(R9), X1 | ||
9222 | MOVOU -32(R9)(R8*1), X2 | ||
9223 | MOVOU -16(R9)(R8*1), X3 | ||
9224 | MOVOU X0, (AX) | ||
9225 | MOVOU X1, 16(AX) | ||
9226 | MOVOU X2, -32(AX)(R8*1) | ||
9227 | MOVOU X3, -16(AX)(R8*1) | ||
9228 | |||
9229 | memmove_end_copy_match_emit_encodeBetterBlockAsm10B: | ||
9230 | MOVQ BX, AX | ||
9231 | JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B | ||
9232 | |||
9233 | memmove_long_match_emit_encodeBetterBlockAsm10B: | ||
9234 | LEAQ (AX)(R8*1), BX | ||
9235 | |||
9236 | // genMemMoveLong | ||
9237 | MOVOU (R9), X0 | ||
9238 | MOVOU 16(R9), X1 | ||
9239 | MOVOU -32(R9)(R8*1), X2 | ||
9240 | MOVOU -16(R9)(R8*1), X3 | ||
9241 | MOVQ R8, R12 | ||
9242 | SHRQ $0x05, R12 | ||
9243 | MOVQ AX, R10 | ||
9244 | ANDL $0x0000001f, R10 | ||
9245 | MOVQ $0x00000040, R13 | ||
9246 | SUBQ R10, R13 | ||
9247 | DECQ R12 | ||
9248 | JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 | ||
9249 | LEAQ -32(R9)(R13*1), R10 | ||
9250 | LEAQ -32(AX)(R13*1), R14 | ||
9251 | |||
9252 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: | ||
9253 | MOVOU (R10), X4 | ||
9254 | MOVOU 16(R10), X5 | ||
9255 | MOVOA X4, (R14) | ||
9256 | MOVOA X5, 16(R14) | ||
9257 | ADDQ $0x20, R14 | ||
9258 | ADDQ $0x20, R10 | ||
9259 | ADDQ $0x20, R13 | ||
9260 | DECQ R12 | ||
9261 | JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back | ||
9262 | |||
9263 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: | ||
9264 | MOVOU -32(R9)(R13*1), X4 | ||
9265 | MOVOU -16(R9)(R13*1), X5 | ||
9266 | MOVOA X4, -32(AX)(R13*1) | ||
9267 | MOVOA X5, -16(AX)(R13*1) | ||
9268 | ADDQ $0x20, R13 | ||
9269 | CMPQ R8, R13 | ||
9270 | JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 | ||
9271 | MOVOU X0, (AX) | ||
9272 | MOVOU X1, 16(AX) | ||
9273 | MOVOU X2, -32(AX)(R8*1) | ||
9274 | MOVOU X3, -16(AX)(R8*1) | ||
9275 | MOVQ BX, AX | ||
9276 | |||
9277 | emit_literal_done_match_emit_encodeBetterBlockAsm10B: | ||
9278 | ADDL R11, CX | ||
9279 | ADDL $0x04, R11 | ||
9280 | MOVL CX, 12(SP) | ||
9281 | |||
9282 | // emitCopy | ||
9283 | CMPL R11, $0x40 | ||
9284 | JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B | ||
9285 | CMPL DI, $0x00000800 | ||
9286 | JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B | ||
9287 | MOVL $0x00000001, BX | ||
9288 | LEAL 16(BX), BX | ||
9289 | MOVB DI, 1(AX) | ||
9290 | SHRL $0x08, DI | ||
9291 | SHLL $0x05, DI | ||
9292 | ORL DI, BX | ||
9293 | MOVB BL, (AX) | ||
9294 | ADDQ $0x02, AX | ||
9295 | SUBL $0x08, R11 | ||
9296 | |||
9297 | // emitRepeat | ||
9298 | LEAL -4(R11), R11 | ||
9299 | JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b | ||
9300 | MOVL R11, BX | ||
9301 | LEAL -4(R11), R11 | ||
9302 | CMPL BX, $0x08 | ||
9303 | JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b | ||
9304 | CMPL BX, $0x0c | ||
9305 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b | ||
9306 | CMPL DI, $0x00000800 | ||
9307 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b | ||
9308 | |||
9309 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: | ||
9310 | CMPL R11, $0x00000104 | ||
9311 | JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b | ||
9312 | LEAL -256(R11), R11 | ||
9313 | MOVW $0x0019, (AX) | ||
9314 | MOVW R11, 2(AX) | ||
9315 | ADDQ $0x04, AX | ||
9316 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9317 | |||
9318 | repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: | ||
9319 | LEAL -4(R11), R11 | ||
9320 | MOVW $0x0015, (AX) | ||
9321 | MOVB R11, 2(AX) | ||
9322 | ADDQ $0x03, AX | ||
9323 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9324 | |||
9325 | repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: | ||
9326 | SHLL $0x02, R11 | ||
9327 | ORL $0x01, R11 | ||
9328 | MOVW R11, (AX) | ||
9329 | ADDQ $0x02, AX | ||
9330 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9331 | |||
9332 | repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: | ||
9333 | XORQ BX, BX | ||
9334 | LEAL 1(BX)(R11*4), R11 | ||
9335 | MOVB DI, 1(AX) | ||
9336 | SARL $0x08, DI | ||
9337 | SHLL $0x05, DI | ||
9338 | ORL DI, R11 | ||
9339 | MOVB R11, (AX) | ||
9340 | ADDQ $0x02, AX | ||
9341 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9342 | |||
9343 | long_offset_short_match_nolit_encodeBetterBlockAsm10B: | ||
9344 | MOVB $0xee, (AX) | ||
9345 | MOVW DI, 1(AX) | ||
9346 | LEAL -60(R11), R11 | ||
9347 | ADDQ $0x03, AX | ||
9348 | |||
9349 | // emitRepeat | ||
9350 | MOVL R11, BX | ||
9351 | LEAL -4(R11), R11 | ||
9352 | CMPL BX, $0x08 | ||
9353 | JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short | ||
9354 | CMPL BX, $0x0c | ||
9355 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short | ||
9356 | CMPL DI, $0x00000800 | ||
9357 | JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short | ||
9358 | |||
9359 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: | ||
9360 | CMPL R11, $0x00000104 | ||
9361 | JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short | ||
9362 | LEAL -256(R11), R11 | ||
9363 | MOVW $0x0019, (AX) | ||
9364 | MOVW R11, 2(AX) | ||
9365 | ADDQ $0x04, AX | ||
9366 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9367 | |||
9368 | repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: | ||
9369 | LEAL -4(R11), R11 | ||
9370 | MOVW $0x0015, (AX) | ||
9371 | MOVB R11, 2(AX) | ||
9372 | ADDQ $0x03, AX | ||
9373 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9374 | |||
9375 | repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: | ||
9376 | SHLL $0x02, R11 | ||
9377 | ORL $0x01, R11 | ||
9378 | MOVW R11, (AX) | ||
9379 | ADDQ $0x02, AX | ||
9380 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9381 | |||
9382 | repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: | ||
9383 | XORQ BX, BX | ||
9384 | LEAL 1(BX)(R11*4), R11 | ||
9385 | MOVB DI, 1(AX) | ||
9386 | SARL $0x08, DI | ||
9387 | SHLL $0x05, DI | ||
9388 | ORL DI, R11 | ||
9389 | MOVB R11, (AX) | ||
9390 | ADDQ $0x02, AX | ||
9391 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9392 | |||
9393 | two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: | ||
9394 | MOVL R11, BX | ||
9395 | SHLL $0x02, BX | ||
9396 | CMPL R11, $0x0c | ||
9397 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B | ||
9398 | CMPL DI, $0x00000800 | ||
9399 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B | ||
9400 | LEAL -15(BX), BX | ||
9401 | MOVB DI, 1(AX) | ||
9402 | SHRL $0x08, DI | ||
9403 | SHLL $0x05, DI | ||
9404 | ORL DI, BX | ||
9405 | MOVB BL, (AX) | ||
9406 | ADDQ $0x02, AX | ||
9407 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9408 | |||
9409 | emit_copy_three_match_nolit_encodeBetterBlockAsm10B: | ||
9410 | LEAL -2(BX), BX | ||
9411 | MOVB BL, (AX) | ||
9412 | MOVW DI, 1(AX) | ||
9413 | ADDQ $0x03, AX | ||
9414 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9415 | |||
9416 | match_is_repeat_encodeBetterBlockAsm10B: | ||
9417 | MOVL 12(SP), BX | ||
9418 | CMPL BX, SI | ||
9419 | JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B | ||
9420 | MOVL SI, R8 | ||
9421 | MOVL SI, 12(SP) | ||
9422 | LEAQ (DX)(BX*1), R9 | ||
9423 | SUBL BX, R8 | ||
9424 | LEAL -1(R8), BX | ||
9425 | CMPL BX, $0x3c | ||
9426 | JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B | ||
9427 | CMPL BX, $0x00000100 | ||
9428 | JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B | ||
9429 | JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B | ||
9430 | |||
9431 | three_bytes_match_emit_repeat_encodeBetterBlockAsm10B: | ||
9432 | MOVB $0xf4, (AX) | ||
9433 | MOVW BX, 1(AX) | ||
9434 | ADDQ $0x03, AX | ||
9435 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B | ||
9436 | |||
9437 | two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: | ||
9438 | MOVB $0xf0, (AX) | ||
9439 | MOVB BL, 1(AX) | ||
9440 | ADDQ $0x02, AX | ||
9441 | CMPL BX, $0x40 | ||
9442 | JB memmove_match_emit_repeat_encodeBetterBlockAsm10B | ||
9443 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B | ||
9444 | |||
9445 | one_byte_match_emit_repeat_encodeBetterBlockAsm10B: | ||
9446 | SHLB $0x02, BL | ||
9447 | MOVB BL, (AX) | ||
9448 | ADDQ $0x01, AX | ||
9449 | |||
9450 | memmove_match_emit_repeat_encodeBetterBlockAsm10B: | ||
9451 | LEAQ (AX)(R8*1), BX | ||
9452 | |||
9453 | // genMemMoveShort | ||
9454 | CMPQ R8, $0x04 | ||
9455 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 | ||
9456 | CMPQ R8, $0x08 | ||
9457 | JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 | ||
9458 | CMPQ R8, $0x10 | ||
9459 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 | ||
9460 | CMPQ R8, $0x20 | ||
9461 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 | ||
9462 | JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 | ||
9463 | |||
9464 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: | ||
9465 | MOVL (R9), R10 | ||
9466 | MOVL R10, (AX) | ||
9467 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B | ||
9468 | |||
9469 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: | ||
9470 | MOVL (R9), R10 | ||
9471 | MOVL -4(R9)(R8*1), R9 | ||
9472 | MOVL R10, (AX) | ||
9473 | MOVL R9, -4(AX)(R8*1) | ||
9474 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B | ||
9475 | |||
9476 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: | ||
9477 | MOVQ (R9), R10 | ||
9478 | MOVQ -8(R9)(R8*1), R9 | ||
9479 | MOVQ R10, (AX) | ||
9480 | MOVQ R9, -8(AX)(R8*1) | ||
9481 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B | ||
9482 | |||
9483 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: | ||
9484 | MOVOU (R9), X0 | ||
9485 | MOVOU -16(R9)(R8*1), X1 | ||
9486 | MOVOU X0, (AX) | ||
9487 | MOVOU X1, -16(AX)(R8*1) | ||
9488 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B | ||
9489 | |||
9490 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: | ||
9491 | MOVOU (R9), X0 | ||
9492 | MOVOU 16(R9), X1 | ||
9493 | MOVOU -32(R9)(R8*1), X2 | ||
9494 | MOVOU -16(R9)(R8*1), X3 | ||
9495 | MOVOU X0, (AX) | ||
9496 | MOVOU X1, 16(AX) | ||
9497 | MOVOU X2, -32(AX)(R8*1) | ||
9498 | MOVOU X3, -16(AX)(R8*1) | ||
9499 | |||
9500 | memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: | ||
9501 | MOVQ BX, AX | ||
9502 | JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B | ||
9503 | |||
9504 | memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: | ||
9505 | LEAQ (AX)(R8*1), BX | ||
9506 | |||
9507 | // genMemMoveLong | ||
9508 | MOVOU (R9), X0 | ||
9509 | MOVOU 16(R9), X1 | ||
9510 | MOVOU -32(R9)(R8*1), X2 | ||
9511 | MOVOU -16(R9)(R8*1), X3 | ||
9512 | MOVQ R8, R12 | ||
9513 | SHRQ $0x05, R12 | ||
9514 | MOVQ AX, R10 | ||
9515 | ANDL $0x0000001f, R10 | ||
9516 | MOVQ $0x00000040, R13 | ||
9517 | SUBQ R10, R13 | ||
9518 | DECQ R12 | ||
9519 | JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 | ||
9520 | LEAQ -32(R9)(R13*1), R10 | ||
9521 | LEAQ -32(AX)(R13*1), R14 | ||
9522 | |||
9523 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: | ||
9524 | MOVOU (R10), X4 | ||
9525 | MOVOU 16(R10), X5 | ||
9526 | MOVOA X4, (R14) | ||
9527 | MOVOA X5, 16(R14) | ||
9528 | ADDQ $0x20, R14 | ||
9529 | ADDQ $0x20, R10 | ||
9530 | ADDQ $0x20, R13 | ||
9531 | DECQ R12 | ||
9532 | JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back | ||
9533 | |||
9534 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: | ||
9535 | MOVOU -32(R9)(R13*1), X4 | ||
9536 | MOVOU -16(R9)(R13*1), X5 | ||
9537 | MOVOA X4, -32(AX)(R13*1) | ||
9538 | MOVOA X5, -16(AX)(R13*1) | ||
9539 | ADDQ $0x20, R13 | ||
9540 | CMPQ R8, R13 | ||
9541 | JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 | ||
9542 | MOVOU X0, (AX) | ||
9543 | MOVOU X1, 16(AX) | ||
9544 | MOVOU X2, -32(AX)(R8*1) | ||
9545 | MOVOU X3, -16(AX)(R8*1) | ||
9546 | MOVQ BX, AX | ||
9547 | |||
9548 | emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: | ||
9549 | ADDL R11, CX | ||
9550 | ADDL $0x04, R11 | ||
9551 | MOVL CX, 12(SP) | ||
9552 | |||
9553 | // emitRepeat | ||
9554 | MOVL R11, BX | ||
9555 | LEAL -4(R11), R11 | ||
9556 | CMPL BX, $0x08 | ||
9557 | JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B | ||
9558 | CMPL BX, $0x0c | ||
9559 | JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B | ||
9560 | CMPL DI, $0x00000800 | ||
9561 | JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B | ||
9562 | |||
9563 | cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: | ||
9564 | CMPL R11, $0x00000104 | ||
9565 | JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B | ||
9566 | LEAL -256(R11), R11 | ||
9567 | MOVW $0x0019, (AX) | ||
9568 | MOVW R11, 2(AX) | ||
9569 | ADDQ $0x04, AX | ||
9570 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9571 | |||
9572 | repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: | ||
9573 | LEAL -4(R11), R11 | ||
9574 | MOVW $0x0015, (AX) | ||
9575 | MOVB R11, 2(AX) | ||
9576 | ADDQ $0x03, AX | ||
9577 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9578 | |||
9579 | repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: | ||
9580 | SHLL $0x02, R11 | ||
9581 | ORL $0x01, R11 | ||
9582 | MOVW R11, (AX) | ||
9583 | ADDQ $0x02, AX | ||
9584 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B | ||
9585 | |||
9586 | repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: | ||
9587 | XORQ BX, BX | ||
9588 | LEAL 1(BX)(R11*4), R11 | ||
9589 | MOVB DI, 1(AX) | ||
9590 | SARL $0x08, DI | ||
9591 | SHLL $0x05, DI | ||
9592 | ORL DI, R11 | ||
9593 | MOVB R11, (AX) | ||
9594 | ADDQ $0x02, AX | ||
9595 | |||
9596 | match_nolit_emitcopy_end_encodeBetterBlockAsm10B: | ||
9597 | CMPL CX, 8(SP) | ||
9598 | JAE emit_remainder_encodeBetterBlockAsm10B | ||
9599 | CMPQ AX, (SP) | ||
9600 | JB match_nolit_dst_ok_encodeBetterBlockAsm10B | ||
9601 | MOVQ $0x00000000, ret+48(FP) | ||
9602 | RET | ||
9603 | |||
9604 | match_nolit_dst_ok_encodeBetterBlockAsm10B: | ||
9605 | MOVQ $0x0000cf1bbcdcbf9b, BX | ||
9606 | MOVQ $0x9e3779b1, DI | ||
9607 | LEAQ 1(SI), SI | ||
9608 | LEAQ -2(CX), R8 | ||
9609 | MOVQ (DX)(SI*1), R9 | ||
9610 | MOVQ 1(DX)(SI*1), R10 | ||
9611 | MOVQ (DX)(R8*1), R11 | ||
9612 | MOVQ 1(DX)(R8*1), R12 | ||
9613 | SHLQ $0x10, R9 | ||
9614 | IMULQ BX, R9 | ||
9615 | SHRQ $0x34, R9 | ||
9616 | SHLQ $0x20, R10 | ||
9617 | IMULQ DI, R10 | ||
9618 | SHRQ $0x36, R10 | ||
9619 | SHLQ $0x10, R11 | ||
9620 | IMULQ BX, R11 | ||
9621 | SHRQ $0x34, R11 | ||
9622 | SHLQ $0x20, R12 | ||
9623 | IMULQ DI, R12 | ||
9624 | SHRQ $0x36, R12 | ||
9625 | LEAQ 1(SI), DI | ||
9626 | LEAQ 1(R8), R13 | ||
9627 | MOVL SI, 24(SP)(R9*4) | ||
9628 | MOVL R8, 24(SP)(R11*4) | ||
9629 | MOVL DI, 16408(SP)(R10*4) | ||
9630 | MOVL R13, 16408(SP)(R12*4) | ||
9631 | LEAQ 1(R8)(SI*1), DI | ||
9632 | SHRQ $0x01, DI | ||
9633 | ADDQ $0x01, SI | ||
9634 | SUBQ $0x01, R8 | ||
9635 | |||
9636 | index_loop_encodeBetterBlockAsm10B: | ||
9637 | CMPQ DI, R8 | ||
9638 | JAE search_loop_encodeBetterBlockAsm10B | ||
9639 | MOVQ (DX)(SI*1), R9 | ||
9640 | MOVQ (DX)(DI*1), R10 | ||
9641 | SHLQ $0x10, R9 | ||
9642 | IMULQ BX, R9 | ||
9643 | SHRQ $0x34, R9 | ||
9644 | SHLQ $0x10, R10 | ||
9645 | IMULQ BX, R10 | ||
9646 | SHRQ $0x34, R10 | ||
9647 | MOVL SI, 24(SP)(R9*4) | ||
9648 | MOVL DI, 24(SP)(R10*4) | ||
9649 | ADDQ $0x02, SI | ||
9650 | ADDQ $0x02, DI | ||
9651 | JMP index_loop_encodeBetterBlockAsm10B | ||
9652 | |||
9653 | emit_remainder_encodeBetterBlockAsm10B: | ||
9654 | MOVQ src_len+32(FP), CX | ||
9655 | SUBL 12(SP), CX | ||
9656 | LEAQ 3(AX)(CX*1), CX | ||
9657 | CMPQ CX, (SP) | ||
9658 | JB emit_remainder_ok_encodeBetterBlockAsm10B | ||
9659 | MOVQ $0x00000000, ret+48(FP) | ||
9660 | RET | ||
9661 | |||
9662 | emit_remainder_ok_encodeBetterBlockAsm10B: | ||
9663 | MOVQ src_len+32(FP), CX | ||
9664 | MOVL 12(SP), BX | ||
9665 | CMPL BX, CX | ||
9666 | JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B | ||
9667 | MOVL CX, SI | ||
9668 | MOVL CX, 12(SP) | ||
9669 | LEAQ (DX)(BX*1), CX | ||
9670 | SUBL BX, SI | ||
9671 | LEAL -1(SI), DX | ||
9672 | CMPL DX, $0x3c | ||
9673 | JB one_byte_emit_remainder_encodeBetterBlockAsm10B | ||
9674 | CMPL DX, $0x00000100 | ||
9675 | JB two_bytes_emit_remainder_encodeBetterBlockAsm10B | ||
9676 | JB three_bytes_emit_remainder_encodeBetterBlockAsm10B | ||
9677 | |||
9678 | three_bytes_emit_remainder_encodeBetterBlockAsm10B: | ||
9679 | MOVB $0xf4, (AX) | ||
9680 | MOVW DX, 1(AX) | ||
9681 | ADDQ $0x03, AX | ||
9682 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B | ||
9683 | |||
9684 | two_bytes_emit_remainder_encodeBetterBlockAsm10B: | ||
9685 | MOVB $0xf0, (AX) | ||
9686 | MOVB DL, 1(AX) | ||
9687 | ADDQ $0x02, AX | ||
9688 | CMPL DX, $0x40 | ||
9689 | JB memmove_emit_remainder_encodeBetterBlockAsm10B | ||
9690 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B | ||
9691 | |||
9692 | one_byte_emit_remainder_encodeBetterBlockAsm10B: | ||
9693 | SHLB $0x02, DL | ||
9694 | MOVB DL, (AX) | ||
9695 | ADDQ $0x01, AX | ||
9696 | |||
9697 | memmove_emit_remainder_encodeBetterBlockAsm10B: | ||
9698 | LEAQ (AX)(SI*1), DX | ||
9699 | MOVL SI, BX | ||
9700 | |||
9701 | // genMemMoveShort | ||
9702 | CMPQ BX, $0x03 | ||
9703 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 | ||
9704 | JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 | ||
9705 | CMPQ BX, $0x08 | ||
9706 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 | ||
9707 | CMPQ BX, $0x10 | ||
9708 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 | ||
9709 | CMPQ BX, $0x20 | ||
9710 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 | ||
9711 | JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 | ||
9712 | |||
9713 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: | ||
9714 | MOVB (CX), SI | ||
9715 | MOVB -1(CX)(BX*1), CL | ||
9716 | MOVB SI, (AX) | ||
9717 | MOVB CL, -1(AX)(BX*1) | ||
9718 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B | ||
9719 | |||
9720 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: | ||
9721 | MOVW (CX), SI | ||
9722 | MOVB 2(CX), CL | ||
9723 | MOVW SI, (AX) | ||
9724 | MOVB CL, 2(AX) | ||
9725 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B | ||
9726 | |||
9727 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: | ||
9728 | MOVL (CX), SI | ||
9729 | MOVL -4(CX)(BX*1), CX | ||
9730 | MOVL SI, (AX) | ||
9731 | MOVL CX, -4(AX)(BX*1) | ||
9732 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B | ||
9733 | |||
9734 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: | ||
9735 | MOVQ (CX), SI | ||
9736 | MOVQ -8(CX)(BX*1), CX | ||
9737 | MOVQ SI, (AX) | ||
9738 | MOVQ CX, -8(AX)(BX*1) | ||
9739 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B | ||
9740 | |||
9741 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: | ||
9742 | MOVOU (CX), X0 | ||
9743 | MOVOU -16(CX)(BX*1), X1 | ||
9744 | MOVOU X0, (AX) | ||
9745 | MOVOU X1, -16(AX)(BX*1) | ||
9746 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B | ||
9747 | |||
9748 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: | ||
9749 | MOVOU (CX), X0 | ||
9750 | MOVOU 16(CX), X1 | ||
9751 | MOVOU -32(CX)(BX*1), X2 | ||
9752 | MOVOU -16(CX)(BX*1), X3 | ||
9753 | MOVOU X0, (AX) | ||
9754 | MOVOU X1, 16(AX) | ||
9755 | MOVOU X2, -32(AX)(BX*1) | ||
9756 | MOVOU X3, -16(AX)(BX*1) | ||
9757 | |||
9758 | memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: | ||
9759 | MOVQ DX, AX | ||
9760 | JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B | ||
9761 | |||
9762 | memmove_long_emit_remainder_encodeBetterBlockAsm10B: | ||
9763 | LEAQ (AX)(SI*1), DX | ||
9764 | MOVL SI, BX | ||
9765 | |||
9766 | // genMemMoveLong | ||
9767 | MOVOU (CX), X0 | ||
9768 | MOVOU 16(CX), X1 | ||
9769 | MOVOU -32(CX)(BX*1), X2 | ||
9770 | MOVOU -16(CX)(BX*1), X3 | ||
9771 | MOVQ BX, DI | ||
9772 | SHRQ $0x05, DI | ||
9773 | MOVQ AX, SI | ||
9774 | ANDL $0x0000001f, SI | ||
9775 | MOVQ $0x00000040, R8 | ||
9776 | SUBQ SI, R8 | ||
9777 | DECQ DI | ||
9778 | JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 | ||
9779 | LEAQ -32(CX)(R8*1), SI | ||
9780 | LEAQ -32(AX)(R8*1), R9 | ||
9781 | |||
9782 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: | ||
9783 | MOVOU (SI), X4 | ||
9784 | MOVOU 16(SI), X5 | ||
9785 | MOVOA X4, (R9) | ||
9786 | MOVOA X5, 16(R9) | ||
9787 | ADDQ $0x20, R9 | ||
9788 | ADDQ $0x20, SI | ||
9789 | ADDQ $0x20, R8 | ||
9790 | DECQ DI | ||
9791 | JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back | ||
9792 | |||
9793 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: | ||
9794 | MOVOU -32(CX)(R8*1), X4 | ||
9795 | MOVOU -16(CX)(R8*1), X5 | ||
9796 | MOVOA X4, -32(AX)(R8*1) | ||
9797 | MOVOA X5, -16(AX)(R8*1) | ||
9798 | ADDQ $0x20, R8 | ||
9799 | CMPQ BX, R8 | ||
9800 | JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 | ||
9801 | MOVOU X0, (AX) | ||
9802 | MOVOU X1, 16(AX) | ||
9803 | MOVOU X2, -32(AX)(BX*1) | ||
9804 | MOVOU X3, -16(AX)(BX*1) | ||
9805 | MOVQ DX, AX | ||
9806 | |||
9807 | emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: | ||
9808 | MOVQ dst_base+0(FP), CX | ||
9809 | SUBQ CX, AX | ||
9810 | MOVQ AX, ret+48(FP) | ||
9811 | RET | ||
9812 | |||
9813 | // func encodeBetterBlockAsm8B(dst []byte, src []byte) int | ||
9814 | // Requires: BMI, SSE2 | ||
9815 | TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 | ||
9816 | MOVQ dst_base+0(FP), AX | ||
9817 | MOVQ $0x00000028, CX | ||
9818 | LEAQ 24(SP), DX | ||
9819 | PXOR X0, X0 | ||
9820 | |||
9821 | zero_loop_encodeBetterBlockAsm8B: | ||
9822 | MOVOU X0, (DX) | ||
9823 | MOVOU X0, 16(DX) | ||
9824 | MOVOU X0, 32(DX) | ||
9825 | MOVOU X0, 48(DX) | ||
9826 | MOVOU X0, 64(DX) | ||
9827 | MOVOU X0, 80(DX) | ||
9828 | MOVOU X0, 96(DX) | ||
9829 | MOVOU X0, 112(DX) | ||
9830 | ADDQ $0x80, DX | ||
9831 | DECQ CX | ||
9832 | JNZ zero_loop_encodeBetterBlockAsm8B | ||
9833 | MOVL $0x00000000, 12(SP) | ||
9834 | MOVQ src_len+32(FP), CX | ||
9835 | LEAQ -6(CX), DX | ||
9836 | LEAQ -8(CX), BX | ||
9837 | MOVL BX, 8(SP) | ||
9838 | SHRQ $0x05, CX | ||
9839 | SUBL CX, DX | ||
9840 | LEAQ (AX)(DX*1), DX | ||
9841 | MOVQ DX, (SP) | ||
9842 | MOVL $0x00000001, CX | ||
9843 | MOVL $0x00000000, 16(SP) | ||
9844 | MOVQ src_base+24(FP), DX | ||
9845 | |||
9846 | search_loop_encodeBetterBlockAsm8B: | ||
9847 | MOVL CX, BX | ||
9848 | SUBL 12(SP), BX | ||
9849 | SHRL $0x04, BX | ||
9850 | LEAL 1(CX)(BX*1), BX | ||
9851 | CMPL BX, 8(SP) | ||
9852 | JAE emit_remainder_encodeBetterBlockAsm8B | ||
9853 | MOVQ (DX)(CX*1), SI | ||
9854 | MOVL BX, 20(SP) | ||
9855 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
9856 | MOVQ $0x9e3779b1, BX | ||
9857 | MOVQ SI, R9 | ||
9858 | MOVQ SI, R10 | ||
9859 | SHLQ $0x10, R9 | ||
9860 | IMULQ R8, R9 | ||
9861 | SHRQ $0x36, R9 | ||
9862 | SHLQ $0x20, R10 | ||
9863 | IMULQ BX, R10 | ||
9864 | SHRQ $0x38, R10 | ||
9865 | MOVL 24(SP)(R9*4), BX | ||
9866 | MOVL 4120(SP)(R10*4), DI | ||
9867 | MOVL CX, 24(SP)(R9*4) | ||
9868 | MOVL CX, 4120(SP)(R10*4) | ||
9869 | MOVQ (DX)(BX*1), R9 | ||
9870 | MOVQ (DX)(DI*1), R10 | ||
9871 | CMPQ R9, SI | ||
9872 | JEQ candidate_match_encodeBetterBlockAsm8B | ||
9873 | CMPQ R10, SI | ||
9874 | JNE no_short_found_encodeBetterBlockAsm8B | ||
9875 | MOVL DI, BX | ||
9876 | JMP candidate_match_encodeBetterBlockAsm8B | ||
9877 | |||
9878 | no_short_found_encodeBetterBlockAsm8B: | ||
9879 | CMPL R9, SI | ||
9880 | JEQ candidate_match_encodeBetterBlockAsm8B | ||
9881 | CMPL R10, SI | ||
9882 | JEQ candidateS_match_encodeBetterBlockAsm8B | ||
9883 | MOVL 20(SP), CX | ||
9884 | JMP search_loop_encodeBetterBlockAsm8B | ||
9885 | |||
9886 | candidateS_match_encodeBetterBlockAsm8B: | ||
9887 | SHRQ $0x08, SI | ||
9888 | MOVQ SI, R9 | ||
9889 | SHLQ $0x10, R9 | ||
9890 | IMULQ R8, R9 | ||
9891 | SHRQ $0x36, R9 | ||
9892 | MOVL 24(SP)(R9*4), BX | ||
9893 | INCL CX | ||
9894 | MOVL CX, 24(SP)(R9*4) | ||
9895 | CMPL (DX)(BX*1), SI | ||
9896 | JEQ candidate_match_encodeBetterBlockAsm8B | ||
9897 | DECL CX | ||
9898 | MOVL DI, BX | ||
9899 | |||
9900 | candidate_match_encodeBetterBlockAsm8B: | ||
9901 | MOVL 12(SP), SI | ||
9902 | TESTL BX, BX | ||
9903 | JZ match_extend_back_end_encodeBetterBlockAsm8B | ||
9904 | |||
9905 | match_extend_back_loop_encodeBetterBlockAsm8B: | ||
9906 | CMPL CX, SI | ||
9907 | JBE match_extend_back_end_encodeBetterBlockAsm8B | ||
9908 | MOVB -1(DX)(BX*1), DI | ||
9909 | MOVB -1(DX)(CX*1), R8 | ||
9910 | CMPB DI, R8 | ||
9911 | JNE match_extend_back_end_encodeBetterBlockAsm8B | ||
9912 | LEAL -1(CX), CX | ||
9913 | DECL BX | ||
9914 | JZ match_extend_back_end_encodeBetterBlockAsm8B | ||
9915 | JMP match_extend_back_loop_encodeBetterBlockAsm8B | ||
9916 | |||
9917 | match_extend_back_end_encodeBetterBlockAsm8B: | ||
9918 | MOVL CX, SI | ||
9919 | SUBL 12(SP), SI | ||
9920 | LEAQ 3(AX)(SI*1), SI | ||
9921 | CMPQ SI, (SP) | ||
9922 | JB match_dst_size_check_encodeBetterBlockAsm8B | ||
9923 | MOVQ $0x00000000, ret+48(FP) | ||
9924 | RET | ||
9925 | |||
9926 | match_dst_size_check_encodeBetterBlockAsm8B: | ||
9927 | MOVL CX, SI | ||
9928 | ADDL $0x04, CX | ||
9929 | ADDL $0x04, BX | ||
9930 | MOVQ src_len+32(FP), DI | ||
9931 | SUBL CX, DI | ||
9932 | LEAQ (DX)(CX*1), R8 | ||
9933 | LEAQ (DX)(BX*1), R9 | ||
9934 | |||
9935 | // matchLen | ||
9936 | XORL R11, R11 | ||
9937 | |||
9938 | matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B: | ||
9939 | CMPL DI, $0x10 | ||
9940 | JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B | ||
9941 | MOVQ (R8)(R11*1), R10 | ||
9942 | MOVQ 8(R8)(R11*1), R12 | ||
9943 | XORQ (R9)(R11*1), R10 | ||
9944 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B | ||
9945 | XORQ 8(R9)(R11*1), R12 | ||
9946 | JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B | ||
9947 | LEAL -16(DI), DI | ||
9948 | LEAL 16(R11), R11 | ||
9949 | JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B | ||
9950 | |||
9951 | matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B: | ||
9952 | #ifdef GOAMD64_v3 | ||
9953 | TZCNTQ R12, R12 | ||
9954 | |||
9955 | #else | ||
9956 | BSFQ R12, R12 | ||
9957 | |||
9958 | #endif | ||
9959 | SARQ $0x03, R12 | ||
9960 | LEAL 8(R11)(R12*1), R11 | ||
9961 | JMP match_nolit_end_encodeBetterBlockAsm8B | ||
9962 | |||
9963 | matchlen_match8_match_nolit_encodeBetterBlockAsm8B: | ||
9964 | CMPL DI, $0x08 | ||
9965 | JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B | ||
9966 | MOVQ (R8)(R11*1), R10 | ||
9967 | XORQ (R9)(R11*1), R10 | ||
9968 | JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B | ||
9969 | LEAL -8(DI), DI | ||
9970 | LEAL 8(R11), R11 | ||
9971 | JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B | ||
9972 | |||
9973 | matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B: | ||
9974 | #ifdef GOAMD64_v3 | ||
9975 | TZCNTQ R10, R10 | ||
9976 | |||
9977 | #else | ||
9978 | BSFQ R10, R10 | ||
9979 | |||
9980 | #endif | ||
9981 | SARQ $0x03, R10 | ||
9982 | LEAL (R11)(R10*1), R11 | ||
9983 | JMP match_nolit_end_encodeBetterBlockAsm8B | ||
9984 | |||
9985 | matchlen_match4_match_nolit_encodeBetterBlockAsm8B: | ||
9986 | CMPL DI, $0x04 | ||
9987 | JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B | ||
9988 | MOVL (R8)(R11*1), R10 | ||
9989 | CMPL (R9)(R11*1), R10 | ||
9990 | JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B | ||
9991 | LEAL -4(DI), DI | ||
9992 | LEAL 4(R11), R11 | ||
9993 | |||
9994 | matchlen_match2_match_nolit_encodeBetterBlockAsm8B: | ||
9995 | CMPL DI, $0x01 | ||
9996 | JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B | ||
9997 | JB match_nolit_end_encodeBetterBlockAsm8B | ||
9998 | MOVW (R8)(R11*1), R10 | ||
9999 | CMPW (R9)(R11*1), R10 | ||
10000 | JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B | ||
10001 | LEAL 2(R11), R11 | ||
10002 | SUBL $0x02, DI | ||
10003 | JZ match_nolit_end_encodeBetterBlockAsm8B | ||
10004 | |||
10005 | matchlen_match1_match_nolit_encodeBetterBlockAsm8B: | ||
10006 | MOVB (R8)(R11*1), R10 | ||
10007 | CMPB (R9)(R11*1), R10 | ||
10008 | JNE match_nolit_end_encodeBetterBlockAsm8B | ||
10009 | LEAL 1(R11), R11 | ||
10010 | |||
10011 | match_nolit_end_encodeBetterBlockAsm8B: | ||
10012 | MOVL CX, DI | ||
10013 | SUBL BX, DI | ||
10014 | |||
10015 | // Check if repeat | ||
10016 | CMPL 16(SP), DI | ||
10017 | JEQ match_is_repeat_encodeBetterBlockAsm8B | ||
10018 | MOVL DI, 16(SP) | ||
10019 | MOVL 12(SP), BX | ||
10020 | CMPL BX, SI | ||
10021 | JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B | ||
10022 | MOVL SI, R8 | ||
10023 | MOVL SI, 12(SP) | ||
10024 | LEAQ (DX)(BX*1), R9 | ||
10025 | SUBL BX, R8 | ||
10026 | LEAL -1(R8), BX | ||
10027 | CMPL BX, $0x3c | ||
10028 | JB one_byte_match_emit_encodeBetterBlockAsm8B | ||
10029 | CMPL BX, $0x00000100 | ||
10030 | JB two_bytes_match_emit_encodeBetterBlockAsm8B | ||
10031 | JB three_bytes_match_emit_encodeBetterBlockAsm8B | ||
10032 | |||
10033 | three_bytes_match_emit_encodeBetterBlockAsm8B: | ||
10034 | MOVB $0xf4, (AX) | ||
10035 | MOVW BX, 1(AX) | ||
10036 | ADDQ $0x03, AX | ||
10037 | JMP memmove_long_match_emit_encodeBetterBlockAsm8B | ||
10038 | |||
10039 | two_bytes_match_emit_encodeBetterBlockAsm8B: | ||
10040 | MOVB $0xf0, (AX) | ||
10041 | MOVB BL, 1(AX) | ||
10042 | ADDQ $0x02, AX | ||
10043 | CMPL BX, $0x40 | ||
10044 | JB memmove_match_emit_encodeBetterBlockAsm8B | ||
10045 | JMP memmove_long_match_emit_encodeBetterBlockAsm8B | ||
10046 | |||
10047 | one_byte_match_emit_encodeBetterBlockAsm8B: | ||
10048 | SHLB $0x02, BL | ||
10049 | MOVB BL, (AX) | ||
10050 | ADDQ $0x01, AX | ||
10051 | |||
10052 | memmove_match_emit_encodeBetterBlockAsm8B: | ||
10053 | LEAQ (AX)(R8*1), BX | ||
10054 | |||
10055 | // genMemMoveShort | ||
10056 | CMPQ R8, $0x04 | ||
10057 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 | ||
10058 | CMPQ R8, $0x08 | ||
10059 | JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 | ||
10060 | CMPQ R8, $0x10 | ||
10061 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 | ||
10062 | CMPQ R8, $0x20 | ||
10063 | JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 | ||
10064 | JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 | ||
10065 | |||
10066 | emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: | ||
10067 | MOVL (R9), R10 | ||
10068 | MOVL R10, (AX) | ||
10069 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B | ||
10070 | |||
10071 | emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: | ||
10072 | MOVL (R9), R10 | ||
10073 | MOVL -4(R9)(R8*1), R9 | ||
10074 | MOVL R10, (AX) | ||
10075 | MOVL R9, -4(AX)(R8*1) | ||
10076 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B | ||
10077 | |||
10078 | emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: | ||
10079 | MOVQ (R9), R10 | ||
10080 | MOVQ -8(R9)(R8*1), R9 | ||
10081 | MOVQ R10, (AX) | ||
10082 | MOVQ R9, -8(AX)(R8*1) | ||
10083 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B | ||
10084 | |||
10085 | emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: | ||
10086 | MOVOU (R9), X0 | ||
10087 | MOVOU -16(R9)(R8*1), X1 | ||
10088 | MOVOU X0, (AX) | ||
10089 | MOVOU X1, -16(AX)(R8*1) | ||
10090 | JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B | ||
10091 | |||
10092 | emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: | ||
10093 | MOVOU (R9), X0 | ||
10094 | MOVOU 16(R9), X1 | ||
10095 | MOVOU -32(R9)(R8*1), X2 | ||
10096 | MOVOU -16(R9)(R8*1), X3 | ||
10097 | MOVOU X0, (AX) | ||
10098 | MOVOU X1, 16(AX) | ||
10099 | MOVOU X2, -32(AX)(R8*1) | ||
10100 | MOVOU X3, -16(AX)(R8*1) | ||
10101 | |||
10102 | memmove_end_copy_match_emit_encodeBetterBlockAsm8B: | ||
10103 | MOVQ BX, AX | ||
10104 | JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B | ||
10105 | |||
10106 | memmove_long_match_emit_encodeBetterBlockAsm8B: | ||
10107 | LEAQ (AX)(R8*1), BX | ||
10108 | |||
10109 | // genMemMoveLong | ||
10110 | MOVOU (R9), X0 | ||
10111 | MOVOU 16(R9), X1 | ||
10112 | MOVOU -32(R9)(R8*1), X2 | ||
10113 | MOVOU -16(R9)(R8*1), X3 | ||
10114 | MOVQ R8, R12 | ||
10115 | SHRQ $0x05, R12 | ||
10116 | MOVQ AX, R10 | ||
10117 | ANDL $0x0000001f, R10 | ||
10118 | MOVQ $0x00000040, R13 | ||
10119 | SUBQ R10, R13 | ||
10120 | DECQ R12 | ||
10121 | JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
10122 | LEAQ -32(R9)(R13*1), R10 | ||
10123 | LEAQ -32(AX)(R13*1), R14 | ||
10124 | |||
10125 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: | ||
10126 | MOVOU (R10), X4 | ||
10127 | MOVOU 16(R10), X5 | ||
10128 | MOVOA X4, (R14) | ||
10129 | MOVOA X5, 16(R14) | ||
10130 | ADDQ $0x20, R14 | ||
10131 | ADDQ $0x20, R10 | ||
10132 | ADDQ $0x20, R13 | ||
10133 | DECQ R12 | ||
10134 | JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back | ||
10135 | |||
10136 | emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: | ||
10137 | MOVOU -32(R9)(R13*1), X4 | ||
10138 | MOVOU -16(R9)(R13*1), X5 | ||
10139 | MOVOA X4, -32(AX)(R13*1) | ||
10140 | MOVOA X5, -16(AX)(R13*1) | ||
10141 | ADDQ $0x20, R13 | ||
10142 | CMPQ R8, R13 | ||
10143 | JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
10144 | MOVOU X0, (AX) | ||
10145 | MOVOU X1, 16(AX) | ||
10146 | MOVOU X2, -32(AX)(R8*1) | ||
10147 | MOVOU X3, -16(AX)(R8*1) | ||
10148 | MOVQ BX, AX | ||
10149 | |||
10150 | emit_literal_done_match_emit_encodeBetterBlockAsm8B: | ||
10151 | ADDL R11, CX | ||
10152 | ADDL $0x04, R11 | ||
10153 | MOVL CX, 12(SP) | ||
10154 | |||
10155 | // emitCopy | ||
10156 | CMPL R11, $0x40 | ||
10157 | JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B | ||
10158 | CMPL DI, $0x00000800 | ||
10159 | JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B | ||
10160 | MOVL $0x00000001, BX | ||
10161 | LEAL 16(BX), BX | ||
10162 | MOVB DI, 1(AX) | ||
10163 | SHRL $0x08, DI | ||
10164 | SHLL $0x05, DI | ||
10165 | ORL DI, BX | ||
10166 | MOVB BL, (AX) | ||
10167 | ADDQ $0x02, AX | ||
10168 | SUBL $0x08, R11 | ||
10169 | |||
10170 | // emitRepeat | ||
10171 | LEAL -4(R11), R11 | ||
10172 | JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b | ||
10173 | MOVL R11, BX | ||
10174 | LEAL -4(R11), R11 | ||
10175 | CMPL BX, $0x08 | ||
10176 | JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b | ||
10177 | CMPL BX, $0x0c | ||
10178 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b | ||
10179 | |||
10180 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: | ||
10181 | CMPL R11, $0x00000104 | ||
10182 | JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b | ||
10183 | LEAL -256(R11), R11 | ||
10184 | MOVW $0x0019, (AX) | ||
10185 | MOVW R11, 2(AX) | ||
10186 | ADDQ $0x04, AX | ||
10187 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10188 | |||
10189 | repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: | ||
10190 | LEAL -4(R11), R11 | ||
10191 | MOVW $0x0015, (AX) | ||
10192 | MOVB R11, 2(AX) | ||
10193 | ADDQ $0x03, AX | ||
10194 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10195 | |||
10196 | repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: | ||
10197 | SHLL $0x02, R11 | ||
10198 | ORL $0x01, R11 | ||
10199 | MOVW R11, (AX) | ||
10200 | ADDQ $0x02, AX | ||
10201 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10202 | XORQ BX, BX | ||
10203 | LEAL 1(BX)(R11*4), R11 | ||
10204 | MOVB DI, 1(AX) | ||
10205 | SARL $0x08, DI | ||
10206 | SHLL $0x05, DI | ||
10207 | ORL DI, R11 | ||
10208 | MOVB R11, (AX) | ||
10209 | ADDQ $0x02, AX | ||
10210 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10211 | |||
10212 | long_offset_short_match_nolit_encodeBetterBlockAsm8B: | ||
10213 | MOVB $0xee, (AX) | ||
10214 | MOVW DI, 1(AX) | ||
10215 | LEAL -60(R11), R11 | ||
10216 | ADDQ $0x03, AX | ||
10217 | |||
10218 | // emitRepeat | ||
10219 | MOVL R11, BX | ||
10220 | LEAL -4(R11), R11 | ||
10221 | CMPL BX, $0x08 | ||
10222 | JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short | ||
10223 | CMPL BX, $0x0c | ||
10224 | JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short | ||
10225 | |||
10226 | cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: | ||
10227 | CMPL R11, $0x00000104 | ||
10228 | JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short | ||
10229 | LEAL -256(R11), R11 | ||
10230 | MOVW $0x0019, (AX) | ||
10231 | MOVW R11, 2(AX) | ||
10232 | ADDQ $0x04, AX | ||
10233 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10234 | |||
10235 | repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: | ||
10236 | LEAL -4(R11), R11 | ||
10237 | MOVW $0x0015, (AX) | ||
10238 | MOVB R11, 2(AX) | ||
10239 | ADDQ $0x03, AX | ||
10240 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10241 | |||
10242 | repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: | ||
10243 | SHLL $0x02, R11 | ||
10244 | ORL $0x01, R11 | ||
10245 | MOVW R11, (AX) | ||
10246 | ADDQ $0x02, AX | ||
10247 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10248 | XORQ BX, BX | ||
10249 | LEAL 1(BX)(R11*4), R11 | ||
10250 | MOVB DI, 1(AX) | ||
10251 | SARL $0x08, DI | ||
10252 | SHLL $0x05, DI | ||
10253 | ORL DI, R11 | ||
10254 | MOVB R11, (AX) | ||
10255 | ADDQ $0x02, AX | ||
10256 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10257 | |||
10258 | two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: | ||
10259 | MOVL R11, BX | ||
10260 | SHLL $0x02, BX | ||
10261 | CMPL R11, $0x0c | ||
10262 | JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B | ||
10263 | LEAL -15(BX), BX | ||
10264 | MOVB DI, 1(AX) | ||
10265 | SHRL $0x08, DI | ||
10266 | SHLL $0x05, DI | ||
10267 | ORL DI, BX | ||
10268 | MOVB BL, (AX) | ||
10269 | ADDQ $0x02, AX | ||
10270 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10271 | |||
10272 | emit_copy_three_match_nolit_encodeBetterBlockAsm8B: | ||
10273 | LEAL -2(BX), BX | ||
10274 | MOVB BL, (AX) | ||
10275 | MOVW DI, 1(AX) | ||
10276 | ADDQ $0x03, AX | ||
10277 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10278 | |||
10279 | match_is_repeat_encodeBetterBlockAsm8B: | ||
10280 | MOVL 12(SP), BX | ||
10281 | CMPL BX, SI | ||
10282 | JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B | ||
10283 | MOVL SI, DI | ||
10284 | MOVL SI, 12(SP) | ||
10285 | LEAQ (DX)(BX*1), R8 | ||
10286 | SUBL BX, DI | ||
10287 | LEAL -1(DI), BX | ||
10288 | CMPL BX, $0x3c | ||
10289 | JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B | ||
10290 | CMPL BX, $0x00000100 | ||
10291 | JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B | ||
10292 | JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B | ||
10293 | |||
10294 | three_bytes_match_emit_repeat_encodeBetterBlockAsm8B: | ||
10295 | MOVB $0xf4, (AX) | ||
10296 | MOVW BX, 1(AX) | ||
10297 | ADDQ $0x03, AX | ||
10298 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B | ||
10299 | |||
10300 | two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: | ||
10301 | MOVB $0xf0, (AX) | ||
10302 | MOVB BL, 1(AX) | ||
10303 | ADDQ $0x02, AX | ||
10304 | CMPL BX, $0x40 | ||
10305 | JB memmove_match_emit_repeat_encodeBetterBlockAsm8B | ||
10306 | JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B | ||
10307 | |||
10308 | one_byte_match_emit_repeat_encodeBetterBlockAsm8B: | ||
10309 | SHLB $0x02, BL | ||
10310 | MOVB BL, (AX) | ||
10311 | ADDQ $0x01, AX | ||
10312 | |||
10313 | memmove_match_emit_repeat_encodeBetterBlockAsm8B: | ||
10314 | LEAQ (AX)(DI*1), BX | ||
10315 | |||
10316 | // genMemMoveShort | ||
10317 | CMPQ DI, $0x04 | ||
10318 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 | ||
10319 | CMPQ DI, $0x08 | ||
10320 | JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 | ||
10321 | CMPQ DI, $0x10 | ||
10322 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 | ||
10323 | CMPQ DI, $0x20 | ||
10324 | JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 | ||
10325 | JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 | ||
10326 | |||
10327 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: | ||
10328 | MOVL (R8), R9 | ||
10329 | MOVL R9, (AX) | ||
10330 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B | ||
10331 | |||
10332 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: | ||
10333 | MOVL (R8), R9 | ||
10334 | MOVL -4(R8)(DI*1), R8 | ||
10335 | MOVL R9, (AX) | ||
10336 | MOVL R8, -4(AX)(DI*1) | ||
10337 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B | ||
10338 | |||
10339 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: | ||
10340 | MOVQ (R8), R9 | ||
10341 | MOVQ -8(R8)(DI*1), R8 | ||
10342 | MOVQ R9, (AX) | ||
10343 | MOVQ R8, -8(AX)(DI*1) | ||
10344 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B | ||
10345 | |||
10346 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: | ||
10347 | MOVOU (R8), X0 | ||
10348 | MOVOU -16(R8)(DI*1), X1 | ||
10349 | MOVOU X0, (AX) | ||
10350 | MOVOU X1, -16(AX)(DI*1) | ||
10351 | JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B | ||
10352 | |||
10353 | emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: | ||
10354 | MOVOU (R8), X0 | ||
10355 | MOVOU 16(R8), X1 | ||
10356 | MOVOU -32(R8)(DI*1), X2 | ||
10357 | MOVOU -16(R8)(DI*1), X3 | ||
10358 | MOVOU X0, (AX) | ||
10359 | MOVOU X1, 16(AX) | ||
10360 | MOVOU X2, -32(AX)(DI*1) | ||
10361 | MOVOU X3, -16(AX)(DI*1) | ||
10362 | |||
10363 | memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: | ||
10364 | MOVQ BX, AX | ||
10365 | JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B | ||
10366 | |||
10367 | memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: | ||
10368 | LEAQ (AX)(DI*1), BX | ||
10369 | |||
10370 | // genMemMoveLong | ||
10371 | MOVOU (R8), X0 | ||
10372 | MOVOU 16(R8), X1 | ||
10373 | MOVOU -32(R8)(DI*1), X2 | ||
10374 | MOVOU -16(R8)(DI*1), X3 | ||
10375 | MOVQ DI, R10 | ||
10376 | SHRQ $0x05, R10 | ||
10377 | MOVQ AX, R9 | ||
10378 | ANDL $0x0000001f, R9 | ||
10379 | MOVQ $0x00000040, R12 | ||
10380 | SUBQ R9, R12 | ||
10381 | DECQ R10 | ||
10382 | JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
10383 | LEAQ -32(R8)(R12*1), R9 | ||
10384 | LEAQ -32(AX)(R12*1), R13 | ||
10385 | |||
10386 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: | ||
10387 | MOVOU (R9), X4 | ||
10388 | MOVOU 16(R9), X5 | ||
10389 | MOVOA X4, (R13) | ||
10390 | MOVOA X5, 16(R13) | ||
10391 | ADDQ $0x20, R13 | ||
10392 | ADDQ $0x20, R9 | ||
10393 | ADDQ $0x20, R12 | ||
10394 | DECQ R10 | ||
10395 | JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back | ||
10396 | |||
10397 | emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: | ||
10398 | MOVOU -32(R8)(R12*1), X4 | ||
10399 | MOVOU -16(R8)(R12*1), X5 | ||
10400 | MOVOA X4, -32(AX)(R12*1) | ||
10401 | MOVOA X5, -16(AX)(R12*1) | ||
10402 | ADDQ $0x20, R12 | ||
10403 | CMPQ DI, R12 | ||
10404 | JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
10405 | MOVOU X0, (AX) | ||
10406 | MOVOU X1, 16(AX) | ||
10407 | MOVOU X2, -32(AX)(DI*1) | ||
10408 | MOVOU X3, -16(AX)(DI*1) | ||
10409 | MOVQ BX, AX | ||
10410 | |||
10411 | emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: | ||
10412 | ADDL R11, CX | ||
10413 | ADDL $0x04, R11 | ||
10414 | MOVL CX, 12(SP) | ||
10415 | |||
10416 | // emitRepeat | ||
10417 | MOVL R11, BX | ||
10418 | LEAL -4(R11), R11 | ||
10419 | CMPL BX, $0x08 | ||
10420 | JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B | ||
10421 | CMPL BX, $0x0c | ||
10422 | JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B | ||
10423 | |||
10424 | cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: | ||
10425 | CMPL R11, $0x00000104 | ||
10426 | JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B | ||
10427 | LEAL -256(R11), R11 | ||
10428 | MOVW $0x0019, (AX) | ||
10429 | MOVW R11, 2(AX) | ||
10430 | ADDQ $0x04, AX | ||
10431 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10432 | |||
10433 | repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: | ||
10434 | LEAL -4(R11), R11 | ||
10435 | MOVW $0x0015, (AX) | ||
10436 | MOVB R11, 2(AX) | ||
10437 | ADDQ $0x03, AX | ||
10438 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10439 | |||
10440 | repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: | ||
10441 | SHLL $0x02, R11 | ||
10442 | ORL $0x01, R11 | ||
10443 | MOVW R11, (AX) | ||
10444 | ADDQ $0x02, AX | ||
10445 | JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B | ||
10446 | XORQ BX, BX | ||
10447 | LEAL 1(BX)(R11*4), R11 | ||
10448 | MOVB DI, 1(AX) | ||
10449 | SARL $0x08, DI | ||
10450 | SHLL $0x05, DI | ||
10451 | ORL DI, R11 | ||
10452 | MOVB R11, (AX) | ||
10453 | ADDQ $0x02, AX | ||
10454 | |||
10455 | match_nolit_emitcopy_end_encodeBetterBlockAsm8B: | ||
10456 | CMPL CX, 8(SP) | ||
10457 | JAE emit_remainder_encodeBetterBlockAsm8B | ||
10458 | CMPQ AX, (SP) | ||
10459 | JB match_nolit_dst_ok_encodeBetterBlockAsm8B | ||
10460 | MOVQ $0x00000000, ret+48(FP) | ||
10461 | RET | ||
10462 | |||
10463 | match_nolit_dst_ok_encodeBetterBlockAsm8B: | ||
10464 | MOVQ $0x0000cf1bbcdcbf9b, BX | ||
10465 | MOVQ $0x9e3779b1, DI | ||
10466 | LEAQ 1(SI), SI | ||
10467 | LEAQ -2(CX), R8 | ||
10468 | MOVQ (DX)(SI*1), R9 | ||
10469 | MOVQ 1(DX)(SI*1), R10 | ||
10470 | MOVQ (DX)(R8*1), R11 | ||
10471 | MOVQ 1(DX)(R8*1), R12 | ||
10472 | SHLQ $0x10, R9 | ||
10473 | IMULQ BX, R9 | ||
10474 | SHRQ $0x36, R9 | ||
10475 | SHLQ $0x20, R10 | ||
10476 | IMULQ DI, R10 | ||
10477 | SHRQ $0x38, R10 | ||
10478 | SHLQ $0x10, R11 | ||
10479 | IMULQ BX, R11 | ||
10480 | SHRQ $0x36, R11 | ||
10481 | SHLQ $0x20, R12 | ||
10482 | IMULQ DI, R12 | ||
10483 | SHRQ $0x38, R12 | ||
10484 | LEAQ 1(SI), DI | ||
10485 | LEAQ 1(R8), R13 | ||
10486 | MOVL SI, 24(SP)(R9*4) | ||
10487 | MOVL R8, 24(SP)(R11*4) | ||
10488 | MOVL DI, 4120(SP)(R10*4) | ||
10489 | MOVL R13, 4120(SP)(R12*4) | ||
10490 | LEAQ 1(R8)(SI*1), DI | ||
10491 | SHRQ $0x01, DI | ||
10492 | ADDQ $0x01, SI | ||
10493 | SUBQ $0x01, R8 | ||
10494 | |||
10495 | index_loop_encodeBetterBlockAsm8B: | ||
10496 | CMPQ DI, R8 | ||
10497 | JAE search_loop_encodeBetterBlockAsm8B | ||
10498 | MOVQ (DX)(SI*1), R9 | ||
10499 | MOVQ (DX)(DI*1), R10 | ||
10500 | SHLQ $0x10, R9 | ||
10501 | IMULQ BX, R9 | ||
10502 | SHRQ $0x36, R9 | ||
10503 | SHLQ $0x10, R10 | ||
10504 | IMULQ BX, R10 | ||
10505 | SHRQ $0x36, R10 | ||
10506 | MOVL SI, 24(SP)(R9*4) | ||
10507 | MOVL DI, 24(SP)(R10*4) | ||
10508 | ADDQ $0x02, SI | ||
10509 | ADDQ $0x02, DI | ||
10510 | JMP index_loop_encodeBetterBlockAsm8B | ||
10511 | |||
10512 | emit_remainder_encodeBetterBlockAsm8B: | ||
10513 | MOVQ src_len+32(FP), CX | ||
10514 | SUBL 12(SP), CX | ||
10515 | LEAQ 3(AX)(CX*1), CX | ||
10516 | CMPQ CX, (SP) | ||
10517 | JB emit_remainder_ok_encodeBetterBlockAsm8B | ||
10518 | MOVQ $0x00000000, ret+48(FP) | ||
10519 | RET | ||
10520 | |||
10521 | emit_remainder_ok_encodeBetterBlockAsm8B: | ||
10522 | MOVQ src_len+32(FP), CX | ||
10523 | MOVL 12(SP), BX | ||
10524 | CMPL BX, CX | ||
10525 | JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B | ||
10526 | MOVL CX, SI | ||
10527 | MOVL CX, 12(SP) | ||
10528 | LEAQ (DX)(BX*1), CX | ||
10529 | SUBL BX, SI | ||
10530 | LEAL -1(SI), DX | ||
10531 | CMPL DX, $0x3c | ||
10532 | JB one_byte_emit_remainder_encodeBetterBlockAsm8B | ||
10533 | CMPL DX, $0x00000100 | ||
10534 | JB two_bytes_emit_remainder_encodeBetterBlockAsm8B | ||
10535 | JB three_bytes_emit_remainder_encodeBetterBlockAsm8B | ||
10536 | |||
10537 | three_bytes_emit_remainder_encodeBetterBlockAsm8B: | ||
10538 | MOVB $0xf4, (AX) | ||
10539 | MOVW DX, 1(AX) | ||
10540 | ADDQ $0x03, AX | ||
10541 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B | ||
10542 | |||
10543 | two_bytes_emit_remainder_encodeBetterBlockAsm8B: | ||
10544 | MOVB $0xf0, (AX) | ||
10545 | MOVB DL, 1(AX) | ||
10546 | ADDQ $0x02, AX | ||
10547 | CMPL DX, $0x40 | ||
10548 | JB memmove_emit_remainder_encodeBetterBlockAsm8B | ||
10549 | JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B | ||
10550 | |||
10551 | one_byte_emit_remainder_encodeBetterBlockAsm8B: | ||
10552 | SHLB $0x02, DL | ||
10553 | MOVB DL, (AX) | ||
10554 | ADDQ $0x01, AX | ||
10555 | |||
10556 | memmove_emit_remainder_encodeBetterBlockAsm8B: | ||
10557 | LEAQ (AX)(SI*1), DX | ||
10558 | MOVL SI, BX | ||
10559 | |||
10560 | // genMemMoveShort | ||
10561 | CMPQ BX, $0x03 | ||
10562 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 | ||
10563 | JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 | ||
10564 | CMPQ BX, $0x08 | ||
10565 | JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 | ||
10566 | CMPQ BX, $0x10 | ||
10567 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 | ||
10568 | CMPQ BX, $0x20 | ||
10569 | JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 | ||
10570 | JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 | ||
10571 | |||
10572 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: | ||
10573 | MOVB (CX), SI | ||
10574 | MOVB -1(CX)(BX*1), CL | ||
10575 | MOVB SI, (AX) | ||
10576 | MOVB CL, -1(AX)(BX*1) | ||
10577 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B | ||
10578 | |||
10579 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: | ||
10580 | MOVW (CX), SI | ||
10581 | MOVB 2(CX), CL | ||
10582 | MOVW SI, (AX) | ||
10583 | MOVB CL, 2(AX) | ||
10584 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B | ||
10585 | |||
10586 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: | ||
10587 | MOVL (CX), SI | ||
10588 | MOVL -4(CX)(BX*1), CX | ||
10589 | MOVL SI, (AX) | ||
10590 | MOVL CX, -4(AX)(BX*1) | ||
10591 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B | ||
10592 | |||
10593 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: | ||
10594 | MOVQ (CX), SI | ||
10595 | MOVQ -8(CX)(BX*1), CX | ||
10596 | MOVQ SI, (AX) | ||
10597 | MOVQ CX, -8(AX)(BX*1) | ||
10598 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B | ||
10599 | |||
10600 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: | ||
10601 | MOVOU (CX), X0 | ||
10602 | MOVOU -16(CX)(BX*1), X1 | ||
10603 | MOVOU X0, (AX) | ||
10604 | MOVOU X1, -16(AX)(BX*1) | ||
10605 | JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B | ||
10606 | |||
10607 | emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: | ||
10608 | MOVOU (CX), X0 | ||
10609 | MOVOU 16(CX), X1 | ||
10610 | MOVOU -32(CX)(BX*1), X2 | ||
10611 | MOVOU -16(CX)(BX*1), X3 | ||
10612 | MOVOU X0, (AX) | ||
10613 | MOVOU X1, 16(AX) | ||
10614 | MOVOU X2, -32(AX)(BX*1) | ||
10615 | MOVOU X3, -16(AX)(BX*1) | ||
10616 | |||
10617 | memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: | ||
10618 | MOVQ DX, AX | ||
10619 | JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B | ||
10620 | |||
10621 | memmove_long_emit_remainder_encodeBetterBlockAsm8B: | ||
10622 | LEAQ (AX)(SI*1), DX | ||
10623 | MOVL SI, BX | ||
10624 | |||
10625 | // genMemMoveLong | ||
10626 | MOVOU (CX), X0 | ||
10627 | MOVOU 16(CX), X1 | ||
10628 | MOVOU -32(CX)(BX*1), X2 | ||
10629 | MOVOU -16(CX)(BX*1), X3 | ||
10630 | MOVQ BX, DI | ||
10631 | SHRQ $0x05, DI | ||
10632 | MOVQ AX, SI | ||
10633 | ANDL $0x0000001f, SI | ||
10634 | MOVQ $0x00000040, R8 | ||
10635 | SUBQ SI, R8 | ||
10636 | DECQ DI | ||
10637 | JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
10638 | LEAQ -32(CX)(R8*1), SI | ||
10639 | LEAQ -32(AX)(R8*1), R9 | ||
10640 | |||
10641 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: | ||
10642 | MOVOU (SI), X4 | ||
10643 | MOVOU 16(SI), X5 | ||
10644 | MOVOA X4, (R9) | ||
10645 | MOVOA X5, 16(R9) | ||
10646 | ADDQ $0x20, R9 | ||
10647 | ADDQ $0x20, SI | ||
10648 | ADDQ $0x20, R8 | ||
10649 | DECQ DI | ||
10650 | JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back | ||
10651 | |||
10652 | emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: | ||
10653 | MOVOU -32(CX)(R8*1), X4 | ||
10654 | MOVOU -16(CX)(R8*1), X5 | ||
10655 | MOVOA X4, -32(AX)(R8*1) | ||
10656 | MOVOA X5, -16(AX)(R8*1) | ||
10657 | ADDQ $0x20, R8 | ||
10658 | CMPQ BX, R8 | ||
10659 | JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
10660 | MOVOU X0, (AX) | ||
10661 | MOVOU X1, 16(AX) | ||
10662 | MOVOU X2, -32(AX)(BX*1) | ||
10663 | MOVOU X3, -16(AX)(BX*1) | ||
10664 | MOVQ DX, AX | ||
10665 | |||
10666 | emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: | ||
10667 | MOVQ dst_base+0(FP), CX | ||
10668 | SUBQ CX, AX | ||
10669 | MOVQ AX, ret+48(FP) | ||
10670 | RET | ||
10671 | |||
10672 | // func encodeSnappyBlockAsm(dst []byte, src []byte) int | ||
10673 | // Requires: BMI, SSE2 | ||
10674 | TEXT ·encodeSnappyBlockAsm(SB), $65560-56 | ||
10675 | MOVQ dst_base+0(FP), AX | ||
10676 | MOVQ $0x00000200, CX | ||
10677 | LEAQ 24(SP), DX | ||
10678 | PXOR X0, X0 | ||
10679 | |||
10680 | zero_loop_encodeSnappyBlockAsm: | ||
10681 | MOVOU X0, (DX) | ||
10682 | MOVOU X0, 16(DX) | ||
10683 | MOVOU X0, 32(DX) | ||
10684 | MOVOU X0, 48(DX) | ||
10685 | MOVOU X0, 64(DX) | ||
10686 | MOVOU X0, 80(DX) | ||
10687 | MOVOU X0, 96(DX) | ||
10688 | MOVOU X0, 112(DX) | ||
10689 | ADDQ $0x80, DX | ||
10690 | DECQ CX | ||
10691 | JNZ zero_loop_encodeSnappyBlockAsm | ||
10692 | MOVL $0x00000000, 12(SP) | ||
10693 | MOVQ src_len+32(FP), CX | ||
10694 | LEAQ -9(CX), DX | ||
10695 | LEAQ -8(CX), BX | ||
10696 | MOVL BX, 8(SP) | ||
10697 | SHRQ $0x05, CX | ||
10698 | SUBL CX, DX | ||
10699 | LEAQ (AX)(DX*1), DX | ||
10700 | MOVQ DX, (SP) | ||
10701 | MOVL $0x00000001, CX | ||
10702 | MOVL CX, 16(SP) | ||
10703 | MOVQ src_base+24(FP), DX | ||
10704 | |||
10705 | search_loop_encodeSnappyBlockAsm: | ||
10706 | MOVL CX, BX | ||
10707 | SUBL 12(SP), BX | ||
10708 | SHRL $0x06, BX | ||
10709 | LEAL 4(CX)(BX*1), BX | ||
10710 | CMPL BX, 8(SP) | ||
10711 | JAE emit_remainder_encodeSnappyBlockAsm | ||
10712 | MOVQ (DX)(CX*1), SI | ||
10713 | MOVL BX, 20(SP) | ||
10714 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
10715 | MOVQ SI, R9 | ||
10716 | MOVQ SI, R10 | ||
10717 | SHRQ $0x08, R10 | ||
10718 | SHLQ $0x10, R9 | ||
10719 | IMULQ R8, R9 | ||
10720 | SHRQ $0x32, R9 | ||
10721 | SHLQ $0x10, R10 | ||
10722 | IMULQ R8, R10 | ||
10723 | SHRQ $0x32, R10 | ||
10724 | MOVL 24(SP)(R9*4), BX | ||
10725 | MOVL 24(SP)(R10*4), DI | ||
10726 | MOVL CX, 24(SP)(R9*4) | ||
10727 | LEAL 1(CX), R9 | ||
10728 | MOVL R9, 24(SP)(R10*4) | ||
10729 | MOVQ SI, R9 | ||
10730 | SHRQ $0x10, R9 | ||
10731 | SHLQ $0x10, R9 | ||
10732 | IMULQ R8, R9 | ||
10733 | SHRQ $0x32, R9 | ||
10734 | MOVL CX, R8 | ||
10735 | SUBL 16(SP), R8 | ||
10736 | MOVL 1(DX)(R8*1), R10 | ||
10737 | MOVQ SI, R8 | ||
10738 | SHRQ $0x08, R8 | ||
10739 | CMPL R8, R10 | ||
10740 | JNE no_repeat_found_encodeSnappyBlockAsm | ||
10741 | LEAL 1(CX), SI | ||
10742 | MOVL 12(SP), BX | ||
10743 | MOVL SI, DI | ||
10744 | SUBL 16(SP), DI | ||
10745 | JZ repeat_extend_back_end_encodeSnappyBlockAsm | ||
10746 | |||
10747 | repeat_extend_back_loop_encodeSnappyBlockAsm: | ||
10748 | CMPL SI, BX | ||
10749 | JBE repeat_extend_back_end_encodeSnappyBlockAsm | ||
10750 | MOVB -1(DX)(DI*1), R8 | ||
10751 | MOVB -1(DX)(SI*1), R9 | ||
10752 | CMPB R8, R9 | ||
10753 | JNE repeat_extend_back_end_encodeSnappyBlockAsm | ||
10754 | LEAL -1(SI), SI | ||
10755 | DECL DI | ||
10756 | JNZ repeat_extend_back_loop_encodeSnappyBlockAsm | ||
10757 | |||
10758 | repeat_extend_back_end_encodeSnappyBlockAsm: | ||
10759 | MOVL 12(SP), BX | ||
10760 | CMPL BX, SI | ||
10761 | JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm | ||
10762 | MOVL SI, DI | ||
10763 | MOVL SI, 12(SP) | ||
10764 | LEAQ (DX)(BX*1), R8 | ||
10765 | SUBL BX, DI | ||
10766 | LEAL -1(DI), BX | ||
10767 | CMPL BX, $0x3c | ||
10768 | JB one_byte_repeat_emit_encodeSnappyBlockAsm | ||
10769 | CMPL BX, $0x00000100 | ||
10770 | JB two_bytes_repeat_emit_encodeSnappyBlockAsm | ||
10771 | CMPL BX, $0x00010000 | ||
10772 | JB three_bytes_repeat_emit_encodeSnappyBlockAsm | ||
10773 | CMPL BX, $0x01000000 | ||
10774 | JB four_bytes_repeat_emit_encodeSnappyBlockAsm | ||
10775 | MOVB $0xfc, (AX) | ||
10776 | MOVL BX, 1(AX) | ||
10777 | ADDQ $0x05, AX | ||
10778 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm | ||
10779 | |||
10780 | four_bytes_repeat_emit_encodeSnappyBlockAsm: | ||
10781 | MOVL BX, R9 | ||
10782 | SHRL $0x10, R9 | ||
10783 | MOVB $0xf8, (AX) | ||
10784 | MOVW BX, 1(AX) | ||
10785 | MOVB R9, 3(AX) | ||
10786 | ADDQ $0x04, AX | ||
10787 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm | ||
10788 | |||
10789 | three_bytes_repeat_emit_encodeSnappyBlockAsm: | ||
10790 | MOVB $0xf4, (AX) | ||
10791 | MOVW BX, 1(AX) | ||
10792 | ADDQ $0x03, AX | ||
10793 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm | ||
10794 | |||
10795 | two_bytes_repeat_emit_encodeSnappyBlockAsm: | ||
10796 | MOVB $0xf0, (AX) | ||
10797 | MOVB BL, 1(AX) | ||
10798 | ADDQ $0x02, AX | ||
10799 | CMPL BX, $0x40 | ||
10800 | JB memmove_repeat_emit_encodeSnappyBlockAsm | ||
10801 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm | ||
10802 | |||
10803 | one_byte_repeat_emit_encodeSnappyBlockAsm: | ||
10804 | SHLB $0x02, BL | ||
10805 | MOVB BL, (AX) | ||
10806 | ADDQ $0x01, AX | ||
10807 | |||
10808 | memmove_repeat_emit_encodeSnappyBlockAsm: | ||
10809 | LEAQ (AX)(DI*1), BX | ||
10810 | |||
10811 | // genMemMoveShort | ||
10812 | CMPQ DI, $0x08 | ||
10813 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 | ||
10814 | CMPQ DI, $0x10 | ||
10815 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 | ||
10816 | CMPQ DI, $0x20 | ||
10817 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 | ||
10818 | JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 | ||
10819 | |||
10820 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: | ||
10821 | MOVQ (R8), R9 | ||
10822 | MOVQ R9, (AX) | ||
10823 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm | ||
10824 | |||
10825 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: | ||
10826 | MOVQ (R8), R9 | ||
10827 | MOVQ -8(R8)(DI*1), R8 | ||
10828 | MOVQ R9, (AX) | ||
10829 | MOVQ R8, -8(AX)(DI*1) | ||
10830 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm | ||
10831 | |||
10832 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: | ||
10833 | MOVOU (R8), X0 | ||
10834 | MOVOU -16(R8)(DI*1), X1 | ||
10835 | MOVOU X0, (AX) | ||
10836 | MOVOU X1, -16(AX)(DI*1) | ||
10837 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm | ||
10838 | |||
10839 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: | ||
10840 | MOVOU (R8), X0 | ||
10841 | MOVOU 16(R8), X1 | ||
10842 | MOVOU -32(R8)(DI*1), X2 | ||
10843 | MOVOU -16(R8)(DI*1), X3 | ||
10844 | MOVOU X0, (AX) | ||
10845 | MOVOU X1, 16(AX) | ||
10846 | MOVOU X2, -32(AX)(DI*1) | ||
10847 | MOVOU X3, -16(AX)(DI*1) | ||
10848 | |||
10849 | memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: | ||
10850 | MOVQ BX, AX | ||
10851 | JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm | ||
10852 | |||
10853 | memmove_long_repeat_emit_encodeSnappyBlockAsm: | ||
10854 | LEAQ (AX)(DI*1), BX | ||
10855 | |||
10856 | // genMemMoveLong | ||
10857 | MOVOU (R8), X0 | ||
10858 | MOVOU 16(R8), X1 | ||
10859 | MOVOU -32(R8)(DI*1), X2 | ||
10860 | MOVOU -16(R8)(DI*1), X3 | ||
10861 | MOVQ DI, R10 | ||
10862 | SHRQ $0x05, R10 | ||
10863 | MOVQ AX, R9 | ||
10864 | ANDL $0x0000001f, R9 | ||
10865 | MOVQ $0x00000040, R11 | ||
10866 | SUBQ R9, R11 | ||
10867 | DECQ R10 | ||
10868 | JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 | ||
10869 | LEAQ -32(R8)(R11*1), R9 | ||
10870 | LEAQ -32(AX)(R11*1), R12 | ||
10871 | |||
10872 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: | ||
10873 | MOVOU (R9), X4 | ||
10874 | MOVOU 16(R9), X5 | ||
10875 | MOVOA X4, (R12) | ||
10876 | MOVOA X5, 16(R12) | ||
10877 | ADDQ $0x20, R12 | ||
10878 | ADDQ $0x20, R9 | ||
10879 | ADDQ $0x20, R11 | ||
10880 | DECQ R10 | ||
10881 | JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back | ||
10882 | |||
10883 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: | ||
10884 | MOVOU -32(R8)(R11*1), X4 | ||
10885 | MOVOU -16(R8)(R11*1), X5 | ||
10886 | MOVOA X4, -32(AX)(R11*1) | ||
10887 | MOVOA X5, -16(AX)(R11*1) | ||
10888 | ADDQ $0x20, R11 | ||
10889 | CMPQ DI, R11 | ||
10890 | JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 | ||
10891 | MOVOU X0, (AX) | ||
10892 | MOVOU X1, 16(AX) | ||
10893 | MOVOU X2, -32(AX)(DI*1) | ||
10894 | MOVOU X3, -16(AX)(DI*1) | ||
10895 | MOVQ BX, AX | ||
10896 | |||
10897 | emit_literal_done_repeat_emit_encodeSnappyBlockAsm: | ||
10898 | ADDL $0x05, CX | ||
10899 | MOVL CX, BX | ||
10900 | SUBL 16(SP), BX | ||
10901 | MOVQ src_len+32(FP), DI | ||
10902 | SUBL CX, DI | ||
10903 | LEAQ (DX)(CX*1), R8 | ||
10904 | LEAQ (DX)(BX*1), BX | ||
10905 | |||
10906 | // matchLen | ||
10907 | XORL R10, R10 | ||
10908 | |||
10909 | matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm: | ||
10910 | CMPL DI, $0x10 | ||
10911 | JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm | ||
10912 | MOVQ (R8)(R10*1), R9 | ||
10913 | MOVQ 8(R8)(R10*1), R11 | ||
10914 | XORQ (BX)(R10*1), R9 | ||
10915 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm | ||
10916 | XORQ 8(BX)(R10*1), R11 | ||
10917 | JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm | ||
10918 | LEAL -16(DI), DI | ||
10919 | LEAL 16(R10), R10 | ||
10920 | JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm | ||
10921 | |||
10922 | matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm: | ||
10923 | #ifdef GOAMD64_v3 | ||
10924 | TZCNTQ R11, R11 | ||
10925 | |||
10926 | #else | ||
10927 | BSFQ R11, R11 | ||
10928 | |||
10929 | #endif | ||
10930 | SARQ $0x03, R11 | ||
10931 | LEAL 8(R10)(R11*1), R10 | ||
10932 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm | ||
10933 | |||
10934 | matchlen_match8_repeat_extend_encodeSnappyBlockAsm: | ||
10935 | CMPL DI, $0x08 | ||
10936 | JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm | ||
10937 | MOVQ (R8)(R10*1), R9 | ||
10938 | XORQ (BX)(R10*1), R9 | ||
10939 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm | ||
10940 | LEAL -8(DI), DI | ||
10941 | LEAL 8(R10), R10 | ||
10942 | JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm | ||
10943 | |||
10944 | matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm: | ||
10945 | #ifdef GOAMD64_v3 | ||
10946 | TZCNTQ R9, R9 | ||
10947 | |||
10948 | #else | ||
10949 | BSFQ R9, R9 | ||
10950 | |||
10951 | #endif | ||
10952 | SARQ $0x03, R9 | ||
10953 | LEAL (R10)(R9*1), R10 | ||
10954 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm | ||
10955 | |||
10956 | matchlen_match4_repeat_extend_encodeSnappyBlockAsm: | ||
10957 | CMPL DI, $0x04 | ||
10958 | JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm | ||
10959 | MOVL (R8)(R10*1), R9 | ||
10960 | CMPL (BX)(R10*1), R9 | ||
10961 | JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm | ||
10962 | LEAL -4(DI), DI | ||
10963 | LEAL 4(R10), R10 | ||
10964 | |||
10965 | matchlen_match2_repeat_extend_encodeSnappyBlockAsm: | ||
10966 | CMPL DI, $0x01 | ||
10967 | JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm | ||
10968 | JB repeat_extend_forward_end_encodeSnappyBlockAsm | ||
10969 | MOVW (R8)(R10*1), R9 | ||
10970 | CMPW (BX)(R10*1), R9 | ||
10971 | JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm | ||
10972 | LEAL 2(R10), R10 | ||
10973 | SUBL $0x02, DI | ||
10974 | JZ repeat_extend_forward_end_encodeSnappyBlockAsm | ||
10975 | |||
10976 | matchlen_match1_repeat_extend_encodeSnappyBlockAsm: | ||
10977 | MOVB (R8)(R10*1), R9 | ||
10978 | CMPB (BX)(R10*1), R9 | ||
10979 | JNE repeat_extend_forward_end_encodeSnappyBlockAsm | ||
10980 | LEAL 1(R10), R10 | ||
10981 | |||
10982 | repeat_extend_forward_end_encodeSnappyBlockAsm: | ||
10983 | ADDL R10, CX | ||
10984 | MOVL CX, BX | ||
10985 | SUBL SI, BX | ||
10986 | MOVL 16(SP), SI | ||
10987 | |||
10988 | // emitCopy | ||
10989 | CMPL SI, $0x00010000 | ||
10990 | JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm | ||
10991 | |||
10992 | four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: | ||
10993 | CMPL BX, $0x40 | ||
10994 | JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm | ||
10995 | MOVB $0xff, (AX) | ||
10996 | MOVL SI, 1(AX) | ||
10997 | LEAL -64(BX), BX | ||
10998 | ADDQ $0x05, AX | ||
10999 | CMPL BX, $0x04 | ||
11000 | JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm | ||
11001 | JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm | ||
11002 | |||
11003 | four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: | ||
11004 | TESTL BX, BX | ||
11005 | JZ repeat_end_emit_encodeSnappyBlockAsm | ||
11006 | XORL DI, DI | ||
11007 | LEAL -1(DI)(BX*4), BX | ||
11008 | MOVB BL, (AX) | ||
11009 | MOVL SI, 1(AX) | ||
11010 | ADDQ $0x05, AX | ||
11011 | JMP repeat_end_emit_encodeSnappyBlockAsm | ||
11012 | |||
11013 | two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: | ||
11014 | CMPL BX, $0x40 | ||
11015 | JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm | ||
11016 | MOVB $0xee, (AX) | ||
11017 | MOVW SI, 1(AX) | ||
11018 | LEAL -60(BX), BX | ||
11019 | ADDQ $0x03, AX | ||
11020 | JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm | ||
11021 | |||
11022 | two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: | ||
11023 | MOVL BX, DI | ||
11024 | SHLL $0x02, DI | ||
11025 | CMPL BX, $0x0c | ||
11026 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm | ||
11027 | CMPL SI, $0x00000800 | ||
11028 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm | ||
11029 | LEAL -15(DI), DI | ||
11030 | MOVB SI, 1(AX) | ||
11031 | SHRL $0x08, SI | ||
11032 | SHLL $0x05, SI | ||
11033 | ORL SI, DI | ||
11034 | MOVB DI, (AX) | ||
11035 | ADDQ $0x02, AX | ||
11036 | JMP repeat_end_emit_encodeSnappyBlockAsm | ||
11037 | |||
11038 | emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: | ||
11039 | LEAL -2(DI), DI | ||
11040 | MOVB DI, (AX) | ||
11041 | MOVW SI, 1(AX) | ||
11042 | ADDQ $0x03, AX | ||
11043 | |||
11044 | repeat_end_emit_encodeSnappyBlockAsm: | ||
11045 | MOVL CX, 12(SP) | ||
11046 | JMP search_loop_encodeSnappyBlockAsm | ||
11047 | |||
11048 | no_repeat_found_encodeSnappyBlockAsm: | ||
11049 | CMPL (DX)(BX*1), SI | ||
11050 | JEQ candidate_match_encodeSnappyBlockAsm | ||
11051 | SHRQ $0x08, SI | ||
11052 | MOVL 24(SP)(R9*4), BX | ||
11053 | LEAL 2(CX), R8 | ||
11054 | CMPL (DX)(DI*1), SI | ||
11055 | JEQ candidate2_match_encodeSnappyBlockAsm | ||
11056 | MOVL R8, 24(SP)(R9*4) | ||
11057 | SHRQ $0x08, SI | ||
11058 | CMPL (DX)(BX*1), SI | ||
11059 | JEQ candidate3_match_encodeSnappyBlockAsm | ||
11060 | MOVL 20(SP), CX | ||
11061 | JMP search_loop_encodeSnappyBlockAsm | ||
11062 | |||
11063 | candidate3_match_encodeSnappyBlockAsm: | ||
11064 | ADDL $0x02, CX | ||
11065 | JMP candidate_match_encodeSnappyBlockAsm | ||
11066 | |||
11067 | candidate2_match_encodeSnappyBlockAsm: | ||
11068 | MOVL R8, 24(SP)(R9*4) | ||
11069 | INCL CX | ||
11070 | MOVL DI, BX | ||
11071 | |||
11072 | candidate_match_encodeSnappyBlockAsm: | ||
11073 | MOVL 12(SP), SI | ||
11074 | TESTL BX, BX | ||
11075 | JZ match_extend_back_end_encodeSnappyBlockAsm | ||
11076 | |||
11077 | match_extend_back_loop_encodeSnappyBlockAsm: | ||
11078 | CMPL CX, SI | ||
11079 | JBE match_extend_back_end_encodeSnappyBlockAsm | ||
11080 | MOVB -1(DX)(BX*1), DI | ||
11081 | MOVB -1(DX)(CX*1), R8 | ||
11082 | CMPB DI, R8 | ||
11083 | JNE match_extend_back_end_encodeSnappyBlockAsm | ||
11084 | LEAL -1(CX), CX | ||
11085 | DECL BX | ||
11086 | JZ match_extend_back_end_encodeSnappyBlockAsm | ||
11087 | JMP match_extend_back_loop_encodeSnappyBlockAsm | ||
11088 | |||
11089 | match_extend_back_end_encodeSnappyBlockAsm: | ||
11090 | MOVL CX, SI | ||
11091 | SUBL 12(SP), SI | ||
11092 | LEAQ 5(AX)(SI*1), SI | ||
11093 | CMPQ SI, (SP) | ||
11094 | JB match_dst_size_check_encodeSnappyBlockAsm | ||
11095 | MOVQ $0x00000000, ret+48(FP) | ||
11096 | RET | ||
11097 | |||
11098 | match_dst_size_check_encodeSnappyBlockAsm: | ||
11099 | MOVL CX, SI | ||
11100 | MOVL 12(SP), DI | ||
11101 | CMPL DI, SI | ||
11102 | JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm | ||
11103 | MOVL SI, R8 | ||
11104 | MOVL SI, 12(SP) | ||
11105 | LEAQ (DX)(DI*1), SI | ||
11106 | SUBL DI, R8 | ||
11107 | LEAL -1(R8), DI | ||
11108 | CMPL DI, $0x3c | ||
11109 | JB one_byte_match_emit_encodeSnappyBlockAsm | ||
11110 | CMPL DI, $0x00000100 | ||
11111 | JB two_bytes_match_emit_encodeSnappyBlockAsm | ||
11112 | CMPL DI, $0x00010000 | ||
11113 | JB three_bytes_match_emit_encodeSnappyBlockAsm | ||
11114 | CMPL DI, $0x01000000 | ||
11115 | JB four_bytes_match_emit_encodeSnappyBlockAsm | ||
11116 | MOVB $0xfc, (AX) | ||
11117 | MOVL DI, 1(AX) | ||
11118 | ADDQ $0x05, AX | ||
11119 | JMP memmove_long_match_emit_encodeSnappyBlockAsm | ||
11120 | |||
11121 | four_bytes_match_emit_encodeSnappyBlockAsm: | ||
11122 | MOVL DI, R9 | ||
11123 | SHRL $0x10, R9 | ||
11124 | MOVB $0xf8, (AX) | ||
11125 | MOVW DI, 1(AX) | ||
11126 | MOVB R9, 3(AX) | ||
11127 | ADDQ $0x04, AX | ||
11128 | JMP memmove_long_match_emit_encodeSnappyBlockAsm | ||
11129 | |||
11130 | three_bytes_match_emit_encodeSnappyBlockAsm: | ||
11131 | MOVB $0xf4, (AX) | ||
11132 | MOVW DI, 1(AX) | ||
11133 | ADDQ $0x03, AX | ||
11134 | JMP memmove_long_match_emit_encodeSnappyBlockAsm | ||
11135 | |||
11136 | two_bytes_match_emit_encodeSnappyBlockAsm: | ||
11137 | MOVB $0xf0, (AX) | ||
11138 | MOVB DI, 1(AX) | ||
11139 | ADDQ $0x02, AX | ||
11140 | CMPL DI, $0x40 | ||
11141 | JB memmove_match_emit_encodeSnappyBlockAsm | ||
11142 | JMP memmove_long_match_emit_encodeSnappyBlockAsm | ||
11143 | |||
11144 | one_byte_match_emit_encodeSnappyBlockAsm: | ||
11145 | SHLB $0x02, DI | ||
11146 | MOVB DI, (AX) | ||
11147 | ADDQ $0x01, AX | ||
11148 | |||
11149 | memmove_match_emit_encodeSnappyBlockAsm: | ||
11150 | LEAQ (AX)(R8*1), DI | ||
11151 | |||
11152 | // genMemMoveShort | ||
11153 | CMPQ R8, $0x08 | ||
11154 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 | ||
11155 | CMPQ R8, $0x10 | ||
11156 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 | ||
11157 | CMPQ R8, $0x20 | ||
11158 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 | ||
11159 | JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 | ||
11160 | |||
11161 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: | ||
11162 | MOVQ (SI), R9 | ||
11163 | MOVQ R9, (AX) | ||
11164 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm | ||
11165 | |||
11166 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: | ||
11167 | MOVQ (SI), R9 | ||
11168 | MOVQ -8(SI)(R8*1), SI | ||
11169 | MOVQ R9, (AX) | ||
11170 | MOVQ SI, -8(AX)(R8*1) | ||
11171 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm | ||
11172 | |||
11173 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: | ||
11174 | MOVOU (SI), X0 | ||
11175 | MOVOU -16(SI)(R8*1), X1 | ||
11176 | MOVOU X0, (AX) | ||
11177 | MOVOU X1, -16(AX)(R8*1) | ||
11178 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm | ||
11179 | |||
11180 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: | ||
11181 | MOVOU (SI), X0 | ||
11182 | MOVOU 16(SI), X1 | ||
11183 | MOVOU -32(SI)(R8*1), X2 | ||
11184 | MOVOU -16(SI)(R8*1), X3 | ||
11185 | MOVOU X0, (AX) | ||
11186 | MOVOU X1, 16(AX) | ||
11187 | MOVOU X2, -32(AX)(R8*1) | ||
11188 | MOVOU X3, -16(AX)(R8*1) | ||
11189 | |||
11190 | memmove_end_copy_match_emit_encodeSnappyBlockAsm: | ||
11191 | MOVQ DI, AX | ||
11192 | JMP emit_literal_done_match_emit_encodeSnappyBlockAsm | ||
11193 | |||
11194 | memmove_long_match_emit_encodeSnappyBlockAsm: | ||
11195 | LEAQ (AX)(R8*1), DI | ||
11196 | |||
11197 | // genMemMoveLong | ||
11198 | MOVOU (SI), X0 | ||
11199 | MOVOU 16(SI), X1 | ||
11200 | MOVOU -32(SI)(R8*1), X2 | ||
11201 | MOVOU -16(SI)(R8*1), X3 | ||
11202 | MOVQ R8, R10 | ||
11203 | SHRQ $0x05, R10 | ||
11204 | MOVQ AX, R9 | ||
11205 | ANDL $0x0000001f, R9 | ||
11206 | MOVQ $0x00000040, R11 | ||
11207 | SUBQ R9, R11 | ||
11208 | DECQ R10 | ||
11209 | JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 | ||
11210 | LEAQ -32(SI)(R11*1), R9 | ||
11211 | LEAQ -32(AX)(R11*1), R12 | ||
11212 | |||
11213 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: | ||
11214 | MOVOU (R9), X4 | ||
11215 | MOVOU 16(R9), X5 | ||
11216 | MOVOA X4, (R12) | ||
11217 | MOVOA X5, 16(R12) | ||
11218 | ADDQ $0x20, R12 | ||
11219 | ADDQ $0x20, R9 | ||
11220 | ADDQ $0x20, R11 | ||
11221 | DECQ R10 | ||
11222 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back | ||
11223 | |||
11224 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: | ||
11225 | MOVOU -32(SI)(R11*1), X4 | ||
11226 | MOVOU -16(SI)(R11*1), X5 | ||
11227 | MOVOA X4, -32(AX)(R11*1) | ||
11228 | MOVOA X5, -16(AX)(R11*1) | ||
11229 | ADDQ $0x20, R11 | ||
11230 | CMPQ R8, R11 | ||
11231 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 | ||
11232 | MOVOU X0, (AX) | ||
11233 | MOVOU X1, 16(AX) | ||
11234 | MOVOU X2, -32(AX)(R8*1) | ||
11235 | MOVOU X3, -16(AX)(R8*1) | ||
11236 | MOVQ DI, AX | ||
11237 | |||
11238 | emit_literal_done_match_emit_encodeSnappyBlockAsm: | ||
11239 | match_nolit_loop_encodeSnappyBlockAsm: | ||
11240 | MOVL CX, SI | ||
11241 | SUBL BX, SI | ||
11242 | MOVL SI, 16(SP) | ||
11243 | ADDL $0x04, CX | ||
11244 | ADDL $0x04, BX | ||
11245 | MOVQ src_len+32(FP), SI | ||
11246 | SUBL CX, SI | ||
11247 | LEAQ (DX)(CX*1), DI | ||
11248 | LEAQ (DX)(BX*1), BX | ||
11249 | |||
11250 | // matchLen | ||
11251 | XORL R9, R9 | ||
11252 | |||
11253 | matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm: | ||
11254 | CMPL SI, $0x10 | ||
11255 | JB matchlen_match8_match_nolit_encodeSnappyBlockAsm | ||
11256 | MOVQ (DI)(R9*1), R8 | ||
11257 | MOVQ 8(DI)(R9*1), R10 | ||
11258 | XORQ (BX)(R9*1), R8 | ||
11259 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm | ||
11260 | XORQ 8(BX)(R9*1), R10 | ||
11261 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm | ||
11262 | LEAL -16(SI), SI | ||
11263 | LEAL 16(R9), R9 | ||
11264 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm | ||
11265 | |||
11266 | matchlen_bsf_16match_nolit_encodeSnappyBlockAsm: | ||
11267 | #ifdef GOAMD64_v3 | ||
11268 | TZCNTQ R10, R10 | ||
11269 | |||
11270 | #else | ||
11271 | BSFQ R10, R10 | ||
11272 | |||
11273 | #endif | ||
11274 | SARQ $0x03, R10 | ||
11275 | LEAL 8(R9)(R10*1), R9 | ||
11276 | JMP match_nolit_end_encodeSnappyBlockAsm | ||
11277 | |||
11278 | matchlen_match8_match_nolit_encodeSnappyBlockAsm: | ||
11279 | CMPL SI, $0x08 | ||
11280 | JB matchlen_match4_match_nolit_encodeSnappyBlockAsm | ||
11281 | MOVQ (DI)(R9*1), R8 | ||
11282 | XORQ (BX)(R9*1), R8 | ||
11283 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm | ||
11284 | LEAL -8(SI), SI | ||
11285 | LEAL 8(R9), R9 | ||
11286 | JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm | ||
11287 | |||
11288 | matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm: | ||
11289 | #ifdef GOAMD64_v3 | ||
11290 | TZCNTQ R8, R8 | ||
11291 | |||
11292 | #else | ||
11293 | BSFQ R8, R8 | ||
11294 | |||
11295 | #endif | ||
11296 | SARQ $0x03, R8 | ||
11297 | LEAL (R9)(R8*1), R9 | ||
11298 | JMP match_nolit_end_encodeSnappyBlockAsm | ||
11299 | |||
11300 | matchlen_match4_match_nolit_encodeSnappyBlockAsm: | ||
11301 | CMPL SI, $0x04 | ||
11302 | JB matchlen_match2_match_nolit_encodeSnappyBlockAsm | ||
11303 | MOVL (DI)(R9*1), R8 | ||
11304 | CMPL (BX)(R9*1), R8 | ||
11305 | JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm | ||
11306 | LEAL -4(SI), SI | ||
11307 | LEAL 4(R9), R9 | ||
11308 | |||
11309 | matchlen_match2_match_nolit_encodeSnappyBlockAsm: | ||
11310 | CMPL SI, $0x01 | ||
11311 | JE matchlen_match1_match_nolit_encodeSnappyBlockAsm | ||
11312 | JB match_nolit_end_encodeSnappyBlockAsm | ||
11313 | MOVW (DI)(R9*1), R8 | ||
11314 | CMPW (BX)(R9*1), R8 | ||
11315 | JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm | ||
11316 | LEAL 2(R9), R9 | ||
11317 | SUBL $0x02, SI | ||
11318 | JZ match_nolit_end_encodeSnappyBlockAsm | ||
11319 | |||
11320 | matchlen_match1_match_nolit_encodeSnappyBlockAsm: | ||
11321 | MOVB (DI)(R9*1), R8 | ||
11322 | CMPB (BX)(R9*1), R8 | ||
11323 | JNE match_nolit_end_encodeSnappyBlockAsm | ||
11324 | LEAL 1(R9), R9 | ||
11325 | |||
11326 | match_nolit_end_encodeSnappyBlockAsm: | ||
11327 | ADDL R9, CX | ||
11328 | MOVL 16(SP), BX | ||
11329 | ADDL $0x04, R9 | ||
11330 | MOVL CX, 12(SP) | ||
11331 | |||
11332 | // emitCopy | ||
11333 | CMPL BX, $0x00010000 | ||
11334 | JB two_byte_offset_match_nolit_encodeSnappyBlockAsm | ||
11335 | |||
11336 | four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: | ||
11337 | CMPL R9, $0x40 | ||
11338 | JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm | ||
11339 | MOVB $0xff, (AX) | ||
11340 | MOVL BX, 1(AX) | ||
11341 | LEAL -64(R9), R9 | ||
11342 | ADDQ $0x05, AX | ||
11343 | CMPL R9, $0x04 | ||
11344 | JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm | ||
11345 | JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm | ||
11346 | |||
11347 | four_bytes_remain_match_nolit_encodeSnappyBlockAsm: | ||
11348 | TESTL R9, R9 | ||
11349 | JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm | ||
11350 | XORL SI, SI | ||
11351 | LEAL -1(SI)(R9*4), R9 | ||
11352 | MOVB R9, (AX) | ||
11353 | MOVL BX, 1(AX) | ||
11354 | ADDQ $0x05, AX | ||
11355 | JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm | ||
11356 | |||
11357 | two_byte_offset_match_nolit_encodeSnappyBlockAsm: | ||
11358 | CMPL R9, $0x40 | ||
11359 | JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm | ||
11360 | MOVB $0xee, (AX) | ||
11361 | MOVW BX, 1(AX) | ||
11362 | LEAL -60(R9), R9 | ||
11363 | ADDQ $0x03, AX | ||
11364 | JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm | ||
11365 | |||
11366 | two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: | ||
11367 | MOVL R9, SI | ||
11368 | SHLL $0x02, SI | ||
11369 | CMPL R9, $0x0c | ||
11370 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm | ||
11371 | CMPL BX, $0x00000800 | ||
11372 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm | ||
11373 | LEAL -15(SI), SI | ||
11374 | MOVB BL, 1(AX) | ||
11375 | SHRL $0x08, BX | ||
11376 | SHLL $0x05, BX | ||
11377 | ORL BX, SI | ||
11378 | MOVB SI, (AX) | ||
11379 | ADDQ $0x02, AX | ||
11380 | JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm | ||
11381 | |||
11382 | emit_copy_three_match_nolit_encodeSnappyBlockAsm: | ||
11383 | LEAL -2(SI), SI | ||
11384 | MOVB SI, (AX) | ||
11385 | MOVW BX, 1(AX) | ||
11386 | ADDQ $0x03, AX | ||
11387 | |||
11388 | match_nolit_emitcopy_end_encodeSnappyBlockAsm: | ||
11389 | CMPL CX, 8(SP) | ||
11390 | JAE emit_remainder_encodeSnappyBlockAsm | ||
11391 | MOVQ -2(DX)(CX*1), SI | ||
11392 | CMPQ AX, (SP) | ||
11393 | JB match_nolit_dst_ok_encodeSnappyBlockAsm | ||
11394 | MOVQ $0x00000000, ret+48(FP) | ||
11395 | RET | ||
11396 | |||
11397 | match_nolit_dst_ok_encodeSnappyBlockAsm: | ||
11398 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
11399 | MOVQ SI, DI | ||
11400 | SHRQ $0x10, SI | ||
11401 | MOVQ SI, BX | ||
11402 | SHLQ $0x10, DI | ||
11403 | IMULQ R8, DI | ||
11404 | SHRQ $0x32, DI | ||
11405 | SHLQ $0x10, BX | ||
11406 | IMULQ R8, BX | ||
11407 | SHRQ $0x32, BX | ||
11408 | LEAL -2(CX), R8 | ||
11409 | LEAQ 24(SP)(BX*4), R9 | ||
11410 | MOVL (R9), BX | ||
11411 | MOVL R8, 24(SP)(DI*4) | ||
11412 | MOVL CX, (R9) | ||
11413 | CMPL (DX)(BX*1), SI | ||
11414 | JEQ match_nolit_loop_encodeSnappyBlockAsm | ||
11415 | INCL CX | ||
11416 | JMP search_loop_encodeSnappyBlockAsm | ||
11417 | |||
11418 | emit_remainder_encodeSnappyBlockAsm: | ||
11419 | MOVQ src_len+32(FP), CX | ||
11420 | SUBL 12(SP), CX | ||
11421 | LEAQ 5(AX)(CX*1), CX | ||
11422 | CMPQ CX, (SP) | ||
11423 | JB emit_remainder_ok_encodeSnappyBlockAsm | ||
11424 | MOVQ $0x00000000, ret+48(FP) | ||
11425 | RET | ||
11426 | |||
11427 | emit_remainder_ok_encodeSnappyBlockAsm: | ||
11428 | MOVQ src_len+32(FP), CX | ||
11429 | MOVL 12(SP), BX | ||
11430 | CMPL BX, CX | ||
11431 | JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm | ||
11432 | MOVL CX, SI | ||
11433 | MOVL CX, 12(SP) | ||
11434 | LEAQ (DX)(BX*1), CX | ||
11435 | SUBL BX, SI | ||
11436 | LEAL -1(SI), DX | ||
11437 | CMPL DX, $0x3c | ||
11438 | JB one_byte_emit_remainder_encodeSnappyBlockAsm | ||
11439 | CMPL DX, $0x00000100 | ||
11440 | JB two_bytes_emit_remainder_encodeSnappyBlockAsm | ||
11441 | CMPL DX, $0x00010000 | ||
11442 | JB three_bytes_emit_remainder_encodeSnappyBlockAsm | ||
11443 | CMPL DX, $0x01000000 | ||
11444 | JB four_bytes_emit_remainder_encodeSnappyBlockAsm | ||
11445 | MOVB $0xfc, (AX) | ||
11446 | MOVL DX, 1(AX) | ||
11447 | ADDQ $0x05, AX | ||
11448 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm | ||
11449 | |||
11450 | four_bytes_emit_remainder_encodeSnappyBlockAsm: | ||
11451 | MOVL DX, BX | ||
11452 | SHRL $0x10, BX | ||
11453 | MOVB $0xf8, (AX) | ||
11454 | MOVW DX, 1(AX) | ||
11455 | MOVB BL, 3(AX) | ||
11456 | ADDQ $0x04, AX | ||
11457 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm | ||
11458 | |||
11459 | three_bytes_emit_remainder_encodeSnappyBlockAsm: | ||
11460 | MOVB $0xf4, (AX) | ||
11461 | MOVW DX, 1(AX) | ||
11462 | ADDQ $0x03, AX | ||
11463 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm | ||
11464 | |||
11465 | two_bytes_emit_remainder_encodeSnappyBlockAsm: | ||
11466 | MOVB $0xf0, (AX) | ||
11467 | MOVB DL, 1(AX) | ||
11468 | ADDQ $0x02, AX | ||
11469 | CMPL DX, $0x40 | ||
11470 | JB memmove_emit_remainder_encodeSnappyBlockAsm | ||
11471 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm | ||
11472 | |||
11473 | one_byte_emit_remainder_encodeSnappyBlockAsm: | ||
11474 | SHLB $0x02, DL | ||
11475 | MOVB DL, (AX) | ||
11476 | ADDQ $0x01, AX | ||
11477 | |||
11478 | memmove_emit_remainder_encodeSnappyBlockAsm: | ||
11479 | LEAQ (AX)(SI*1), DX | ||
11480 | MOVL SI, BX | ||
11481 | |||
11482 | // genMemMoveShort | ||
11483 | CMPQ BX, $0x03 | ||
11484 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 | ||
11485 | JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 | ||
11486 | CMPQ BX, $0x08 | ||
11487 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 | ||
11488 | CMPQ BX, $0x10 | ||
11489 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 | ||
11490 | CMPQ BX, $0x20 | ||
11491 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 | ||
11492 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 | ||
11493 | |||
11494 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: | ||
11495 | MOVB (CX), SI | ||
11496 | MOVB -1(CX)(BX*1), CL | ||
11497 | MOVB SI, (AX) | ||
11498 | MOVB CL, -1(AX)(BX*1) | ||
11499 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm | ||
11500 | |||
11501 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: | ||
11502 | MOVW (CX), SI | ||
11503 | MOVB 2(CX), CL | ||
11504 | MOVW SI, (AX) | ||
11505 | MOVB CL, 2(AX) | ||
11506 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm | ||
11507 | |||
11508 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: | ||
11509 | MOVL (CX), SI | ||
11510 | MOVL -4(CX)(BX*1), CX | ||
11511 | MOVL SI, (AX) | ||
11512 | MOVL CX, -4(AX)(BX*1) | ||
11513 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm | ||
11514 | |||
11515 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: | ||
11516 | MOVQ (CX), SI | ||
11517 | MOVQ -8(CX)(BX*1), CX | ||
11518 | MOVQ SI, (AX) | ||
11519 | MOVQ CX, -8(AX)(BX*1) | ||
11520 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm | ||
11521 | |||
11522 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: | ||
11523 | MOVOU (CX), X0 | ||
11524 | MOVOU -16(CX)(BX*1), X1 | ||
11525 | MOVOU X0, (AX) | ||
11526 | MOVOU X1, -16(AX)(BX*1) | ||
11527 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm | ||
11528 | |||
11529 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: | ||
11530 | MOVOU (CX), X0 | ||
11531 | MOVOU 16(CX), X1 | ||
11532 | MOVOU -32(CX)(BX*1), X2 | ||
11533 | MOVOU -16(CX)(BX*1), X3 | ||
11534 | MOVOU X0, (AX) | ||
11535 | MOVOU X1, 16(AX) | ||
11536 | MOVOU X2, -32(AX)(BX*1) | ||
11537 | MOVOU X3, -16(AX)(BX*1) | ||
11538 | |||
11539 | memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: | ||
11540 | MOVQ DX, AX | ||
11541 | JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm | ||
11542 | |||
11543 | memmove_long_emit_remainder_encodeSnappyBlockAsm: | ||
11544 | LEAQ (AX)(SI*1), DX | ||
11545 | MOVL SI, BX | ||
11546 | |||
11547 | // genMemMoveLong | ||
11548 | MOVOU (CX), X0 | ||
11549 | MOVOU 16(CX), X1 | ||
11550 | MOVOU -32(CX)(BX*1), X2 | ||
11551 | MOVOU -16(CX)(BX*1), X3 | ||
11552 | MOVQ BX, DI | ||
11553 | SHRQ $0x05, DI | ||
11554 | MOVQ AX, SI | ||
11555 | ANDL $0x0000001f, SI | ||
11556 | MOVQ $0x00000040, R8 | ||
11557 | SUBQ SI, R8 | ||
11558 | DECQ DI | ||
11559 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 | ||
11560 | LEAQ -32(CX)(R8*1), SI | ||
11561 | LEAQ -32(AX)(R8*1), R9 | ||
11562 | |||
11563 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: | ||
11564 | MOVOU (SI), X4 | ||
11565 | MOVOU 16(SI), X5 | ||
11566 | MOVOA X4, (R9) | ||
11567 | MOVOA X5, 16(R9) | ||
11568 | ADDQ $0x20, R9 | ||
11569 | ADDQ $0x20, SI | ||
11570 | ADDQ $0x20, R8 | ||
11571 | DECQ DI | ||
11572 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back | ||
11573 | |||
11574 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: | ||
11575 | MOVOU -32(CX)(R8*1), X4 | ||
11576 | MOVOU -16(CX)(R8*1), X5 | ||
11577 | MOVOA X4, -32(AX)(R8*1) | ||
11578 | MOVOA X5, -16(AX)(R8*1) | ||
11579 | ADDQ $0x20, R8 | ||
11580 | CMPQ BX, R8 | ||
11581 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 | ||
11582 | MOVOU X0, (AX) | ||
11583 | MOVOU X1, 16(AX) | ||
11584 | MOVOU X2, -32(AX)(BX*1) | ||
11585 | MOVOU X3, -16(AX)(BX*1) | ||
11586 | MOVQ DX, AX | ||
11587 | |||
11588 | emit_literal_done_emit_remainder_encodeSnappyBlockAsm: | ||
11589 | MOVQ dst_base+0(FP), CX | ||
11590 | SUBQ CX, AX | ||
11591 | MOVQ AX, ret+48(FP) | ||
11592 | RET | ||
11593 | |||
11594 | // func encodeSnappyBlockAsm64K(dst []byte, src []byte) int | ||
11595 | // Requires: BMI, SSE2 | ||
11596 | TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 | ||
11597 | MOVQ dst_base+0(FP), AX | ||
11598 | MOVQ $0x00000200, CX | ||
11599 | LEAQ 24(SP), DX | ||
11600 | PXOR X0, X0 | ||
11601 | |||
11602 | zero_loop_encodeSnappyBlockAsm64K: | ||
11603 | MOVOU X0, (DX) | ||
11604 | MOVOU X0, 16(DX) | ||
11605 | MOVOU X0, 32(DX) | ||
11606 | MOVOU X0, 48(DX) | ||
11607 | MOVOU X0, 64(DX) | ||
11608 | MOVOU X0, 80(DX) | ||
11609 | MOVOU X0, 96(DX) | ||
11610 | MOVOU X0, 112(DX) | ||
11611 | ADDQ $0x80, DX | ||
11612 | DECQ CX | ||
11613 | JNZ zero_loop_encodeSnappyBlockAsm64K | ||
11614 | MOVL $0x00000000, 12(SP) | ||
11615 | MOVQ src_len+32(FP), CX | ||
11616 | LEAQ -9(CX), DX | ||
11617 | LEAQ -8(CX), BX | ||
11618 | MOVL BX, 8(SP) | ||
11619 | SHRQ $0x05, CX | ||
11620 | SUBL CX, DX | ||
11621 | LEAQ (AX)(DX*1), DX | ||
11622 | MOVQ DX, (SP) | ||
11623 | MOVL $0x00000001, CX | ||
11624 | MOVL CX, 16(SP) | ||
11625 | MOVQ src_base+24(FP), DX | ||
11626 | |||
11627 | search_loop_encodeSnappyBlockAsm64K: | ||
11628 | MOVL CX, BX | ||
11629 | SUBL 12(SP), BX | ||
11630 | SHRL $0x06, BX | ||
11631 | LEAL 4(CX)(BX*1), BX | ||
11632 | CMPL BX, 8(SP) | ||
11633 | JAE emit_remainder_encodeSnappyBlockAsm64K | ||
11634 | MOVQ (DX)(CX*1), SI | ||
11635 | MOVL BX, 20(SP) | ||
11636 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
11637 | MOVQ SI, R9 | ||
11638 | MOVQ SI, R10 | ||
11639 | SHRQ $0x08, R10 | ||
11640 | SHLQ $0x10, R9 | ||
11641 | IMULQ R8, R9 | ||
11642 | SHRQ $0x32, R9 | ||
11643 | SHLQ $0x10, R10 | ||
11644 | IMULQ R8, R10 | ||
11645 | SHRQ $0x32, R10 | ||
11646 | MOVL 24(SP)(R9*4), BX | ||
11647 | MOVL 24(SP)(R10*4), DI | ||
11648 | MOVL CX, 24(SP)(R9*4) | ||
11649 | LEAL 1(CX), R9 | ||
11650 | MOVL R9, 24(SP)(R10*4) | ||
11651 | MOVQ SI, R9 | ||
11652 | SHRQ $0x10, R9 | ||
11653 | SHLQ $0x10, R9 | ||
11654 | IMULQ R8, R9 | ||
11655 | SHRQ $0x32, R9 | ||
11656 | MOVL CX, R8 | ||
11657 | SUBL 16(SP), R8 | ||
11658 | MOVL 1(DX)(R8*1), R10 | ||
11659 | MOVQ SI, R8 | ||
11660 | SHRQ $0x08, R8 | ||
11661 | CMPL R8, R10 | ||
11662 | JNE no_repeat_found_encodeSnappyBlockAsm64K | ||
11663 | LEAL 1(CX), SI | ||
11664 | MOVL 12(SP), BX | ||
11665 | MOVL SI, DI | ||
11666 | SUBL 16(SP), DI | ||
11667 | JZ repeat_extend_back_end_encodeSnappyBlockAsm64K | ||
11668 | |||
11669 | repeat_extend_back_loop_encodeSnappyBlockAsm64K: | ||
11670 | CMPL SI, BX | ||
11671 | JBE repeat_extend_back_end_encodeSnappyBlockAsm64K | ||
11672 | MOVB -1(DX)(DI*1), R8 | ||
11673 | MOVB -1(DX)(SI*1), R9 | ||
11674 | CMPB R8, R9 | ||
11675 | JNE repeat_extend_back_end_encodeSnappyBlockAsm64K | ||
11676 | LEAL -1(SI), SI | ||
11677 | DECL DI | ||
11678 | JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K | ||
11679 | |||
11680 | repeat_extend_back_end_encodeSnappyBlockAsm64K: | ||
11681 | MOVL 12(SP), BX | ||
11682 | CMPL BX, SI | ||
11683 | JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K | ||
11684 | MOVL SI, DI | ||
11685 | MOVL SI, 12(SP) | ||
11686 | LEAQ (DX)(BX*1), R8 | ||
11687 | SUBL BX, DI | ||
11688 | LEAL -1(DI), BX | ||
11689 | CMPL BX, $0x3c | ||
11690 | JB one_byte_repeat_emit_encodeSnappyBlockAsm64K | ||
11691 | CMPL BX, $0x00000100 | ||
11692 | JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K | ||
11693 | JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K | ||
11694 | |||
11695 | three_bytes_repeat_emit_encodeSnappyBlockAsm64K: | ||
11696 | MOVB $0xf4, (AX) | ||
11697 | MOVW BX, 1(AX) | ||
11698 | ADDQ $0x03, AX | ||
11699 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K | ||
11700 | |||
11701 | two_bytes_repeat_emit_encodeSnappyBlockAsm64K: | ||
11702 | MOVB $0xf0, (AX) | ||
11703 | MOVB BL, 1(AX) | ||
11704 | ADDQ $0x02, AX | ||
11705 | CMPL BX, $0x40 | ||
11706 | JB memmove_repeat_emit_encodeSnappyBlockAsm64K | ||
11707 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K | ||
11708 | |||
11709 | one_byte_repeat_emit_encodeSnappyBlockAsm64K: | ||
11710 | SHLB $0x02, BL | ||
11711 | MOVB BL, (AX) | ||
11712 | ADDQ $0x01, AX | ||
11713 | |||
11714 | memmove_repeat_emit_encodeSnappyBlockAsm64K: | ||
11715 | LEAQ (AX)(DI*1), BX | ||
11716 | |||
11717 | // genMemMoveShort | ||
11718 | CMPQ DI, $0x08 | ||
11719 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 | ||
11720 | CMPQ DI, $0x10 | ||
11721 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 | ||
11722 | CMPQ DI, $0x20 | ||
11723 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 | ||
11724 | JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 | ||
11725 | |||
11726 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: | ||
11727 | MOVQ (R8), R9 | ||
11728 | MOVQ R9, (AX) | ||
11729 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K | ||
11730 | |||
11731 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: | ||
11732 | MOVQ (R8), R9 | ||
11733 | MOVQ -8(R8)(DI*1), R8 | ||
11734 | MOVQ R9, (AX) | ||
11735 | MOVQ R8, -8(AX)(DI*1) | ||
11736 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K | ||
11737 | |||
11738 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: | ||
11739 | MOVOU (R8), X0 | ||
11740 | MOVOU -16(R8)(DI*1), X1 | ||
11741 | MOVOU X0, (AX) | ||
11742 | MOVOU X1, -16(AX)(DI*1) | ||
11743 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K | ||
11744 | |||
11745 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: | ||
11746 | MOVOU (R8), X0 | ||
11747 | MOVOU 16(R8), X1 | ||
11748 | MOVOU -32(R8)(DI*1), X2 | ||
11749 | MOVOU -16(R8)(DI*1), X3 | ||
11750 | MOVOU X0, (AX) | ||
11751 | MOVOU X1, 16(AX) | ||
11752 | MOVOU X2, -32(AX)(DI*1) | ||
11753 | MOVOU X3, -16(AX)(DI*1) | ||
11754 | |||
11755 | memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: | ||
11756 | MOVQ BX, AX | ||
11757 | JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K | ||
11758 | |||
11759 | memmove_long_repeat_emit_encodeSnappyBlockAsm64K: | ||
11760 | LEAQ (AX)(DI*1), BX | ||
11761 | |||
11762 | // genMemMoveLong | ||
11763 | MOVOU (R8), X0 | ||
11764 | MOVOU 16(R8), X1 | ||
11765 | MOVOU -32(R8)(DI*1), X2 | ||
11766 | MOVOU -16(R8)(DI*1), X3 | ||
11767 | MOVQ DI, R10 | ||
11768 | SHRQ $0x05, R10 | ||
11769 | MOVQ AX, R9 | ||
11770 | ANDL $0x0000001f, R9 | ||
11771 | MOVQ $0x00000040, R11 | ||
11772 | SUBQ R9, R11 | ||
11773 | DECQ R10 | ||
11774 | JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 | ||
11775 | LEAQ -32(R8)(R11*1), R9 | ||
11776 | LEAQ -32(AX)(R11*1), R12 | ||
11777 | |||
11778 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: | ||
11779 | MOVOU (R9), X4 | ||
11780 | MOVOU 16(R9), X5 | ||
11781 | MOVOA X4, (R12) | ||
11782 | MOVOA X5, 16(R12) | ||
11783 | ADDQ $0x20, R12 | ||
11784 | ADDQ $0x20, R9 | ||
11785 | ADDQ $0x20, R11 | ||
11786 | DECQ R10 | ||
11787 | JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back | ||
11788 | |||
11789 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: | ||
11790 | MOVOU -32(R8)(R11*1), X4 | ||
11791 | MOVOU -16(R8)(R11*1), X5 | ||
11792 | MOVOA X4, -32(AX)(R11*1) | ||
11793 | MOVOA X5, -16(AX)(R11*1) | ||
11794 | ADDQ $0x20, R11 | ||
11795 | CMPQ DI, R11 | ||
11796 | JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 | ||
11797 | MOVOU X0, (AX) | ||
11798 | MOVOU X1, 16(AX) | ||
11799 | MOVOU X2, -32(AX)(DI*1) | ||
11800 | MOVOU X3, -16(AX)(DI*1) | ||
11801 | MOVQ BX, AX | ||
11802 | |||
11803 | emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: | ||
11804 | ADDL $0x05, CX | ||
11805 | MOVL CX, BX | ||
11806 | SUBL 16(SP), BX | ||
11807 | MOVQ src_len+32(FP), DI | ||
11808 | SUBL CX, DI | ||
11809 | LEAQ (DX)(CX*1), R8 | ||
11810 | LEAQ (DX)(BX*1), BX | ||
11811 | |||
11812 | // matchLen | ||
11813 | XORL R10, R10 | ||
11814 | |||
11815 | matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K: | ||
11816 | CMPL DI, $0x10 | ||
11817 | JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K | ||
11818 | MOVQ (R8)(R10*1), R9 | ||
11819 | MOVQ 8(R8)(R10*1), R11 | ||
11820 | XORQ (BX)(R10*1), R9 | ||
11821 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K | ||
11822 | XORQ 8(BX)(R10*1), R11 | ||
11823 | JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K | ||
11824 | LEAL -16(DI), DI | ||
11825 | LEAL 16(R10), R10 | ||
11826 | JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K | ||
11827 | |||
11828 | matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K: | ||
11829 | #ifdef GOAMD64_v3 | ||
11830 | TZCNTQ R11, R11 | ||
11831 | |||
11832 | #else | ||
11833 | BSFQ R11, R11 | ||
11834 | |||
11835 | #endif | ||
11836 | SARQ $0x03, R11 | ||
11837 | LEAL 8(R10)(R11*1), R10 | ||
11838 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K | ||
11839 | |||
11840 | matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K: | ||
11841 | CMPL DI, $0x08 | ||
11842 | JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K | ||
11843 | MOVQ (R8)(R10*1), R9 | ||
11844 | XORQ (BX)(R10*1), R9 | ||
11845 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K | ||
11846 | LEAL -8(DI), DI | ||
11847 | LEAL 8(R10), R10 | ||
11848 | JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K | ||
11849 | |||
11850 | matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K: | ||
11851 | #ifdef GOAMD64_v3 | ||
11852 | TZCNTQ R9, R9 | ||
11853 | |||
11854 | #else | ||
11855 | BSFQ R9, R9 | ||
11856 | |||
11857 | #endif | ||
11858 | SARQ $0x03, R9 | ||
11859 | LEAL (R10)(R9*1), R10 | ||
11860 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K | ||
11861 | |||
11862 | matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: | ||
11863 | CMPL DI, $0x04 | ||
11864 | JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K | ||
11865 | MOVL (R8)(R10*1), R9 | ||
11866 | CMPL (BX)(R10*1), R9 | ||
11867 | JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K | ||
11868 | LEAL -4(DI), DI | ||
11869 | LEAL 4(R10), R10 | ||
11870 | |||
11871 | matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: | ||
11872 | CMPL DI, $0x01 | ||
11873 | JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K | ||
11874 | JB repeat_extend_forward_end_encodeSnappyBlockAsm64K | ||
11875 | MOVW (R8)(R10*1), R9 | ||
11876 | CMPW (BX)(R10*1), R9 | ||
11877 | JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K | ||
11878 | LEAL 2(R10), R10 | ||
11879 | SUBL $0x02, DI | ||
11880 | JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K | ||
11881 | |||
11882 | matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: | ||
11883 | MOVB (R8)(R10*1), R9 | ||
11884 | CMPB (BX)(R10*1), R9 | ||
11885 | JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K | ||
11886 | LEAL 1(R10), R10 | ||
11887 | |||
11888 | repeat_extend_forward_end_encodeSnappyBlockAsm64K: | ||
11889 | ADDL R10, CX | ||
11890 | MOVL CX, BX | ||
11891 | SUBL SI, BX | ||
11892 | MOVL 16(SP), SI | ||
11893 | |||
11894 | // emitCopy | ||
11895 | two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: | ||
11896 | CMPL BX, $0x40 | ||
11897 | JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K | ||
11898 | MOVB $0xee, (AX) | ||
11899 | MOVW SI, 1(AX) | ||
11900 | LEAL -60(BX), BX | ||
11901 | ADDQ $0x03, AX | ||
11902 | JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K | ||
11903 | |||
11904 | two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: | ||
11905 | MOVL BX, DI | ||
11906 | SHLL $0x02, DI | ||
11907 | CMPL BX, $0x0c | ||
11908 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K | ||
11909 | CMPL SI, $0x00000800 | ||
11910 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K | ||
11911 | LEAL -15(DI), DI | ||
11912 | MOVB SI, 1(AX) | ||
11913 | SHRL $0x08, SI | ||
11914 | SHLL $0x05, SI | ||
11915 | ORL SI, DI | ||
11916 | MOVB DI, (AX) | ||
11917 | ADDQ $0x02, AX | ||
11918 | JMP repeat_end_emit_encodeSnappyBlockAsm64K | ||
11919 | |||
11920 | emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: | ||
11921 | LEAL -2(DI), DI | ||
11922 | MOVB DI, (AX) | ||
11923 | MOVW SI, 1(AX) | ||
11924 | ADDQ $0x03, AX | ||
11925 | |||
11926 | repeat_end_emit_encodeSnappyBlockAsm64K: | ||
11927 | MOVL CX, 12(SP) | ||
11928 | JMP search_loop_encodeSnappyBlockAsm64K | ||
11929 | |||
11930 | no_repeat_found_encodeSnappyBlockAsm64K: | ||
11931 | CMPL (DX)(BX*1), SI | ||
11932 | JEQ candidate_match_encodeSnappyBlockAsm64K | ||
11933 | SHRQ $0x08, SI | ||
11934 | MOVL 24(SP)(R9*4), BX | ||
11935 | LEAL 2(CX), R8 | ||
11936 | CMPL (DX)(DI*1), SI | ||
11937 | JEQ candidate2_match_encodeSnappyBlockAsm64K | ||
11938 | MOVL R8, 24(SP)(R9*4) | ||
11939 | SHRQ $0x08, SI | ||
11940 | CMPL (DX)(BX*1), SI | ||
11941 | JEQ candidate3_match_encodeSnappyBlockAsm64K | ||
11942 | MOVL 20(SP), CX | ||
11943 | JMP search_loop_encodeSnappyBlockAsm64K | ||
11944 | |||
11945 | candidate3_match_encodeSnappyBlockAsm64K: | ||
11946 | ADDL $0x02, CX | ||
11947 | JMP candidate_match_encodeSnappyBlockAsm64K | ||
11948 | |||
11949 | candidate2_match_encodeSnappyBlockAsm64K: | ||
11950 | MOVL R8, 24(SP)(R9*4) | ||
11951 | INCL CX | ||
11952 | MOVL DI, BX | ||
11953 | |||
11954 | candidate_match_encodeSnappyBlockAsm64K: | ||
11955 | MOVL 12(SP), SI | ||
11956 | TESTL BX, BX | ||
11957 | JZ match_extend_back_end_encodeSnappyBlockAsm64K | ||
11958 | |||
11959 | match_extend_back_loop_encodeSnappyBlockAsm64K: | ||
11960 | CMPL CX, SI | ||
11961 | JBE match_extend_back_end_encodeSnappyBlockAsm64K | ||
11962 | MOVB -1(DX)(BX*1), DI | ||
11963 | MOVB -1(DX)(CX*1), R8 | ||
11964 | CMPB DI, R8 | ||
11965 | JNE match_extend_back_end_encodeSnappyBlockAsm64K | ||
11966 | LEAL -1(CX), CX | ||
11967 | DECL BX | ||
11968 | JZ match_extend_back_end_encodeSnappyBlockAsm64K | ||
11969 | JMP match_extend_back_loop_encodeSnappyBlockAsm64K | ||
11970 | |||
11971 | match_extend_back_end_encodeSnappyBlockAsm64K: | ||
11972 | MOVL CX, SI | ||
11973 | SUBL 12(SP), SI | ||
11974 | LEAQ 3(AX)(SI*1), SI | ||
11975 | CMPQ SI, (SP) | ||
11976 | JB match_dst_size_check_encodeSnappyBlockAsm64K | ||
11977 | MOVQ $0x00000000, ret+48(FP) | ||
11978 | RET | ||
11979 | |||
11980 | match_dst_size_check_encodeSnappyBlockAsm64K: | ||
11981 | MOVL CX, SI | ||
11982 | MOVL 12(SP), DI | ||
11983 | CMPL DI, SI | ||
11984 | JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K | ||
11985 | MOVL SI, R8 | ||
11986 | MOVL SI, 12(SP) | ||
11987 | LEAQ (DX)(DI*1), SI | ||
11988 | SUBL DI, R8 | ||
11989 | LEAL -1(R8), DI | ||
11990 | CMPL DI, $0x3c | ||
11991 | JB one_byte_match_emit_encodeSnappyBlockAsm64K | ||
11992 | CMPL DI, $0x00000100 | ||
11993 | JB two_bytes_match_emit_encodeSnappyBlockAsm64K | ||
11994 | JB three_bytes_match_emit_encodeSnappyBlockAsm64K | ||
11995 | |||
11996 | three_bytes_match_emit_encodeSnappyBlockAsm64K: | ||
11997 | MOVB $0xf4, (AX) | ||
11998 | MOVW DI, 1(AX) | ||
11999 | ADDQ $0x03, AX | ||
12000 | JMP memmove_long_match_emit_encodeSnappyBlockAsm64K | ||
12001 | |||
12002 | two_bytes_match_emit_encodeSnappyBlockAsm64K: | ||
12003 | MOVB $0xf0, (AX) | ||
12004 | MOVB DI, 1(AX) | ||
12005 | ADDQ $0x02, AX | ||
12006 | CMPL DI, $0x40 | ||
12007 | JB memmove_match_emit_encodeSnappyBlockAsm64K | ||
12008 | JMP memmove_long_match_emit_encodeSnappyBlockAsm64K | ||
12009 | |||
12010 | one_byte_match_emit_encodeSnappyBlockAsm64K: | ||
12011 | SHLB $0x02, DI | ||
12012 | MOVB DI, (AX) | ||
12013 | ADDQ $0x01, AX | ||
12014 | |||
12015 | memmove_match_emit_encodeSnappyBlockAsm64K: | ||
12016 | LEAQ (AX)(R8*1), DI | ||
12017 | |||
12018 | // genMemMoveShort | ||
12019 | CMPQ R8, $0x08 | ||
12020 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 | ||
12021 | CMPQ R8, $0x10 | ||
12022 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 | ||
12023 | CMPQ R8, $0x20 | ||
12024 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 | ||
12025 | JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 | ||
12026 | |||
12027 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: | ||
12028 | MOVQ (SI), R9 | ||
12029 | MOVQ R9, (AX) | ||
12030 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K | ||
12031 | |||
12032 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: | ||
12033 | MOVQ (SI), R9 | ||
12034 | MOVQ -8(SI)(R8*1), SI | ||
12035 | MOVQ R9, (AX) | ||
12036 | MOVQ SI, -8(AX)(R8*1) | ||
12037 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K | ||
12038 | |||
12039 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: | ||
12040 | MOVOU (SI), X0 | ||
12041 | MOVOU -16(SI)(R8*1), X1 | ||
12042 | MOVOU X0, (AX) | ||
12043 | MOVOU X1, -16(AX)(R8*1) | ||
12044 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K | ||
12045 | |||
12046 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: | ||
12047 | MOVOU (SI), X0 | ||
12048 | MOVOU 16(SI), X1 | ||
12049 | MOVOU -32(SI)(R8*1), X2 | ||
12050 | MOVOU -16(SI)(R8*1), X3 | ||
12051 | MOVOU X0, (AX) | ||
12052 | MOVOU X1, 16(AX) | ||
12053 | MOVOU X2, -32(AX)(R8*1) | ||
12054 | MOVOU X3, -16(AX)(R8*1) | ||
12055 | |||
12056 | memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: | ||
12057 | MOVQ DI, AX | ||
12058 | JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K | ||
12059 | |||
12060 | memmove_long_match_emit_encodeSnappyBlockAsm64K: | ||
12061 | LEAQ (AX)(R8*1), DI | ||
12062 | |||
12063 | // genMemMoveLong | ||
12064 | MOVOU (SI), X0 | ||
12065 | MOVOU 16(SI), X1 | ||
12066 | MOVOU -32(SI)(R8*1), X2 | ||
12067 | MOVOU -16(SI)(R8*1), X3 | ||
12068 | MOVQ R8, R10 | ||
12069 | SHRQ $0x05, R10 | ||
12070 | MOVQ AX, R9 | ||
12071 | ANDL $0x0000001f, R9 | ||
12072 | MOVQ $0x00000040, R11 | ||
12073 | SUBQ R9, R11 | ||
12074 | DECQ R10 | ||
12075 | JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 | ||
12076 | LEAQ -32(SI)(R11*1), R9 | ||
12077 | LEAQ -32(AX)(R11*1), R12 | ||
12078 | |||
12079 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: | ||
12080 | MOVOU (R9), X4 | ||
12081 | MOVOU 16(R9), X5 | ||
12082 | MOVOA X4, (R12) | ||
12083 | MOVOA X5, 16(R12) | ||
12084 | ADDQ $0x20, R12 | ||
12085 | ADDQ $0x20, R9 | ||
12086 | ADDQ $0x20, R11 | ||
12087 | DECQ R10 | ||
12088 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back | ||
12089 | |||
12090 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: | ||
12091 | MOVOU -32(SI)(R11*1), X4 | ||
12092 | MOVOU -16(SI)(R11*1), X5 | ||
12093 | MOVOA X4, -32(AX)(R11*1) | ||
12094 | MOVOA X5, -16(AX)(R11*1) | ||
12095 | ADDQ $0x20, R11 | ||
12096 | CMPQ R8, R11 | ||
12097 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 | ||
12098 | MOVOU X0, (AX) | ||
12099 | MOVOU X1, 16(AX) | ||
12100 | MOVOU X2, -32(AX)(R8*1) | ||
12101 | MOVOU X3, -16(AX)(R8*1) | ||
12102 | MOVQ DI, AX | ||
12103 | |||
12104 | emit_literal_done_match_emit_encodeSnappyBlockAsm64K: | ||
12105 | match_nolit_loop_encodeSnappyBlockAsm64K: | ||
12106 | MOVL CX, SI | ||
12107 | SUBL BX, SI | ||
12108 | MOVL SI, 16(SP) | ||
12109 | ADDL $0x04, CX | ||
12110 | ADDL $0x04, BX | ||
12111 | MOVQ src_len+32(FP), SI | ||
12112 | SUBL CX, SI | ||
12113 | LEAQ (DX)(CX*1), DI | ||
12114 | LEAQ (DX)(BX*1), BX | ||
12115 | |||
12116 | // matchLen | ||
12117 | XORL R9, R9 | ||
12118 | |||
12119 | matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K: | ||
12120 | CMPL SI, $0x10 | ||
12121 | JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K | ||
12122 | MOVQ (DI)(R9*1), R8 | ||
12123 | MOVQ 8(DI)(R9*1), R10 | ||
12124 | XORQ (BX)(R9*1), R8 | ||
12125 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K | ||
12126 | XORQ 8(BX)(R9*1), R10 | ||
12127 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K | ||
12128 | LEAL -16(SI), SI | ||
12129 | LEAL 16(R9), R9 | ||
12130 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K | ||
12131 | |||
12132 | matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K: | ||
12133 | #ifdef GOAMD64_v3 | ||
12134 | TZCNTQ R10, R10 | ||
12135 | |||
12136 | #else | ||
12137 | BSFQ R10, R10 | ||
12138 | |||
12139 | #endif | ||
12140 | SARQ $0x03, R10 | ||
12141 | LEAL 8(R9)(R10*1), R9 | ||
12142 | JMP match_nolit_end_encodeSnappyBlockAsm64K | ||
12143 | |||
12144 | matchlen_match8_match_nolit_encodeSnappyBlockAsm64K: | ||
12145 | CMPL SI, $0x08 | ||
12146 | JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K | ||
12147 | MOVQ (DI)(R9*1), R8 | ||
12148 | XORQ (BX)(R9*1), R8 | ||
12149 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K | ||
12150 | LEAL -8(SI), SI | ||
12151 | LEAL 8(R9), R9 | ||
12152 | JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K | ||
12153 | |||
12154 | matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K: | ||
12155 | #ifdef GOAMD64_v3 | ||
12156 | TZCNTQ R8, R8 | ||
12157 | |||
12158 | #else | ||
12159 | BSFQ R8, R8 | ||
12160 | |||
12161 | #endif | ||
12162 | SARQ $0x03, R8 | ||
12163 | LEAL (R9)(R8*1), R9 | ||
12164 | JMP match_nolit_end_encodeSnappyBlockAsm64K | ||
12165 | |||
12166 | matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: | ||
12167 | CMPL SI, $0x04 | ||
12168 | JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K | ||
12169 | MOVL (DI)(R9*1), R8 | ||
12170 | CMPL (BX)(R9*1), R8 | ||
12171 | JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K | ||
12172 | LEAL -4(SI), SI | ||
12173 | LEAL 4(R9), R9 | ||
12174 | |||
12175 | matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: | ||
12176 | CMPL SI, $0x01 | ||
12177 | JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K | ||
12178 | JB match_nolit_end_encodeSnappyBlockAsm64K | ||
12179 | MOVW (DI)(R9*1), R8 | ||
12180 | CMPW (BX)(R9*1), R8 | ||
12181 | JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K | ||
12182 | LEAL 2(R9), R9 | ||
12183 | SUBL $0x02, SI | ||
12184 | JZ match_nolit_end_encodeSnappyBlockAsm64K | ||
12185 | |||
12186 | matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: | ||
12187 | MOVB (DI)(R9*1), R8 | ||
12188 | CMPB (BX)(R9*1), R8 | ||
12189 | JNE match_nolit_end_encodeSnappyBlockAsm64K | ||
12190 | LEAL 1(R9), R9 | ||
12191 | |||
12192 | match_nolit_end_encodeSnappyBlockAsm64K: | ||
12193 | ADDL R9, CX | ||
12194 | MOVL 16(SP), BX | ||
12195 | ADDL $0x04, R9 | ||
12196 | MOVL CX, 12(SP) | ||
12197 | |||
12198 | // emitCopy | ||
12199 | two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: | ||
12200 | CMPL R9, $0x40 | ||
12201 | JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K | ||
12202 | MOVB $0xee, (AX) | ||
12203 | MOVW BX, 1(AX) | ||
12204 | LEAL -60(R9), R9 | ||
12205 | ADDQ $0x03, AX | ||
12206 | JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K | ||
12207 | |||
12208 | two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: | ||
12209 | MOVL R9, SI | ||
12210 | SHLL $0x02, SI | ||
12211 | CMPL R9, $0x0c | ||
12212 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K | ||
12213 | CMPL BX, $0x00000800 | ||
12214 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K | ||
12215 | LEAL -15(SI), SI | ||
12216 | MOVB BL, 1(AX) | ||
12217 | SHRL $0x08, BX | ||
12218 | SHLL $0x05, BX | ||
12219 | ORL BX, SI | ||
12220 | MOVB SI, (AX) | ||
12221 | ADDQ $0x02, AX | ||
12222 | JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K | ||
12223 | |||
12224 | emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: | ||
12225 | LEAL -2(SI), SI | ||
12226 | MOVB SI, (AX) | ||
12227 | MOVW BX, 1(AX) | ||
12228 | ADDQ $0x03, AX | ||
12229 | |||
12230 | match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: | ||
12231 | CMPL CX, 8(SP) | ||
12232 | JAE emit_remainder_encodeSnappyBlockAsm64K | ||
12233 | MOVQ -2(DX)(CX*1), SI | ||
12234 | CMPQ AX, (SP) | ||
12235 | JB match_nolit_dst_ok_encodeSnappyBlockAsm64K | ||
12236 | MOVQ $0x00000000, ret+48(FP) | ||
12237 | RET | ||
12238 | |||
12239 | match_nolit_dst_ok_encodeSnappyBlockAsm64K: | ||
12240 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
12241 | MOVQ SI, DI | ||
12242 | SHRQ $0x10, SI | ||
12243 | MOVQ SI, BX | ||
12244 | SHLQ $0x10, DI | ||
12245 | IMULQ R8, DI | ||
12246 | SHRQ $0x32, DI | ||
12247 | SHLQ $0x10, BX | ||
12248 | IMULQ R8, BX | ||
12249 | SHRQ $0x32, BX | ||
12250 | LEAL -2(CX), R8 | ||
12251 | LEAQ 24(SP)(BX*4), R9 | ||
12252 | MOVL (R9), BX | ||
12253 | MOVL R8, 24(SP)(DI*4) | ||
12254 | MOVL CX, (R9) | ||
12255 | CMPL (DX)(BX*1), SI | ||
12256 | JEQ match_nolit_loop_encodeSnappyBlockAsm64K | ||
12257 | INCL CX | ||
12258 | JMP search_loop_encodeSnappyBlockAsm64K | ||
12259 | |||
12260 | emit_remainder_encodeSnappyBlockAsm64K: | ||
12261 | MOVQ src_len+32(FP), CX | ||
12262 | SUBL 12(SP), CX | ||
12263 | LEAQ 3(AX)(CX*1), CX | ||
12264 | CMPQ CX, (SP) | ||
12265 | JB emit_remainder_ok_encodeSnappyBlockAsm64K | ||
12266 | MOVQ $0x00000000, ret+48(FP) | ||
12267 | RET | ||
12268 | |||
12269 | emit_remainder_ok_encodeSnappyBlockAsm64K: | ||
12270 | MOVQ src_len+32(FP), CX | ||
12271 | MOVL 12(SP), BX | ||
12272 | CMPL BX, CX | ||
12273 | JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K | ||
12274 | MOVL CX, SI | ||
12275 | MOVL CX, 12(SP) | ||
12276 | LEAQ (DX)(BX*1), CX | ||
12277 | SUBL BX, SI | ||
12278 | LEAL -1(SI), DX | ||
12279 | CMPL DX, $0x3c | ||
12280 | JB one_byte_emit_remainder_encodeSnappyBlockAsm64K | ||
12281 | CMPL DX, $0x00000100 | ||
12282 | JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K | ||
12283 | JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K | ||
12284 | |||
12285 | three_bytes_emit_remainder_encodeSnappyBlockAsm64K: | ||
12286 | MOVB $0xf4, (AX) | ||
12287 | MOVW DX, 1(AX) | ||
12288 | ADDQ $0x03, AX | ||
12289 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K | ||
12290 | |||
12291 | two_bytes_emit_remainder_encodeSnappyBlockAsm64K: | ||
12292 | MOVB $0xf0, (AX) | ||
12293 | MOVB DL, 1(AX) | ||
12294 | ADDQ $0x02, AX | ||
12295 | CMPL DX, $0x40 | ||
12296 | JB memmove_emit_remainder_encodeSnappyBlockAsm64K | ||
12297 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K | ||
12298 | |||
12299 | one_byte_emit_remainder_encodeSnappyBlockAsm64K: | ||
12300 | SHLB $0x02, DL | ||
12301 | MOVB DL, (AX) | ||
12302 | ADDQ $0x01, AX | ||
12303 | |||
12304 | memmove_emit_remainder_encodeSnappyBlockAsm64K: | ||
12305 | LEAQ (AX)(SI*1), DX | ||
12306 | MOVL SI, BX | ||
12307 | |||
12308 | // genMemMoveShort | ||
12309 | CMPQ BX, $0x03 | ||
12310 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 | ||
12311 | JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 | ||
12312 | CMPQ BX, $0x08 | ||
12313 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 | ||
12314 | CMPQ BX, $0x10 | ||
12315 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 | ||
12316 | CMPQ BX, $0x20 | ||
12317 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 | ||
12318 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 | ||
12319 | |||
12320 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: | ||
12321 | MOVB (CX), SI | ||
12322 | MOVB -1(CX)(BX*1), CL | ||
12323 | MOVB SI, (AX) | ||
12324 | MOVB CL, -1(AX)(BX*1) | ||
12325 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K | ||
12326 | |||
12327 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: | ||
12328 | MOVW (CX), SI | ||
12329 | MOVB 2(CX), CL | ||
12330 | MOVW SI, (AX) | ||
12331 | MOVB CL, 2(AX) | ||
12332 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K | ||
12333 | |||
12334 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: | ||
12335 | MOVL (CX), SI | ||
12336 | MOVL -4(CX)(BX*1), CX | ||
12337 | MOVL SI, (AX) | ||
12338 | MOVL CX, -4(AX)(BX*1) | ||
12339 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K | ||
12340 | |||
12341 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: | ||
12342 | MOVQ (CX), SI | ||
12343 | MOVQ -8(CX)(BX*1), CX | ||
12344 | MOVQ SI, (AX) | ||
12345 | MOVQ CX, -8(AX)(BX*1) | ||
12346 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K | ||
12347 | |||
12348 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: | ||
12349 | MOVOU (CX), X0 | ||
12350 | MOVOU -16(CX)(BX*1), X1 | ||
12351 | MOVOU X0, (AX) | ||
12352 | MOVOU X1, -16(AX)(BX*1) | ||
12353 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K | ||
12354 | |||
12355 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: | ||
12356 | MOVOU (CX), X0 | ||
12357 | MOVOU 16(CX), X1 | ||
12358 | MOVOU -32(CX)(BX*1), X2 | ||
12359 | MOVOU -16(CX)(BX*1), X3 | ||
12360 | MOVOU X0, (AX) | ||
12361 | MOVOU X1, 16(AX) | ||
12362 | MOVOU X2, -32(AX)(BX*1) | ||
12363 | MOVOU X3, -16(AX)(BX*1) | ||
12364 | |||
12365 | memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: | ||
12366 | MOVQ DX, AX | ||
12367 | JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K | ||
12368 | |||
12369 | memmove_long_emit_remainder_encodeSnappyBlockAsm64K: | ||
12370 | LEAQ (AX)(SI*1), DX | ||
12371 | MOVL SI, BX | ||
12372 | |||
12373 | // genMemMoveLong | ||
12374 | MOVOU (CX), X0 | ||
12375 | MOVOU 16(CX), X1 | ||
12376 | MOVOU -32(CX)(BX*1), X2 | ||
12377 | MOVOU -16(CX)(BX*1), X3 | ||
12378 | MOVQ BX, DI | ||
12379 | SHRQ $0x05, DI | ||
12380 | MOVQ AX, SI | ||
12381 | ANDL $0x0000001f, SI | ||
12382 | MOVQ $0x00000040, R8 | ||
12383 | SUBQ SI, R8 | ||
12384 | DECQ DI | ||
12385 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 | ||
12386 | LEAQ -32(CX)(R8*1), SI | ||
12387 | LEAQ -32(AX)(R8*1), R9 | ||
12388 | |||
12389 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: | ||
12390 | MOVOU (SI), X4 | ||
12391 | MOVOU 16(SI), X5 | ||
12392 | MOVOA X4, (R9) | ||
12393 | MOVOA X5, 16(R9) | ||
12394 | ADDQ $0x20, R9 | ||
12395 | ADDQ $0x20, SI | ||
12396 | ADDQ $0x20, R8 | ||
12397 | DECQ DI | ||
12398 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back | ||
12399 | |||
12400 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: | ||
12401 | MOVOU -32(CX)(R8*1), X4 | ||
12402 | MOVOU -16(CX)(R8*1), X5 | ||
12403 | MOVOA X4, -32(AX)(R8*1) | ||
12404 | MOVOA X5, -16(AX)(R8*1) | ||
12405 | ADDQ $0x20, R8 | ||
12406 | CMPQ BX, R8 | ||
12407 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 | ||
12408 | MOVOU X0, (AX) | ||
12409 | MOVOU X1, 16(AX) | ||
12410 | MOVOU X2, -32(AX)(BX*1) | ||
12411 | MOVOU X3, -16(AX)(BX*1) | ||
12412 | MOVQ DX, AX | ||
12413 | |||
12414 | emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: | ||
12415 | MOVQ dst_base+0(FP), CX | ||
12416 | SUBQ CX, AX | ||
12417 | MOVQ AX, ret+48(FP) | ||
12418 | RET | ||
12419 | |||
12420 | // func encodeSnappyBlockAsm12B(dst []byte, src []byte) int | ||
12421 | // Requires: BMI, SSE2 | ||
12422 | TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 | ||
12423 | MOVQ dst_base+0(FP), AX | ||
12424 | MOVQ $0x00000080, CX | ||
12425 | LEAQ 24(SP), DX | ||
12426 | PXOR X0, X0 | ||
12427 | |||
12428 | zero_loop_encodeSnappyBlockAsm12B: | ||
12429 | MOVOU X0, (DX) | ||
12430 | MOVOU X0, 16(DX) | ||
12431 | MOVOU X0, 32(DX) | ||
12432 | MOVOU X0, 48(DX) | ||
12433 | MOVOU X0, 64(DX) | ||
12434 | MOVOU X0, 80(DX) | ||
12435 | MOVOU X0, 96(DX) | ||
12436 | MOVOU X0, 112(DX) | ||
12437 | ADDQ $0x80, DX | ||
12438 | DECQ CX | ||
12439 | JNZ zero_loop_encodeSnappyBlockAsm12B | ||
12440 | MOVL $0x00000000, 12(SP) | ||
12441 | MOVQ src_len+32(FP), CX | ||
12442 | LEAQ -9(CX), DX | ||
12443 | LEAQ -8(CX), BX | ||
12444 | MOVL BX, 8(SP) | ||
12445 | SHRQ $0x05, CX | ||
12446 | SUBL CX, DX | ||
12447 | LEAQ (AX)(DX*1), DX | ||
12448 | MOVQ DX, (SP) | ||
12449 | MOVL $0x00000001, CX | ||
12450 | MOVL CX, 16(SP) | ||
12451 | MOVQ src_base+24(FP), DX | ||
12452 | |||
12453 | search_loop_encodeSnappyBlockAsm12B: | ||
12454 | MOVL CX, BX | ||
12455 | SUBL 12(SP), BX | ||
12456 | SHRL $0x05, BX | ||
12457 | LEAL 4(CX)(BX*1), BX | ||
12458 | CMPL BX, 8(SP) | ||
12459 | JAE emit_remainder_encodeSnappyBlockAsm12B | ||
12460 | MOVQ (DX)(CX*1), SI | ||
12461 | MOVL BX, 20(SP) | ||
12462 | MOVQ $0x000000cf1bbcdcbb, R8 | ||
12463 | MOVQ SI, R9 | ||
12464 | MOVQ SI, R10 | ||
12465 | SHRQ $0x08, R10 | ||
12466 | SHLQ $0x18, R9 | ||
12467 | IMULQ R8, R9 | ||
12468 | SHRQ $0x34, R9 | ||
12469 | SHLQ $0x18, R10 | ||
12470 | IMULQ R8, R10 | ||
12471 | SHRQ $0x34, R10 | ||
12472 | MOVL 24(SP)(R9*4), BX | ||
12473 | MOVL 24(SP)(R10*4), DI | ||
12474 | MOVL CX, 24(SP)(R9*4) | ||
12475 | LEAL 1(CX), R9 | ||
12476 | MOVL R9, 24(SP)(R10*4) | ||
12477 | MOVQ SI, R9 | ||
12478 | SHRQ $0x10, R9 | ||
12479 | SHLQ $0x18, R9 | ||
12480 | IMULQ R8, R9 | ||
12481 | SHRQ $0x34, R9 | ||
12482 | MOVL CX, R8 | ||
12483 | SUBL 16(SP), R8 | ||
12484 | MOVL 1(DX)(R8*1), R10 | ||
12485 | MOVQ SI, R8 | ||
12486 | SHRQ $0x08, R8 | ||
12487 | CMPL R8, R10 | ||
12488 | JNE no_repeat_found_encodeSnappyBlockAsm12B | ||
12489 | LEAL 1(CX), SI | ||
12490 | MOVL 12(SP), BX | ||
12491 | MOVL SI, DI | ||
12492 | SUBL 16(SP), DI | ||
12493 | JZ repeat_extend_back_end_encodeSnappyBlockAsm12B | ||
12494 | |||
12495 | repeat_extend_back_loop_encodeSnappyBlockAsm12B: | ||
12496 | CMPL SI, BX | ||
12497 | JBE repeat_extend_back_end_encodeSnappyBlockAsm12B | ||
12498 | MOVB -1(DX)(DI*1), R8 | ||
12499 | MOVB -1(DX)(SI*1), R9 | ||
12500 | CMPB R8, R9 | ||
12501 | JNE repeat_extend_back_end_encodeSnappyBlockAsm12B | ||
12502 | LEAL -1(SI), SI | ||
12503 | DECL DI | ||
12504 | JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B | ||
12505 | |||
12506 | repeat_extend_back_end_encodeSnappyBlockAsm12B: | ||
12507 | MOVL 12(SP), BX | ||
12508 | CMPL BX, SI | ||
12509 | JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B | ||
12510 | MOVL SI, DI | ||
12511 | MOVL SI, 12(SP) | ||
12512 | LEAQ (DX)(BX*1), R8 | ||
12513 | SUBL BX, DI | ||
12514 | LEAL -1(DI), BX | ||
12515 | CMPL BX, $0x3c | ||
12516 | JB one_byte_repeat_emit_encodeSnappyBlockAsm12B | ||
12517 | CMPL BX, $0x00000100 | ||
12518 | JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B | ||
12519 | JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B | ||
12520 | |||
12521 | three_bytes_repeat_emit_encodeSnappyBlockAsm12B: | ||
12522 | MOVB $0xf4, (AX) | ||
12523 | MOVW BX, 1(AX) | ||
12524 | ADDQ $0x03, AX | ||
12525 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B | ||
12526 | |||
12527 | two_bytes_repeat_emit_encodeSnappyBlockAsm12B: | ||
12528 | MOVB $0xf0, (AX) | ||
12529 | MOVB BL, 1(AX) | ||
12530 | ADDQ $0x02, AX | ||
12531 | CMPL BX, $0x40 | ||
12532 | JB memmove_repeat_emit_encodeSnappyBlockAsm12B | ||
12533 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B | ||
12534 | |||
12535 | one_byte_repeat_emit_encodeSnappyBlockAsm12B: | ||
12536 | SHLB $0x02, BL | ||
12537 | MOVB BL, (AX) | ||
12538 | ADDQ $0x01, AX | ||
12539 | |||
12540 | memmove_repeat_emit_encodeSnappyBlockAsm12B: | ||
12541 | LEAQ (AX)(DI*1), BX | ||
12542 | |||
12543 | // genMemMoveShort | ||
12544 | CMPQ DI, $0x08 | ||
12545 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 | ||
12546 | CMPQ DI, $0x10 | ||
12547 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 | ||
12548 | CMPQ DI, $0x20 | ||
12549 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 | ||
12550 | JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 | ||
12551 | |||
12552 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: | ||
12553 | MOVQ (R8), R9 | ||
12554 | MOVQ R9, (AX) | ||
12555 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B | ||
12556 | |||
12557 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: | ||
12558 | MOVQ (R8), R9 | ||
12559 | MOVQ -8(R8)(DI*1), R8 | ||
12560 | MOVQ R9, (AX) | ||
12561 | MOVQ R8, -8(AX)(DI*1) | ||
12562 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B | ||
12563 | |||
12564 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: | ||
12565 | MOVOU (R8), X0 | ||
12566 | MOVOU -16(R8)(DI*1), X1 | ||
12567 | MOVOU X0, (AX) | ||
12568 | MOVOU X1, -16(AX)(DI*1) | ||
12569 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B | ||
12570 | |||
12571 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: | ||
12572 | MOVOU (R8), X0 | ||
12573 | MOVOU 16(R8), X1 | ||
12574 | MOVOU -32(R8)(DI*1), X2 | ||
12575 | MOVOU -16(R8)(DI*1), X3 | ||
12576 | MOVOU X0, (AX) | ||
12577 | MOVOU X1, 16(AX) | ||
12578 | MOVOU X2, -32(AX)(DI*1) | ||
12579 | MOVOU X3, -16(AX)(DI*1) | ||
12580 | |||
12581 | memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: | ||
12582 | MOVQ BX, AX | ||
12583 | JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B | ||
12584 | |||
12585 | memmove_long_repeat_emit_encodeSnappyBlockAsm12B: | ||
12586 | LEAQ (AX)(DI*1), BX | ||
12587 | |||
12588 | // genMemMoveLong | ||
12589 | MOVOU (R8), X0 | ||
12590 | MOVOU 16(R8), X1 | ||
12591 | MOVOU -32(R8)(DI*1), X2 | ||
12592 | MOVOU -16(R8)(DI*1), X3 | ||
12593 | MOVQ DI, R10 | ||
12594 | SHRQ $0x05, R10 | ||
12595 | MOVQ AX, R9 | ||
12596 | ANDL $0x0000001f, R9 | ||
12597 | MOVQ $0x00000040, R11 | ||
12598 | SUBQ R9, R11 | ||
12599 | DECQ R10 | ||
12600 | JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 | ||
12601 | LEAQ -32(R8)(R11*1), R9 | ||
12602 | LEAQ -32(AX)(R11*1), R12 | ||
12603 | |||
12604 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: | ||
12605 | MOVOU (R9), X4 | ||
12606 | MOVOU 16(R9), X5 | ||
12607 | MOVOA X4, (R12) | ||
12608 | MOVOA X5, 16(R12) | ||
12609 | ADDQ $0x20, R12 | ||
12610 | ADDQ $0x20, R9 | ||
12611 | ADDQ $0x20, R11 | ||
12612 | DECQ R10 | ||
12613 | JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back | ||
12614 | |||
12615 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: | ||
12616 | MOVOU -32(R8)(R11*1), X4 | ||
12617 | MOVOU -16(R8)(R11*1), X5 | ||
12618 | MOVOA X4, -32(AX)(R11*1) | ||
12619 | MOVOA X5, -16(AX)(R11*1) | ||
12620 | ADDQ $0x20, R11 | ||
12621 | CMPQ DI, R11 | ||
12622 | JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 | ||
12623 | MOVOU X0, (AX) | ||
12624 | MOVOU X1, 16(AX) | ||
12625 | MOVOU X2, -32(AX)(DI*1) | ||
12626 | MOVOU X3, -16(AX)(DI*1) | ||
12627 | MOVQ BX, AX | ||
12628 | |||
12629 | emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: | ||
12630 | ADDL $0x05, CX | ||
12631 | MOVL CX, BX | ||
12632 | SUBL 16(SP), BX | ||
12633 | MOVQ src_len+32(FP), DI | ||
12634 | SUBL CX, DI | ||
12635 | LEAQ (DX)(CX*1), R8 | ||
12636 | LEAQ (DX)(BX*1), BX | ||
12637 | |||
12638 | // matchLen | ||
12639 | XORL R10, R10 | ||
12640 | |||
12641 | matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B: | ||
12642 | CMPL DI, $0x10 | ||
12643 | JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B | ||
12644 | MOVQ (R8)(R10*1), R9 | ||
12645 | MOVQ 8(R8)(R10*1), R11 | ||
12646 | XORQ (BX)(R10*1), R9 | ||
12647 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B | ||
12648 | XORQ 8(BX)(R10*1), R11 | ||
12649 | JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B | ||
12650 | LEAL -16(DI), DI | ||
12651 | LEAL 16(R10), R10 | ||
12652 | JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B | ||
12653 | |||
12654 | matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B: | ||
12655 | #ifdef GOAMD64_v3 | ||
12656 | TZCNTQ R11, R11 | ||
12657 | |||
12658 | #else | ||
12659 | BSFQ R11, R11 | ||
12660 | |||
12661 | #endif | ||
12662 | SARQ $0x03, R11 | ||
12663 | LEAL 8(R10)(R11*1), R10 | ||
12664 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B | ||
12665 | |||
12666 | matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B: | ||
12667 | CMPL DI, $0x08 | ||
12668 | JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B | ||
12669 | MOVQ (R8)(R10*1), R9 | ||
12670 | XORQ (BX)(R10*1), R9 | ||
12671 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B | ||
12672 | LEAL -8(DI), DI | ||
12673 | LEAL 8(R10), R10 | ||
12674 | JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B | ||
12675 | |||
12676 | matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B: | ||
12677 | #ifdef GOAMD64_v3 | ||
12678 | TZCNTQ R9, R9 | ||
12679 | |||
12680 | #else | ||
12681 | BSFQ R9, R9 | ||
12682 | |||
12683 | #endif | ||
12684 | SARQ $0x03, R9 | ||
12685 | LEAL (R10)(R9*1), R10 | ||
12686 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B | ||
12687 | |||
12688 | matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: | ||
12689 | CMPL DI, $0x04 | ||
12690 | JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B | ||
12691 | MOVL (R8)(R10*1), R9 | ||
12692 | CMPL (BX)(R10*1), R9 | ||
12693 | JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B | ||
12694 | LEAL -4(DI), DI | ||
12695 | LEAL 4(R10), R10 | ||
12696 | |||
12697 | matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: | ||
12698 | CMPL DI, $0x01 | ||
12699 | JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B | ||
12700 | JB repeat_extend_forward_end_encodeSnappyBlockAsm12B | ||
12701 | MOVW (R8)(R10*1), R9 | ||
12702 | CMPW (BX)(R10*1), R9 | ||
12703 | JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B | ||
12704 | LEAL 2(R10), R10 | ||
12705 | SUBL $0x02, DI | ||
12706 | JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B | ||
12707 | |||
12708 | matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: | ||
12709 | MOVB (R8)(R10*1), R9 | ||
12710 | CMPB (BX)(R10*1), R9 | ||
12711 | JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B | ||
12712 | LEAL 1(R10), R10 | ||
12713 | |||
12714 | repeat_extend_forward_end_encodeSnappyBlockAsm12B: | ||
12715 | ADDL R10, CX | ||
12716 | MOVL CX, BX | ||
12717 | SUBL SI, BX | ||
12718 | MOVL 16(SP), SI | ||
12719 | |||
12720 | // emitCopy | ||
12721 | two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: | ||
12722 | CMPL BX, $0x40 | ||
12723 | JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B | ||
12724 | MOVB $0xee, (AX) | ||
12725 | MOVW SI, 1(AX) | ||
12726 | LEAL -60(BX), BX | ||
12727 | ADDQ $0x03, AX | ||
12728 | JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B | ||
12729 | |||
12730 | two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: | ||
12731 | MOVL BX, DI | ||
12732 | SHLL $0x02, DI | ||
12733 | CMPL BX, $0x0c | ||
12734 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B | ||
12735 | CMPL SI, $0x00000800 | ||
12736 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B | ||
12737 | LEAL -15(DI), DI | ||
12738 | MOVB SI, 1(AX) | ||
12739 | SHRL $0x08, SI | ||
12740 | SHLL $0x05, SI | ||
12741 | ORL SI, DI | ||
12742 | MOVB DI, (AX) | ||
12743 | ADDQ $0x02, AX | ||
12744 | JMP repeat_end_emit_encodeSnappyBlockAsm12B | ||
12745 | |||
12746 | emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: | ||
12747 | LEAL -2(DI), DI | ||
12748 | MOVB DI, (AX) | ||
12749 | MOVW SI, 1(AX) | ||
12750 | ADDQ $0x03, AX | ||
12751 | |||
12752 | repeat_end_emit_encodeSnappyBlockAsm12B: | ||
12753 | MOVL CX, 12(SP) | ||
12754 | JMP search_loop_encodeSnappyBlockAsm12B | ||
12755 | |||
12756 | no_repeat_found_encodeSnappyBlockAsm12B: | ||
12757 | CMPL (DX)(BX*1), SI | ||
12758 | JEQ candidate_match_encodeSnappyBlockAsm12B | ||
12759 | SHRQ $0x08, SI | ||
12760 | MOVL 24(SP)(R9*4), BX | ||
12761 | LEAL 2(CX), R8 | ||
12762 | CMPL (DX)(DI*1), SI | ||
12763 | JEQ candidate2_match_encodeSnappyBlockAsm12B | ||
12764 | MOVL R8, 24(SP)(R9*4) | ||
12765 | SHRQ $0x08, SI | ||
12766 | CMPL (DX)(BX*1), SI | ||
12767 | JEQ candidate3_match_encodeSnappyBlockAsm12B | ||
12768 | MOVL 20(SP), CX | ||
12769 | JMP search_loop_encodeSnappyBlockAsm12B | ||
12770 | |||
12771 | candidate3_match_encodeSnappyBlockAsm12B: | ||
12772 | ADDL $0x02, CX | ||
12773 | JMP candidate_match_encodeSnappyBlockAsm12B | ||
12774 | |||
12775 | candidate2_match_encodeSnappyBlockAsm12B: | ||
12776 | MOVL R8, 24(SP)(R9*4) | ||
12777 | INCL CX | ||
12778 | MOVL DI, BX | ||
12779 | |||
12780 | candidate_match_encodeSnappyBlockAsm12B: | ||
12781 | MOVL 12(SP), SI | ||
12782 | TESTL BX, BX | ||
12783 | JZ match_extend_back_end_encodeSnappyBlockAsm12B | ||
12784 | |||
12785 | match_extend_back_loop_encodeSnappyBlockAsm12B: | ||
12786 | CMPL CX, SI | ||
12787 | JBE match_extend_back_end_encodeSnappyBlockAsm12B | ||
12788 | MOVB -1(DX)(BX*1), DI | ||
12789 | MOVB -1(DX)(CX*1), R8 | ||
12790 | CMPB DI, R8 | ||
12791 | JNE match_extend_back_end_encodeSnappyBlockAsm12B | ||
12792 | LEAL -1(CX), CX | ||
12793 | DECL BX | ||
12794 | JZ match_extend_back_end_encodeSnappyBlockAsm12B | ||
12795 | JMP match_extend_back_loop_encodeSnappyBlockAsm12B | ||
12796 | |||
12797 | match_extend_back_end_encodeSnappyBlockAsm12B: | ||
12798 | MOVL CX, SI | ||
12799 | SUBL 12(SP), SI | ||
12800 | LEAQ 3(AX)(SI*1), SI | ||
12801 | CMPQ SI, (SP) | ||
12802 | JB match_dst_size_check_encodeSnappyBlockAsm12B | ||
12803 | MOVQ $0x00000000, ret+48(FP) | ||
12804 | RET | ||
12805 | |||
12806 | match_dst_size_check_encodeSnappyBlockAsm12B: | ||
12807 | MOVL CX, SI | ||
12808 | MOVL 12(SP), DI | ||
12809 | CMPL DI, SI | ||
12810 | JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B | ||
12811 | MOVL SI, R8 | ||
12812 | MOVL SI, 12(SP) | ||
12813 | LEAQ (DX)(DI*1), SI | ||
12814 | SUBL DI, R8 | ||
12815 | LEAL -1(R8), DI | ||
12816 | CMPL DI, $0x3c | ||
12817 | JB one_byte_match_emit_encodeSnappyBlockAsm12B | ||
12818 | CMPL DI, $0x00000100 | ||
12819 | JB two_bytes_match_emit_encodeSnappyBlockAsm12B | ||
12820 | JB three_bytes_match_emit_encodeSnappyBlockAsm12B | ||
12821 | |||
12822 | three_bytes_match_emit_encodeSnappyBlockAsm12B: | ||
12823 | MOVB $0xf4, (AX) | ||
12824 | MOVW DI, 1(AX) | ||
12825 | ADDQ $0x03, AX | ||
12826 | JMP memmove_long_match_emit_encodeSnappyBlockAsm12B | ||
12827 | |||
12828 | two_bytes_match_emit_encodeSnappyBlockAsm12B: | ||
12829 | MOVB $0xf0, (AX) | ||
12830 | MOVB DI, 1(AX) | ||
12831 | ADDQ $0x02, AX | ||
12832 | CMPL DI, $0x40 | ||
12833 | JB memmove_match_emit_encodeSnappyBlockAsm12B | ||
12834 | JMP memmove_long_match_emit_encodeSnappyBlockAsm12B | ||
12835 | |||
12836 | one_byte_match_emit_encodeSnappyBlockAsm12B: | ||
12837 | SHLB $0x02, DI | ||
12838 | MOVB DI, (AX) | ||
12839 | ADDQ $0x01, AX | ||
12840 | |||
12841 | memmove_match_emit_encodeSnappyBlockAsm12B: | ||
12842 | LEAQ (AX)(R8*1), DI | ||
12843 | |||
12844 | // genMemMoveShort | ||
12845 | CMPQ R8, $0x08 | ||
12846 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 | ||
12847 | CMPQ R8, $0x10 | ||
12848 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 | ||
12849 | CMPQ R8, $0x20 | ||
12850 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 | ||
12851 | JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 | ||
12852 | |||
12853 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: | ||
12854 | MOVQ (SI), R9 | ||
12855 | MOVQ R9, (AX) | ||
12856 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B | ||
12857 | |||
12858 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: | ||
12859 | MOVQ (SI), R9 | ||
12860 | MOVQ -8(SI)(R8*1), SI | ||
12861 | MOVQ R9, (AX) | ||
12862 | MOVQ SI, -8(AX)(R8*1) | ||
12863 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B | ||
12864 | |||
12865 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: | ||
12866 | MOVOU (SI), X0 | ||
12867 | MOVOU -16(SI)(R8*1), X1 | ||
12868 | MOVOU X0, (AX) | ||
12869 | MOVOU X1, -16(AX)(R8*1) | ||
12870 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B | ||
12871 | |||
12872 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: | ||
12873 | MOVOU (SI), X0 | ||
12874 | MOVOU 16(SI), X1 | ||
12875 | MOVOU -32(SI)(R8*1), X2 | ||
12876 | MOVOU -16(SI)(R8*1), X3 | ||
12877 | MOVOU X0, (AX) | ||
12878 | MOVOU X1, 16(AX) | ||
12879 | MOVOU X2, -32(AX)(R8*1) | ||
12880 | MOVOU X3, -16(AX)(R8*1) | ||
12881 | |||
12882 | memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: | ||
12883 | MOVQ DI, AX | ||
12884 | JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B | ||
12885 | |||
12886 | memmove_long_match_emit_encodeSnappyBlockAsm12B: | ||
12887 | LEAQ (AX)(R8*1), DI | ||
12888 | |||
12889 | // genMemMoveLong | ||
12890 | MOVOU (SI), X0 | ||
12891 | MOVOU 16(SI), X1 | ||
12892 | MOVOU -32(SI)(R8*1), X2 | ||
12893 | MOVOU -16(SI)(R8*1), X3 | ||
12894 | MOVQ R8, R10 | ||
12895 | SHRQ $0x05, R10 | ||
12896 | MOVQ AX, R9 | ||
12897 | ANDL $0x0000001f, R9 | ||
12898 | MOVQ $0x00000040, R11 | ||
12899 | SUBQ R9, R11 | ||
12900 | DECQ R10 | ||
12901 | JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 | ||
12902 | LEAQ -32(SI)(R11*1), R9 | ||
12903 | LEAQ -32(AX)(R11*1), R12 | ||
12904 | |||
12905 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: | ||
12906 | MOVOU (R9), X4 | ||
12907 | MOVOU 16(R9), X5 | ||
12908 | MOVOA X4, (R12) | ||
12909 | MOVOA X5, 16(R12) | ||
12910 | ADDQ $0x20, R12 | ||
12911 | ADDQ $0x20, R9 | ||
12912 | ADDQ $0x20, R11 | ||
12913 | DECQ R10 | ||
12914 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back | ||
12915 | |||
12916 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: | ||
12917 | MOVOU -32(SI)(R11*1), X4 | ||
12918 | MOVOU -16(SI)(R11*1), X5 | ||
12919 | MOVOA X4, -32(AX)(R11*1) | ||
12920 | MOVOA X5, -16(AX)(R11*1) | ||
12921 | ADDQ $0x20, R11 | ||
12922 | CMPQ R8, R11 | ||
12923 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 | ||
12924 | MOVOU X0, (AX) | ||
12925 | MOVOU X1, 16(AX) | ||
12926 | MOVOU X2, -32(AX)(R8*1) | ||
12927 | MOVOU X3, -16(AX)(R8*1) | ||
12928 | MOVQ DI, AX | ||
12929 | |||
12930 | emit_literal_done_match_emit_encodeSnappyBlockAsm12B: | ||
12931 | match_nolit_loop_encodeSnappyBlockAsm12B: | ||
12932 | MOVL CX, SI | ||
12933 | SUBL BX, SI | ||
12934 | MOVL SI, 16(SP) | ||
12935 | ADDL $0x04, CX | ||
12936 | ADDL $0x04, BX | ||
12937 | MOVQ src_len+32(FP), SI | ||
12938 | SUBL CX, SI | ||
12939 | LEAQ (DX)(CX*1), DI | ||
12940 | LEAQ (DX)(BX*1), BX | ||
12941 | |||
12942 | // matchLen | ||
12943 | XORL R9, R9 | ||
12944 | |||
12945 | matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B: | ||
12946 | CMPL SI, $0x10 | ||
12947 | JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B | ||
12948 | MOVQ (DI)(R9*1), R8 | ||
12949 | MOVQ 8(DI)(R9*1), R10 | ||
12950 | XORQ (BX)(R9*1), R8 | ||
12951 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B | ||
12952 | XORQ 8(BX)(R9*1), R10 | ||
12953 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B | ||
12954 | LEAL -16(SI), SI | ||
12955 | LEAL 16(R9), R9 | ||
12956 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B | ||
12957 | |||
12958 | matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B: | ||
12959 | #ifdef GOAMD64_v3 | ||
12960 | TZCNTQ R10, R10 | ||
12961 | |||
12962 | #else | ||
12963 | BSFQ R10, R10 | ||
12964 | |||
12965 | #endif | ||
12966 | SARQ $0x03, R10 | ||
12967 | LEAL 8(R9)(R10*1), R9 | ||
12968 | JMP match_nolit_end_encodeSnappyBlockAsm12B | ||
12969 | |||
12970 | matchlen_match8_match_nolit_encodeSnappyBlockAsm12B: | ||
12971 | CMPL SI, $0x08 | ||
12972 | JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B | ||
12973 | MOVQ (DI)(R9*1), R8 | ||
12974 | XORQ (BX)(R9*1), R8 | ||
12975 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B | ||
12976 | LEAL -8(SI), SI | ||
12977 | LEAL 8(R9), R9 | ||
12978 | JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B | ||
12979 | |||
12980 | matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B: | ||
12981 | #ifdef GOAMD64_v3 | ||
12982 | TZCNTQ R8, R8 | ||
12983 | |||
12984 | #else | ||
12985 | BSFQ R8, R8 | ||
12986 | |||
12987 | #endif | ||
12988 | SARQ $0x03, R8 | ||
12989 | LEAL (R9)(R8*1), R9 | ||
12990 | JMP match_nolit_end_encodeSnappyBlockAsm12B | ||
12991 | |||
12992 | matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: | ||
12993 | CMPL SI, $0x04 | ||
12994 | JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B | ||
12995 | MOVL (DI)(R9*1), R8 | ||
12996 | CMPL (BX)(R9*1), R8 | ||
12997 | JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B | ||
12998 | LEAL -4(SI), SI | ||
12999 | LEAL 4(R9), R9 | ||
13000 | |||
13001 | matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: | ||
13002 | CMPL SI, $0x01 | ||
13003 | JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B | ||
13004 | JB match_nolit_end_encodeSnappyBlockAsm12B | ||
13005 | MOVW (DI)(R9*1), R8 | ||
13006 | CMPW (BX)(R9*1), R8 | ||
13007 | JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B | ||
13008 | LEAL 2(R9), R9 | ||
13009 | SUBL $0x02, SI | ||
13010 | JZ match_nolit_end_encodeSnappyBlockAsm12B | ||
13011 | |||
13012 | matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: | ||
13013 | MOVB (DI)(R9*1), R8 | ||
13014 | CMPB (BX)(R9*1), R8 | ||
13015 | JNE match_nolit_end_encodeSnappyBlockAsm12B | ||
13016 | LEAL 1(R9), R9 | ||
13017 | |||
13018 | match_nolit_end_encodeSnappyBlockAsm12B: | ||
13019 | ADDL R9, CX | ||
13020 | MOVL 16(SP), BX | ||
13021 | ADDL $0x04, R9 | ||
13022 | MOVL CX, 12(SP) | ||
13023 | |||
13024 | // emitCopy | ||
13025 | two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: | ||
13026 | CMPL R9, $0x40 | ||
13027 | JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B | ||
13028 | MOVB $0xee, (AX) | ||
13029 | MOVW BX, 1(AX) | ||
13030 | LEAL -60(R9), R9 | ||
13031 | ADDQ $0x03, AX | ||
13032 | JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B | ||
13033 | |||
13034 | two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: | ||
13035 | MOVL R9, SI | ||
13036 | SHLL $0x02, SI | ||
13037 | CMPL R9, $0x0c | ||
13038 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B | ||
13039 | CMPL BX, $0x00000800 | ||
13040 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B | ||
13041 | LEAL -15(SI), SI | ||
13042 | MOVB BL, 1(AX) | ||
13043 | SHRL $0x08, BX | ||
13044 | SHLL $0x05, BX | ||
13045 | ORL BX, SI | ||
13046 | MOVB SI, (AX) | ||
13047 | ADDQ $0x02, AX | ||
13048 | JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B | ||
13049 | |||
13050 | emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: | ||
13051 | LEAL -2(SI), SI | ||
13052 | MOVB SI, (AX) | ||
13053 | MOVW BX, 1(AX) | ||
13054 | ADDQ $0x03, AX | ||
13055 | |||
13056 | match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: | ||
13057 | CMPL CX, 8(SP) | ||
13058 | JAE emit_remainder_encodeSnappyBlockAsm12B | ||
13059 | MOVQ -2(DX)(CX*1), SI | ||
13060 | CMPQ AX, (SP) | ||
13061 | JB match_nolit_dst_ok_encodeSnappyBlockAsm12B | ||
13062 | MOVQ $0x00000000, ret+48(FP) | ||
13063 | RET | ||
13064 | |||
13065 | match_nolit_dst_ok_encodeSnappyBlockAsm12B: | ||
13066 | MOVQ $0x000000cf1bbcdcbb, R8 | ||
13067 | MOVQ SI, DI | ||
13068 | SHRQ $0x10, SI | ||
13069 | MOVQ SI, BX | ||
13070 | SHLQ $0x18, DI | ||
13071 | IMULQ R8, DI | ||
13072 | SHRQ $0x34, DI | ||
13073 | SHLQ $0x18, BX | ||
13074 | IMULQ R8, BX | ||
13075 | SHRQ $0x34, BX | ||
13076 | LEAL -2(CX), R8 | ||
13077 | LEAQ 24(SP)(BX*4), R9 | ||
13078 | MOVL (R9), BX | ||
13079 | MOVL R8, 24(SP)(DI*4) | ||
13080 | MOVL CX, (R9) | ||
13081 | CMPL (DX)(BX*1), SI | ||
13082 | JEQ match_nolit_loop_encodeSnappyBlockAsm12B | ||
13083 | INCL CX | ||
13084 | JMP search_loop_encodeSnappyBlockAsm12B | ||
13085 | |||
13086 | emit_remainder_encodeSnappyBlockAsm12B: | ||
13087 | MOVQ src_len+32(FP), CX | ||
13088 | SUBL 12(SP), CX | ||
13089 | LEAQ 3(AX)(CX*1), CX | ||
13090 | CMPQ CX, (SP) | ||
13091 | JB emit_remainder_ok_encodeSnappyBlockAsm12B | ||
13092 | MOVQ $0x00000000, ret+48(FP) | ||
13093 | RET | ||
13094 | |||
13095 | emit_remainder_ok_encodeSnappyBlockAsm12B: | ||
13096 | MOVQ src_len+32(FP), CX | ||
13097 | MOVL 12(SP), BX | ||
13098 | CMPL BX, CX | ||
13099 | JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B | ||
13100 | MOVL CX, SI | ||
13101 | MOVL CX, 12(SP) | ||
13102 | LEAQ (DX)(BX*1), CX | ||
13103 | SUBL BX, SI | ||
13104 | LEAL -1(SI), DX | ||
13105 | CMPL DX, $0x3c | ||
13106 | JB one_byte_emit_remainder_encodeSnappyBlockAsm12B | ||
13107 | CMPL DX, $0x00000100 | ||
13108 | JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B | ||
13109 | JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B | ||
13110 | |||
13111 | three_bytes_emit_remainder_encodeSnappyBlockAsm12B: | ||
13112 | MOVB $0xf4, (AX) | ||
13113 | MOVW DX, 1(AX) | ||
13114 | ADDQ $0x03, AX | ||
13115 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B | ||
13116 | |||
13117 | two_bytes_emit_remainder_encodeSnappyBlockAsm12B: | ||
13118 | MOVB $0xf0, (AX) | ||
13119 | MOVB DL, 1(AX) | ||
13120 | ADDQ $0x02, AX | ||
13121 | CMPL DX, $0x40 | ||
13122 | JB memmove_emit_remainder_encodeSnappyBlockAsm12B | ||
13123 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B | ||
13124 | |||
13125 | one_byte_emit_remainder_encodeSnappyBlockAsm12B: | ||
13126 | SHLB $0x02, DL | ||
13127 | MOVB DL, (AX) | ||
13128 | ADDQ $0x01, AX | ||
13129 | |||
13130 | memmove_emit_remainder_encodeSnappyBlockAsm12B: | ||
13131 | LEAQ (AX)(SI*1), DX | ||
13132 | MOVL SI, BX | ||
13133 | |||
13134 | // genMemMoveShort | ||
13135 | CMPQ BX, $0x03 | ||
13136 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 | ||
13137 | JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 | ||
13138 | CMPQ BX, $0x08 | ||
13139 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 | ||
13140 | CMPQ BX, $0x10 | ||
13141 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 | ||
13142 | CMPQ BX, $0x20 | ||
13143 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 | ||
13144 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 | ||
13145 | |||
13146 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: | ||
13147 | MOVB (CX), SI | ||
13148 | MOVB -1(CX)(BX*1), CL | ||
13149 | MOVB SI, (AX) | ||
13150 | MOVB CL, -1(AX)(BX*1) | ||
13151 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B | ||
13152 | |||
13153 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: | ||
13154 | MOVW (CX), SI | ||
13155 | MOVB 2(CX), CL | ||
13156 | MOVW SI, (AX) | ||
13157 | MOVB CL, 2(AX) | ||
13158 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B | ||
13159 | |||
13160 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: | ||
13161 | MOVL (CX), SI | ||
13162 | MOVL -4(CX)(BX*1), CX | ||
13163 | MOVL SI, (AX) | ||
13164 | MOVL CX, -4(AX)(BX*1) | ||
13165 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B | ||
13166 | |||
13167 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: | ||
13168 | MOVQ (CX), SI | ||
13169 | MOVQ -8(CX)(BX*1), CX | ||
13170 | MOVQ SI, (AX) | ||
13171 | MOVQ CX, -8(AX)(BX*1) | ||
13172 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B | ||
13173 | |||
13174 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: | ||
13175 | MOVOU (CX), X0 | ||
13176 | MOVOU -16(CX)(BX*1), X1 | ||
13177 | MOVOU X0, (AX) | ||
13178 | MOVOU X1, -16(AX)(BX*1) | ||
13179 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B | ||
13180 | |||
13181 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: | ||
13182 | MOVOU (CX), X0 | ||
13183 | MOVOU 16(CX), X1 | ||
13184 | MOVOU -32(CX)(BX*1), X2 | ||
13185 | MOVOU -16(CX)(BX*1), X3 | ||
13186 | MOVOU X0, (AX) | ||
13187 | MOVOU X1, 16(AX) | ||
13188 | MOVOU X2, -32(AX)(BX*1) | ||
13189 | MOVOU X3, -16(AX)(BX*1) | ||
13190 | |||
13191 | memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: | ||
13192 | MOVQ DX, AX | ||
13193 | JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B | ||
13194 | |||
13195 | memmove_long_emit_remainder_encodeSnappyBlockAsm12B: | ||
13196 | LEAQ (AX)(SI*1), DX | ||
13197 | MOVL SI, BX | ||
13198 | |||
13199 | // genMemMoveLong | ||
13200 | MOVOU (CX), X0 | ||
13201 | MOVOU 16(CX), X1 | ||
13202 | MOVOU -32(CX)(BX*1), X2 | ||
13203 | MOVOU -16(CX)(BX*1), X3 | ||
13204 | MOVQ BX, DI | ||
13205 | SHRQ $0x05, DI | ||
13206 | MOVQ AX, SI | ||
13207 | ANDL $0x0000001f, SI | ||
13208 | MOVQ $0x00000040, R8 | ||
13209 | SUBQ SI, R8 | ||
13210 | DECQ DI | ||
13211 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 | ||
13212 | LEAQ -32(CX)(R8*1), SI | ||
13213 | LEAQ -32(AX)(R8*1), R9 | ||
13214 | |||
13215 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: | ||
13216 | MOVOU (SI), X4 | ||
13217 | MOVOU 16(SI), X5 | ||
13218 | MOVOA X4, (R9) | ||
13219 | MOVOA X5, 16(R9) | ||
13220 | ADDQ $0x20, R9 | ||
13221 | ADDQ $0x20, SI | ||
13222 | ADDQ $0x20, R8 | ||
13223 | DECQ DI | ||
13224 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back | ||
13225 | |||
13226 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: | ||
13227 | MOVOU -32(CX)(R8*1), X4 | ||
13228 | MOVOU -16(CX)(R8*1), X5 | ||
13229 | MOVOA X4, -32(AX)(R8*1) | ||
13230 | MOVOA X5, -16(AX)(R8*1) | ||
13231 | ADDQ $0x20, R8 | ||
13232 | CMPQ BX, R8 | ||
13233 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 | ||
13234 | MOVOU X0, (AX) | ||
13235 | MOVOU X1, 16(AX) | ||
13236 | MOVOU X2, -32(AX)(BX*1) | ||
13237 | MOVOU X3, -16(AX)(BX*1) | ||
13238 | MOVQ DX, AX | ||
13239 | |||
13240 | emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: | ||
13241 | MOVQ dst_base+0(FP), CX | ||
13242 | SUBQ CX, AX | ||
13243 | MOVQ AX, ret+48(FP) | ||
13244 | RET | ||
13245 | |||
13246 | // func encodeSnappyBlockAsm10B(dst []byte, src []byte) int | ||
13247 | // Requires: BMI, SSE2 | ||
13248 | TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 | ||
13249 | MOVQ dst_base+0(FP), AX | ||
13250 | MOVQ $0x00000020, CX | ||
13251 | LEAQ 24(SP), DX | ||
13252 | PXOR X0, X0 | ||
13253 | |||
13254 | zero_loop_encodeSnappyBlockAsm10B: | ||
13255 | MOVOU X0, (DX) | ||
13256 | MOVOU X0, 16(DX) | ||
13257 | MOVOU X0, 32(DX) | ||
13258 | MOVOU X0, 48(DX) | ||
13259 | MOVOU X0, 64(DX) | ||
13260 | MOVOU X0, 80(DX) | ||
13261 | MOVOU X0, 96(DX) | ||
13262 | MOVOU X0, 112(DX) | ||
13263 | ADDQ $0x80, DX | ||
13264 | DECQ CX | ||
13265 | JNZ zero_loop_encodeSnappyBlockAsm10B | ||
13266 | MOVL $0x00000000, 12(SP) | ||
13267 | MOVQ src_len+32(FP), CX | ||
13268 | LEAQ -9(CX), DX | ||
13269 | LEAQ -8(CX), BX | ||
13270 | MOVL BX, 8(SP) | ||
13271 | SHRQ $0x05, CX | ||
13272 | SUBL CX, DX | ||
13273 | LEAQ (AX)(DX*1), DX | ||
13274 | MOVQ DX, (SP) | ||
13275 | MOVL $0x00000001, CX | ||
13276 | MOVL CX, 16(SP) | ||
13277 | MOVQ src_base+24(FP), DX | ||
13278 | |||
13279 | search_loop_encodeSnappyBlockAsm10B: | ||
13280 | MOVL CX, BX | ||
13281 | SUBL 12(SP), BX | ||
13282 | SHRL $0x05, BX | ||
13283 | LEAL 4(CX)(BX*1), BX | ||
13284 | CMPL BX, 8(SP) | ||
13285 | JAE emit_remainder_encodeSnappyBlockAsm10B | ||
13286 | MOVQ (DX)(CX*1), SI | ||
13287 | MOVL BX, 20(SP) | ||
13288 | MOVQ $0x9e3779b1, R8 | ||
13289 | MOVQ SI, R9 | ||
13290 | MOVQ SI, R10 | ||
13291 | SHRQ $0x08, R10 | ||
13292 | SHLQ $0x20, R9 | ||
13293 | IMULQ R8, R9 | ||
13294 | SHRQ $0x36, R9 | ||
13295 | SHLQ $0x20, R10 | ||
13296 | IMULQ R8, R10 | ||
13297 | SHRQ $0x36, R10 | ||
13298 | MOVL 24(SP)(R9*4), BX | ||
13299 | MOVL 24(SP)(R10*4), DI | ||
13300 | MOVL CX, 24(SP)(R9*4) | ||
13301 | LEAL 1(CX), R9 | ||
13302 | MOVL R9, 24(SP)(R10*4) | ||
13303 | MOVQ SI, R9 | ||
13304 | SHRQ $0x10, R9 | ||
13305 | SHLQ $0x20, R9 | ||
13306 | IMULQ R8, R9 | ||
13307 | SHRQ $0x36, R9 | ||
13308 | MOVL CX, R8 | ||
13309 | SUBL 16(SP), R8 | ||
13310 | MOVL 1(DX)(R8*1), R10 | ||
13311 | MOVQ SI, R8 | ||
13312 | SHRQ $0x08, R8 | ||
13313 | CMPL R8, R10 | ||
13314 | JNE no_repeat_found_encodeSnappyBlockAsm10B | ||
13315 | LEAL 1(CX), SI | ||
13316 | MOVL 12(SP), BX | ||
13317 | MOVL SI, DI | ||
13318 | SUBL 16(SP), DI | ||
13319 | JZ repeat_extend_back_end_encodeSnappyBlockAsm10B | ||
13320 | |||
13321 | repeat_extend_back_loop_encodeSnappyBlockAsm10B: | ||
13322 | CMPL SI, BX | ||
13323 | JBE repeat_extend_back_end_encodeSnappyBlockAsm10B | ||
13324 | MOVB -1(DX)(DI*1), R8 | ||
13325 | MOVB -1(DX)(SI*1), R9 | ||
13326 | CMPB R8, R9 | ||
13327 | JNE repeat_extend_back_end_encodeSnappyBlockAsm10B | ||
13328 | LEAL -1(SI), SI | ||
13329 | DECL DI | ||
13330 | JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B | ||
13331 | |||
13332 | repeat_extend_back_end_encodeSnappyBlockAsm10B: | ||
13333 | MOVL 12(SP), BX | ||
13334 | CMPL BX, SI | ||
13335 | JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B | ||
13336 | MOVL SI, DI | ||
13337 | MOVL SI, 12(SP) | ||
13338 | LEAQ (DX)(BX*1), R8 | ||
13339 | SUBL BX, DI | ||
13340 | LEAL -1(DI), BX | ||
13341 | CMPL BX, $0x3c | ||
13342 | JB one_byte_repeat_emit_encodeSnappyBlockAsm10B | ||
13343 | CMPL BX, $0x00000100 | ||
13344 | JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B | ||
13345 | JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B | ||
13346 | |||
13347 | three_bytes_repeat_emit_encodeSnappyBlockAsm10B: | ||
13348 | MOVB $0xf4, (AX) | ||
13349 | MOVW BX, 1(AX) | ||
13350 | ADDQ $0x03, AX | ||
13351 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B | ||
13352 | |||
13353 | two_bytes_repeat_emit_encodeSnappyBlockAsm10B: | ||
13354 | MOVB $0xf0, (AX) | ||
13355 | MOVB BL, 1(AX) | ||
13356 | ADDQ $0x02, AX | ||
13357 | CMPL BX, $0x40 | ||
13358 | JB memmove_repeat_emit_encodeSnappyBlockAsm10B | ||
13359 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B | ||
13360 | |||
13361 | one_byte_repeat_emit_encodeSnappyBlockAsm10B: | ||
13362 | SHLB $0x02, BL | ||
13363 | MOVB BL, (AX) | ||
13364 | ADDQ $0x01, AX | ||
13365 | |||
13366 | memmove_repeat_emit_encodeSnappyBlockAsm10B: | ||
13367 | LEAQ (AX)(DI*1), BX | ||
13368 | |||
13369 | // genMemMoveShort | ||
13370 | CMPQ DI, $0x08 | ||
13371 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 | ||
13372 | CMPQ DI, $0x10 | ||
13373 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 | ||
13374 | CMPQ DI, $0x20 | ||
13375 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 | ||
13376 | JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 | ||
13377 | |||
13378 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: | ||
13379 | MOVQ (R8), R9 | ||
13380 | MOVQ R9, (AX) | ||
13381 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B | ||
13382 | |||
13383 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: | ||
13384 | MOVQ (R8), R9 | ||
13385 | MOVQ -8(R8)(DI*1), R8 | ||
13386 | MOVQ R9, (AX) | ||
13387 | MOVQ R8, -8(AX)(DI*1) | ||
13388 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B | ||
13389 | |||
13390 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: | ||
13391 | MOVOU (R8), X0 | ||
13392 | MOVOU -16(R8)(DI*1), X1 | ||
13393 | MOVOU X0, (AX) | ||
13394 | MOVOU X1, -16(AX)(DI*1) | ||
13395 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B | ||
13396 | |||
13397 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: | ||
13398 | MOVOU (R8), X0 | ||
13399 | MOVOU 16(R8), X1 | ||
13400 | MOVOU -32(R8)(DI*1), X2 | ||
13401 | MOVOU -16(R8)(DI*1), X3 | ||
13402 | MOVOU X0, (AX) | ||
13403 | MOVOU X1, 16(AX) | ||
13404 | MOVOU X2, -32(AX)(DI*1) | ||
13405 | MOVOU X3, -16(AX)(DI*1) | ||
13406 | |||
13407 | memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: | ||
13408 | MOVQ BX, AX | ||
13409 | JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B | ||
13410 | |||
13411 | memmove_long_repeat_emit_encodeSnappyBlockAsm10B: | ||
13412 | LEAQ (AX)(DI*1), BX | ||
13413 | |||
13414 | // genMemMoveLong | ||
13415 | MOVOU (R8), X0 | ||
13416 | MOVOU 16(R8), X1 | ||
13417 | MOVOU -32(R8)(DI*1), X2 | ||
13418 | MOVOU -16(R8)(DI*1), X3 | ||
13419 | MOVQ DI, R10 | ||
13420 | SHRQ $0x05, R10 | ||
13421 | MOVQ AX, R9 | ||
13422 | ANDL $0x0000001f, R9 | ||
13423 | MOVQ $0x00000040, R11 | ||
13424 | SUBQ R9, R11 | ||
13425 | DECQ R10 | ||
13426 | JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 | ||
13427 | LEAQ -32(R8)(R11*1), R9 | ||
13428 | LEAQ -32(AX)(R11*1), R12 | ||
13429 | |||
13430 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: | ||
13431 | MOVOU (R9), X4 | ||
13432 | MOVOU 16(R9), X5 | ||
13433 | MOVOA X4, (R12) | ||
13434 | MOVOA X5, 16(R12) | ||
13435 | ADDQ $0x20, R12 | ||
13436 | ADDQ $0x20, R9 | ||
13437 | ADDQ $0x20, R11 | ||
13438 | DECQ R10 | ||
13439 | JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back | ||
13440 | |||
13441 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: | ||
13442 | MOVOU -32(R8)(R11*1), X4 | ||
13443 | MOVOU -16(R8)(R11*1), X5 | ||
13444 | MOVOA X4, -32(AX)(R11*1) | ||
13445 | MOVOA X5, -16(AX)(R11*1) | ||
13446 | ADDQ $0x20, R11 | ||
13447 | CMPQ DI, R11 | ||
13448 | JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 | ||
13449 | MOVOU X0, (AX) | ||
13450 | MOVOU X1, 16(AX) | ||
13451 | MOVOU X2, -32(AX)(DI*1) | ||
13452 | MOVOU X3, -16(AX)(DI*1) | ||
13453 | MOVQ BX, AX | ||
13454 | |||
13455 | emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: | ||
13456 | ADDL $0x05, CX | ||
13457 | MOVL CX, BX | ||
13458 | SUBL 16(SP), BX | ||
13459 | MOVQ src_len+32(FP), DI | ||
13460 | SUBL CX, DI | ||
13461 | LEAQ (DX)(CX*1), R8 | ||
13462 | LEAQ (DX)(BX*1), BX | ||
13463 | |||
13464 | // matchLen | ||
13465 | XORL R10, R10 | ||
13466 | |||
13467 | matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B: | ||
13468 | CMPL DI, $0x10 | ||
13469 | JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B | ||
13470 | MOVQ (R8)(R10*1), R9 | ||
13471 | MOVQ 8(R8)(R10*1), R11 | ||
13472 | XORQ (BX)(R10*1), R9 | ||
13473 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B | ||
13474 | XORQ 8(BX)(R10*1), R11 | ||
13475 | JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B | ||
13476 | LEAL -16(DI), DI | ||
13477 | LEAL 16(R10), R10 | ||
13478 | JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B | ||
13479 | |||
13480 | matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B: | ||
13481 | #ifdef GOAMD64_v3 | ||
13482 | TZCNTQ R11, R11 | ||
13483 | |||
13484 | #else | ||
13485 | BSFQ R11, R11 | ||
13486 | |||
13487 | #endif | ||
13488 | SARQ $0x03, R11 | ||
13489 | LEAL 8(R10)(R11*1), R10 | ||
13490 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B | ||
13491 | |||
13492 | matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B: | ||
13493 | CMPL DI, $0x08 | ||
13494 | JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B | ||
13495 | MOVQ (R8)(R10*1), R9 | ||
13496 | XORQ (BX)(R10*1), R9 | ||
13497 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B | ||
13498 | LEAL -8(DI), DI | ||
13499 | LEAL 8(R10), R10 | ||
13500 | JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B | ||
13501 | |||
13502 | matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B: | ||
13503 | #ifdef GOAMD64_v3 | ||
13504 | TZCNTQ R9, R9 | ||
13505 | |||
13506 | #else | ||
13507 | BSFQ R9, R9 | ||
13508 | |||
13509 | #endif | ||
13510 | SARQ $0x03, R9 | ||
13511 | LEAL (R10)(R9*1), R10 | ||
13512 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B | ||
13513 | |||
13514 | matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: | ||
13515 | CMPL DI, $0x04 | ||
13516 | JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B | ||
13517 | MOVL (R8)(R10*1), R9 | ||
13518 | CMPL (BX)(R10*1), R9 | ||
13519 | JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B | ||
13520 | LEAL -4(DI), DI | ||
13521 | LEAL 4(R10), R10 | ||
13522 | |||
13523 | matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: | ||
13524 | CMPL DI, $0x01 | ||
13525 | JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B | ||
13526 | JB repeat_extend_forward_end_encodeSnappyBlockAsm10B | ||
13527 | MOVW (R8)(R10*1), R9 | ||
13528 | CMPW (BX)(R10*1), R9 | ||
13529 | JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B | ||
13530 | LEAL 2(R10), R10 | ||
13531 | SUBL $0x02, DI | ||
13532 | JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B | ||
13533 | |||
13534 | matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: | ||
13535 | MOVB (R8)(R10*1), R9 | ||
13536 | CMPB (BX)(R10*1), R9 | ||
13537 | JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B | ||
13538 | LEAL 1(R10), R10 | ||
13539 | |||
13540 | repeat_extend_forward_end_encodeSnappyBlockAsm10B: | ||
13541 | ADDL R10, CX | ||
13542 | MOVL CX, BX | ||
13543 | SUBL SI, BX | ||
13544 | MOVL 16(SP), SI | ||
13545 | |||
13546 | // emitCopy | ||
13547 | two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: | ||
13548 | CMPL BX, $0x40 | ||
13549 | JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B | ||
13550 | MOVB $0xee, (AX) | ||
13551 | MOVW SI, 1(AX) | ||
13552 | LEAL -60(BX), BX | ||
13553 | ADDQ $0x03, AX | ||
13554 | JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B | ||
13555 | |||
13556 | two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: | ||
13557 | MOVL BX, DI | ||
13558 | SHLL $0x02, DI | ||
13559 | CMPL BX, $0x0c | ||
13560 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B | ||
13561 | CMPL SI, $0x00000800 | ||
13562 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B | ||
13563 | LEAL -15(DI), DI | ||
13564 | MOVB SI, 1(AX) | ||
13565 | SHRL $0x08, SI | ||
13566 | SHLL $0x05, SI | ||
13567 | ORL SI, DI | ||
13568 | MOVB DI, (AX) | ||
13569 | ADDQ $0x02, AX | ||
13570 | JMP repeat_end_emit_encodeSnappyBlockAsm10B | ||
13571 | |||
13572 | emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: | ||
13573 | LEAL -2(DI), DI | ||
13574 | MOVB DI, (AX) | ||
13575 | MOVW SI, 1(AX) | ||
13576 | ADDQ $0x03, AX | ||
13577 | |||
13578 | repeat_end_emit_encodeSnappyBlockAsm10B: | ||
13579 | MOVL CX, 12(SP) | ||
13580 | JMP search_loop_encodeSnappyBlockAsm10B | ||
13581 | |||
13582 | no_repeat_found_encodeSnappyBlockAsm10B: | ||
13583 | CMPL (DX)(BX*1), SI | ||
13584 | JEQ candidate_match_encodeSnappyBlockAsm10B | ||
13585 | SHRQ $0x08, SI | ||
13586 | MOVL 24(SP)(R9*4), BX | ||
13587 | LEAL 2(CX), R8 | ||
13588 | CMPL (DX)(DI*1), SI | ||
13589 | JEQ candidate2_match_encodeSnappyBlockAsm10B | ||
13590 | MOVL R8, 24(SP)(R9*4) | ||
13591 | SHRQ $0x08, SI | ||
13592 | CMPL (DX)(BX*1), SI | ||
13593 | JEQ candidate3_match_encodeSnappyBlockAsm10B | ||
13594 | MOVL 20(SP), CX | ||
13595 | JMP search_loop_encodeSnappyBlockAsm10B | ||
13596 | |||
13597 | candidate3_match_encodeSnappyBlockAsm10B: | ||
13598 | ADDL $0x02, CX | ||
13599 | JMP candidate_match_encodeSnappyBlockAsm10B | ||
13600 | |||
13601 | candidate2_match_encodeSnappyBlockAsm10B: | ||
13602 | MOVL R8, 24(SP)(R9*4) | ||
13603 | INCL CX | ||
13604 | MOVL DI, BX | ||
13605 | |||
13606 | candidate_match_encodeSnappyBlockAsm10B: | ||
13607 | MOVL 12(SP), SI | ||
13608 | TESTL BX, BX | ||
13609 | JZ match_extend_back_end_encodeSnappyBlockAsm10B | ||
13610 | |||
13611 | match_extend_back_loop_encodeSnappyBlockAsm10B: | ||
13612 | CMPL CX, SI | ||
13613 | JBE match_extend_back_end_encodeSnappyBlockAsm10B | ||
13614 | MOVB -1(DX)(BX*1), DI | ||
13615 | MOVB -1(DX)(CX*1), R8 | ||
13616 | CMPB DI, R8 | ||
13617 | JNE match_extend_back_end_encodeSnappyBlockAsm10B | ||
13618 | LEAL -1(CX), CX | ||
13619 | DECL BX | ||
13620 | JZ match_extend_back_end_encodeSnappyBlockAsm10B | ||
13621 | JMP match_extend_back_loop_encodeSnappyBlockAsm10B | ||
13622 | |||
13623 | match_extend_back_end_encodeSnappyBlockAsm10B: | ||
13624 | MOVL CX, SI | ||
13625 | SUBL 12(SP), SI | ||
13626 | LEAQ 3(AX)(SI*1), SI | ||
13627 | CMPQ SI, (SP) | ||
13628 | JB match_dst_size_check_encodeSnappyBlockAsm10B | ||
13629 | MOVQ $0x00000000, ret+48(FP) | ||
13630 | RET | ||
13631 | |||
13632 | match_dst_size_check_encodeSnappyBlockAsm10B: | ||
13633 | MOVL CX, SI | ||
13634 | MOVL 12(SP), DI | ||
13635 | CMPL DI, SI | ||
13636 | JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B | ||
13637 | MOVL SI, R8 | ||
13638 | MOVL SI, 12(SP) | ||
13639 | LEAQ (DX)(DI*1), SI | ||
13640 | SUBL DI, R8 | ||
13641 | LEAL -1(R8), DI | ||
13642 | CMPL DI, $0x3c | ||
13643 | JB one_byte_match_emit_encodeSnappyBlockAsm10B | ||
13644 | CMPL DI, $0x00000100 | ||
13645 | JB two_bytes_match_emit_encodeSnappyBlockAsm10B | ||
13646 | JB three_bytes_match_emit_encodeSnappyBlockAsm10B | ||
13647 | |||
13648 | three_bytes_match_emit_encodeSnappyBlockAsm10B: | ||
13649 | MOVB $0xf4, (AX) | ||
13650 | MOVW DI, 1(AX) | ||
13651 | ADDQ $0x03, AX | ||
13652 | JMP memmove_long_match_emit_encodeSnappyBlockAsm10B | ||
13653 | |||
13654 | two_bytes_match_emit_encodeSnappyBlockAsm10B: | ||
13655 | MOVB $0xf0, (AX) | ||
13656 | MOVB DI, 1(AX) | ||
13657 | ADDQ $0x02, AX | ||
13658 | CMPL DI, $0x40 | ||
13659 | JB memmove_match_emit_encodeSnappyBlockAsm10B | ||
13660 | JMP memmove_long_match_emit_encodeSnappyBlockAsm10B | ||
13661 | |||
13662 | one_byte_match_emit_encodeSnappyBlockAsm10B: | ||
13663 | SHLB $0x02, DI | ||
13664 | MOVB DI, (AX) | ||
13665 | ADDQ $0x01, AX | ||
13666 | |||
13667 | memmove_match_emit_encodeSnappyBlockAsm10B: | ||
13668 | LEAQ (AX)(R8*1), DI | ||
13669 | |||
13670 | // genMemMoveShort | ||
13671 | CMPQ R8, $0x08 | ||
13672 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 | ||
13673 | CMPQ R8, $0x10 | ||
13674 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 | ||
13675 | CMPQ R8, $0x20 | ||
13676 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 | ||
13677 | JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 | ||
13678 | |||
13679 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: | ||
13680 | MOVQ (SI), R9 | ||
13681 | MOVQ R9, (AX) | ||
13682 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B | ||
13683 | |||
13684 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: | ||
13685 | MOVQ (SI), R9 | ||
13686 | MOVQ -8(SI)(R8*1), SI | ||
13687 | MOVQ R9, (AX) | ||
13688 | MOVQ SI, -8(AX)(R8*1) | ||
13689 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B | ||
13690 | |||
13691 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: | ||
13692 | MOVOU (SI), X0 | ||
13693 | MOVOU -16(SI)(R8*1), X1 | ||
13694 | MOVOU X0, (AX) | ||
13695 | MOVOU X1, -16(AX)(R8*1) | ||
13696 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B | ||
13697 | |||
13698 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: | ||
13699 | MOVOU (SI), X0 | ||
13700 | MOVOU 16(SI), X1 | ||
13701 | MOVOU -32(SI)(R8*1), X2 | ||
13702 | MOVOU -16(SI)(R8*1), X3 | ||
13703 | MOVOU X0, (AX) | ||
13704 | MOVOU X1, 16(AX) | ||
13705 | MOVOU X2, -32(AX)(R8*1) | ||
13706 | MOVOU X3, -16(AX)(R8*1) | ||
13707 | |||
13708 | memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: | ||
13709 | MOVQ DI, AX | ||
13710 | JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B | ||
13711 | |||
13712 | memmove_long_match_emit_encodeSnappyBlockAsm10B: | ||
13713 | LEAQ (AX)(R8*1), DI | ||
13714 | |||
13715 | // genMemMoveLong | ||
13716 | MOVOU (SI), X0 | ||
13717 | MOVOU 16(SI), X1 | ||
13718 | MOVOU -32(SI)(R8*1), X2 | ||
13719 | MOVOU -16(SI)(R8*1), X3 | ||
13720 | MOVQ R8, R10 | ||
13721 | SHRQ $0x05, R10 | ||
13722 | MOVQ AX, R9 | ||
13723 | ANDL $0x0000001f, R9 | ||
13724 | MOVQ $0x00000040, R11 | ||
13725 | SUBQ R9, R11 | ||
13726 | DECQ R10 | ||
13727 | JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 | ||
13728 | LEAQ -32(SI)(R11*1), R9 | ||
13729 | LEAQ -32(AX)(R11*1), R12 | ||
13730 | |||
13731 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: | ||
13732 | MOVOU (R9), X4 | ||
13733 | MOVOU 16(R9), X5 | ||
13734 | MOVOA X4, (R12) | ||
13735 | MOVOA X5, 16(R12) | ||
13736 | ADDQ $0x20, R12 | ||
13737 | ADDQ $0x20, R9 | ||
13738 | ADDQ $0x20, R11 | ||
13739 | DECQ R10 | ||
13740 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back | ||
13741 | |||
13742 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: | ||
13743 | MOVOU -32(SI)(R11*1), X4 | ||
13744 | MOVOU -16(SI)(R11*1), X5 | ||
13745 | MOVOA X4, -32(AX)(R11*1) | ||
13746 | MOVOA X5, -16(AX)(R11*1) | ||
13747 | ADDQ $0x20, R11 | ||
13748 | CMPQ R8, R11 | ||
13749 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 | ||
13750 | MOVOU X0, (AX) | ||
13751 | MOVOU X1, 16(AX) | ||
13752 | MOVOU X2, -32(AX)(R8*1) | ||
13753 | MOVOU X3, -16(AX)(R8*1) | ||
13754 | MOVQ DI, AX | ||
13755 | |||
13756 | emit_literal_done_match_emit_encodeSnappyBlockAsm10B: | ||
13757 | match_nolit_loop_encodeSnappyBlockAsm10B: | ||
13758 | MOVL CX, SI | ||
13759 | SUBL BX, SI | ||
13760 | MOVL SI, 16(SP) | ||
13761 | ADDL $0x04, CX | ||
13762 | ADDL $0x04, BX | ||
13763 | MOVQ src_len+32(FP), SI | ||
13764 | SUBL CX, SI | ||
13765 | LEAQ (DX)(CX*1), DI | ||
13766 | LEAQ (DX)(BX*1), BX | ||
13767 | |||
13768 | // matchLen | ||
13769 | XORL R9, R9 | ||
13770 | |||
13771 | matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B: | ||
13772 | CMPL SI, $0x10 | ||
13773 | JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B | ||
13774 | MOVQ (DI)(R9*1), R8 | ||
13775 | MOVQ 8(DI)(R9*1), R10 | ||
13776 | XORQ (BX)(R9*1), R8 | ||
13777 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B | ||
13778 | XORQ 8(BX)(R9*1), R10 | ||
13779 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B | ||
13780 | LEAL -16(SI), SI | ||
13781 | LEAL 16(R9), R9 | ||
13782 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B | ||
13783 | |||
13784 | matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B: | ||
13785 | #ifdef GOAMD64_v3 | ||
13786 | TZCNTQ R10, R10 | ||
13787 | |||
13788 | #else | ||
13789 | BSFQ R10, R10 | ||
13790 | |||
13791 | #endif | ||
13792 | SARQ $0x03, R10 | ||
13793 | LEAL 8(R9)(R10*1), R9 | ||
13794 | JMP match_nolit_end_encodeSnappyBlockAsm10B | ||
13795 | |||
13796 | matchlen_match8_match_nolit_encodeSnappyBlockAsm10B: | ||
13797 | CMPL SI, $0x08 | ||
13798 | JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B | ||
13799 | MOVQ (DI)(R9*1), R8 | ||
13800 | XORQ (BX)(R9*1), R8 | ||
13801 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B | ||
13802 | LEAL -8(SI), SI | ||
13803 | LEAL 8(R9), R9 | ||
13804 | JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B | ||
13805 | |||
13806 | matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B: | ||
13807 | #ifdef GOAMD64_v3 | ||
13808 | TZCNTQ R8, R8 | ||
13809 | |||
13810 | #else | ||
13811 | BSFQ R8, R8 | ||
13812 | |||
13813 | #endif | ||
13814 | SARQ $0x03, R8 | ||
13815 | LEAL (R9)(R8*1), R9 | ||
13816 | JMP match_nolit_end_encodeSnappyBlockAsm10B | ||
13817 | |||
13818 | matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: | ||
13819 | CMPL SI, $0x04 | ||
13820 | JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B | ||
13821 | MOVL (DI)(R9*1), R8 | ||
13822 | CMPL (BX)(R9*1), R8 | ||
13823 | JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B | ||
13824 | LEAL -4(SI), SI | ||
13825 | LEAL 4(R9), R9 | ||
13826 | |||
13827 | matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: | ||
13828 | CMPL SI, $0x01 | ||
13829 | JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B | ||
13830 | JB match_nolit_end_encodeSnappyBlockAsm10B | ||
13831 | MOVW (DI)(R9*1), R8 | ||
13832 | CMPW (BX)(R9*1), R8 | ||
13833 | JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B | ||
13834 | LEAL 2(R9), R9 | ||
13835 | SUBL $0x02, SI | ||
13836 | JZ match_nolit_end_encodeSnappyBlockAsm10B | ||
13837 | |||
13838 | matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: | ||
13839 | MOVB (DI)(R9*1), R8 | ||
13840 | CMPB (BX)(R9*1), R8 | ||
13841 | JNE match_nolit_end_encodeSnappyBlockAsm10B | ||
13842 | LEAL 1(R9), R9 | ||
13843 | |||
13844 | match_nolit_end_encodeSnappyBlockAsm10B: | ||
13845 | ADDL R9, CX | ||
13846 | MOVL 16(SP), BX | ||
13847 | ADDL $0x04, R9 | ||
13848 | MOVL CX, 12(SP) | ||
13849 | |||
13850 | // emitCopy | ||
13851 | two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: | ||
13852 | CMPL R9, $0x40 | ||
13853 | JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B | ||
13854 | MOVB $0xee, (AX) | ||
13855 | MOVW BX, 1(AX) | ||
13856 | LEAL -60(R9), R9 | ||
13857 | ADDQ $0x03, AX | ||
13858 | JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B | ||
13859 | |||
13860 | two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: | ||
13861 | MOVL R9, SI | ||
13862 | SHLL $0x02, SI | ||
13863 | CMPL R9, $0x0c | ||
13864 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B | ||
13865 | CMPL BX, $0x00000800 | ||
13866 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B | ||
13867 | LEAL -15(SI), SI | ||
13868 | MOVB BL, 1(AX) | ||
13869 | SHRL $0x08, BX | ||
13870 | SHLL $0x05, BX | ||
13871 | ORL BX, SI | ||
13872 | MOVB SI, (AX) | ||
13873 | ADDQ $0x02, AX | ||
13874 | JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B | ||
13875 | |||
13876 | emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: | ||
13877 | LEAL -2(SI), SI | ||
13878 | MOVB SI, (AX) | ||
13879 | MOVW BX, 1(AX) | ||
13880 | ADDQ $0x03, AX | ||
13881 | |||
13882 | match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: | ||
13883 | CMPL CX, 8(SP) | ||
13884 | JAE emit_remainder_encodeSnappyBlockAsm10B | ||
13885 | MOVQ -2(DX)(CX*1), SI | ||
13886 | CMPQ AX, (SP) | ||
13887 | JB match_nolit_dst_ok_encodeSnappyBlockAsm10B | ||
13888 | MOVQ $0x00000000, ret+48(FP) | ||
13889 | RET | ||
13890 | |||
13891 | match_nolit_dst_ok_encodeSnappyBlockAsm10B: | ||
13892 | MOVQ $0x9e3779b1, R8 | ||
13893 | MOVQ SI, DI | ||
13894 | SHRQ $0x10, SI | ||
13895 | MOVQ SI, BX | ||
13896 | SHLQ $0x20, DI | ||
13897 | IMULQ R8, DI | ||
13898 | SHRQ $0x36, DI | ||
13899 | SHLQ $0x20, BX | ||
13900 | IMULQ R8, BX | ||
13901 | SHRQ $0x36, BX | ||
13902 | LEAL -2(CX), R8 | ||
13903 | LEAQ 24(SP)(BX*4), R9 | ||
13904 | MOVL (R9), BX | ||
13905 | MOVL R8, 24(SP)(DI*4) | ||
13906 | MOVL CX, (R9) | ||
13907 | CMPL (DX)(BX*1), SI | ||
13908 | JEQ match_nolit_loop_encodeSnappyBlockAsm10B | ||
13909 | INCL CX | ||
13910 | JMP search_loop_encodeSnappyBlockAsm10B | ||
13911 | |||
13912 | emit_remainder_encodeSnappyBlockAsm10B: | ||
13913 | MOVQ src_len+32(FP), CX | ||
13914 | SUBL 12(SP), CX | ||
13915 | LEAQ 3(AX)(CX*1), CX | ||
13916 | CMPQ CX, (SP) | ||
13917 | JB emit_remainder_ok_encodeSnappyBlockAsm10B | ||
13918 | MOVQ $0x00000000, ret+48(FP) | ||
13919 | RET | ||
13920 | |||
13921 | emit_remainder_ok_encodeSnappyBlockAsm10B: | ||
13922 | MOVQ src_len+32(FP), CX | ||
13923 | MOVL 12(SP), BX | ||
13924 | CMPL BX, CX | ||
13925 | JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B | ||
13926 | MOVL CX, SI | ||
13927 | MOVL CX, 12(SP) | ||
13928 | LEAQ (DX)(BX*1), CX | ||
13929 | SUBL BX, SI | ||
13930 | LEAL -1(SI), DX | ||
13931 | CMPL DX, $0x3c | ||
13932 | JB one_byte_emit_remainder_encodeSnappyBlockAsm10B | ||
13933 | CMPL DX, $0x00000100 | ||
13934 | JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B | ||
13935 | JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B | ||
13936 | |||
13937 | three_bytes_emit_remainder_encodeSnappyBlockAsm10B: | ||
13938 | MOVB $0xf4, (AX) | ||
13939 | MOVW DX, 1(AX) | ||
13940 | ADDQ $0x03, AX | ||
13941 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B | ||
13942 | |||
13943 | two_bytes_emit_remainder_encodeSnappyBlockAsm10B: | ||
13944 | MOVB $0xf0, (AX) | ||
13945 | MOVB DL, 1(AX) | ||
13946 | ADDQ $0x02, AX | ||
13947 | CMPL DX, $0x40 | ||
13948 | JB memmove_emit_remainder_encodeSnappyBlockAsm10B | ||
13949 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B | ||
13950 | |||
13951 | one_byte_emit_remainder_encodeSnappyBlockAsm10B: | ||
13952 | SHLB $0x02, DL | ||
13953 | MOVB DL, (AX) | ||
13954 | ADDQ $0x01, AX | ||
13955 | |||
13956 | memmove_emit_remainder_encodeSnappyBlockAsm10B: | ||
13957 | LEAQ (AX)(SI*1), DX | ||
13958 | MOVL SI, BX | ||
13959 | |||
13960 | // genMemMoveShort | ||
13961 | CMPQ BX, $0x03 | ||
13962 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 | ||
13963 | JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 | ||
13964 | CMPQ BX, $0x08 | ||
13965 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 | ||
13966 | CMPQ BX, $0x10 | ||
13967 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 | ||
13968 | CMPQ BX, $0x20 | ||
13969 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 | ||
13970 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 | ||
13971 | |||
13972 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: | ||
13973 | MOVB (CX), SI | ||
13974 | MOVB -1(CX)(BX*1), CL | ||
13975 | MOVB SI, (AX) | ||
13976 | MOVB CL, -1(AX)(BX*1) | ||
13977 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B | ||
13978 | |||
13979 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: | ||
13980 | MOVW (CX), SI | ||
13981 | MOVB 2(CX), CL | ||
13982 | MOVW SI, (AX) | ||
13983 | MOVB CL, 2(AX) | ||
13984 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B | ||
13985 | |||
13986 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: | ||
13987 | MOVL (CX), SI | ||
13988 | MOVL -4(CX)(BX*1), CX | ||
13989 | MOVL SI, (AX) | ||
13990 | MOVL CX, -4(AX)(BX*1) | ||
13991 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B | ||
13992 | |||
13993 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: | ||
13994 | MOVQ (CX), SI | ||
13995 | MOVQ -8(CX)(BX*1), CX | ||
13996 | MOVQ SI, (AX) | ||
13997 | MOVQ CX, -8(AX)(BX*1) | ||
13998 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B | ||
13999 | |||
14000 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: | ||
14001 | MOVOU (CX), X0 | ||
14002 | MOVOU -16(CX)(BX*1), X1 | ||
14003 | MOVOU X0, (AX) | ||
14004 | MOVOU X1, -16(AX)(BX*1) | ||
14005 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B | ||
14006 | |||
14007 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: | ||
14008 | MOVOU (CX), X0 | ||
14009 | MOVOU 16(CX), X1 | ||
14010 | MOVOU -32(CX)(BX*1), X2 | ||
14011 | MOVOU -16(CX)(BX*1), X3 | ||
14012 | MOVOU X0, (AX) | ||
14013 | MOVOU X1, 16(AX) | ||
14014 | MOVOU X2, -32(AX)(BX*1) | ||
14015 | MOVOU X3, -16(AX)(BX*1) | ||
14016 | |||
14017 | memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: | ||
14018 | MOVQ DX, AX | ||
14019 | JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B | ||
14020 | |||
14021 | memmove_long_emit_remainder_encodeSnappyBlockAsm10B: | ||
14022 | LEAQ (AX)(SI*1), DX | ||
14023 | MOVL SI, BX | ||
14024 | |||
14025 | // genMemMoveLong | ||
14026 | MOVOU (CX), X0 | ||
14027 | MOVOU 16(CX), X1 | ||
14028 | MOVOU -32(CX)(BX*1), X2 | ||
14029 | MOVOU -16(CX)(BX*1), X3 | ||
14030 | MOVQ BX, DI | ||
14031 | SHRQ $0x05, DI | ||
14032 | MOVQ AX, SI | ||
14033 | ANDL $0x0000001f, SI | ||
14034 | MOVQ $0x00000040, R8 | ||
14035 | SUBQ SI, R8 | ||
14036 | DECQ DI | ||
14037 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 | ||
14038 | LEAQ -32(CX)(R8*1), SI | ||
14039 | LEAQ -32(AX)(R8*1), R9 | ||
14040 | |||
14041 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: | ||
14042 | MOVOU (SI), X4 | ||
14043 | MOVOU 16(SI), X5 | ||
14044 | MOVOA X4, (R9) | ||
14045 | MOVOA X5, 16(R9) | ||
14046 | ADDQ $0x20, R9 | ||
14047 | ADDQ $0x20, SI | ||
14048 | ADDQ $0x20, R8 | ||
14049 | DECQ DI | ||
14050 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back | ||
14051 | |||
14052 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: | ||
14053 | MOVOU -32(CX)(R8*1), X4 | ||
14054 | MOVOU -16(CX)(R8*1), X5 | ||
14055 | MOVOA X4, -32(AX)(R8*1) | ||
14056 | MOVOA X5, -16(AX)(R8*1) | ||
14057 | ADDQ $0x20, R8 | ||
14058 | CMPQ BX, R8 | ||
14059 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 | ||
14060 | MOVOU X0, (AX) | ||
14061 | MOVOU X1, 16(AX) | ||
14062 | MOVOU X2, -32(AX)(BX*1) | ||
14063 | MOVOU X3, -16(AX)(BX*1) | ||
14064 | MOVQ DX, AX | ||
14065 | |||
14066 | emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: | ||
14067 | MOVQ dst_base+0(FP), CX | ||
14068 | SUBQ CX, AX | ||
14069 | MOVQ AX, ret+48(FP) | ||
14070 | RET | ||
14071 | |||
14072 | // func encodeSnappyBlockAsm8B(dst []byte, src []byte) int | ||
14073 | // Requires: BMI, SSE2 | ||
// Entry and prologue. The $1048 frame is 24 bytes of scalar slots at 0..23(SP)
// followed by the 1024 bytes at 24(SP) that the zero loop clears (8 x 128 bytes).
// Slots initialised here:
//   (SP)   = dst_base + (src_len - 9 - src_len>>5); AX is compared against it before output is written
//   8(SP)  = src_len - 8; upper bound for the search position
//   12(SP) = 0; src offset of the first byte that has not been emitted yet
//   16(SP) = 1; current repeat offset
// Leaves AX = dst_base (output cursor), CX = 1 (search position), DX = src_base.
14074 | TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 | ||
14075 | MOVQ dst_base+0(FP), AX | ||
14076 | MOVQ $0x00000008, CX | ||
14077 | LEAQ 24(SP), DX | ||
14078 | PXOR X0, X0 | ||
14079 | |||
// Clear the table: each pass stores eight 16-byte zero vectors and advances DX by 128.
14080 | zero_loop_encodeSnappyBlockAsm8B: | ||
14081 | MOVOU X0, (DX) | ||
14082 | MOVOU X0, 16(DX) | ||
14083 | MOVOU X0, 32(DX) | ||
14084 | MOVOU X0, 48(DX) | ||
14085 | MOVOU X0, 64(DX) | ||
14086 | MOVOU X0, 80(DX) | ||
14087 | MOVOU X0, 96(DX) | ||
14088 | MOVOU X0, 112(DX) | ||
14089 | ADDQ $0x80, DX | ||
14090 | DECQ CX | ||
14091 | JNZ zero_loop_encodeSnappyBlockAsm8B | ||
14092 | MOVL $0x00000000, 12(SP) | ||
14093 | MOVQ src_len+32(FP), CX | ||
14094 | LEAQ -9(CX), DX | ||
14095 | LEAQ -8(CX), BX | ||
14096 | MOVL BX, 8(SP) | ||
14097 | SHRQ $0x05, CX | ||
14098 | SUBL CX, DX | ||
14099 | LEAQ (AX)(DX*1), DX | ||
14100 | MOVQ DX, (SP) | ||
14101 | MOVL $0x00000001, CX | ||
14102 | MOVL CX, 16(SP) | ||
14103 | MOVQ src_base+24(FP), DX | ||
14104 | |||
// Main search step. CX = current src offset, DX = src base, AX = output cursor.
// 1. Next probe position = CX + 4 + ((CX - 12(SP)) >> 4); saved in 20(SP).
//    If it is >= 8(SP) the loop ends and the remainder is emitted.
// 2. SI = 8 bytes at src[CX]. The 4-byte windows at CX, CX+1 and CX+2 are hashed as
//    ((x << 32) * 0x9e3779b1) >> 56, giving an 8-bit index into the 4-byte entries at 24(SP).
// 3. BX / DI = previous table entries for the CX / CX+1 hashes; those entries are then
//    overwritten with CX and CX+1. R9 is left holding the hash for CX+2.
// 4. Repeat probe: the 4 bytes at CX+1 are compared with the 4 bytes at CX+1-16(SP).
//    On a match, SI = CX+1, BX = 12(SP), DI = SI - 16(SP), and the backward extension is
//    skipped when DI is zero.
14105 | search_loop_encodeSnappyBlockAsm8B: | ||
14106 | MOVL CX, BX | ||
14107 | SUBL 12(SP), BX | ||
14108 | SHRL $0x04, BX | ||
14109 | LEAL 4(CX)(BX*1), BX | ||
14110 | CMPL BX, 8(SP) | ||
14111 | JAE emit_remainder_encodeSnappyBlockAsm8B | ||
14112 | MOVQ (DX)(CX*1), SI | ||
14113 | MOVL BX, 20(SP) | ||
14114 | MOVQ $0x9e3779b1, R8 | ||
14115 | MOVQ SI, R9 | ||
14116 | MOVQ SI, R10 | ||
14117 | SHRQ $0x08, R10 | ||
14118 | SHLQ $0x20, R9 | ||
14119 | IMULQ R8, R9 | ||
14120 | SHRQ $0x38, R9 | ||
14121 | SHLQ $0x20, R10 | ||
14122 | IMULQ R8, R10 | ||
14123 | SHRQ $0x38, R10 | ||
14124 | MOVL 24(SP)(R9*4), BX | ||
14125 | MOVL 24(SP)(R10*4), DI | ||
14126 | MOVL CX, 24(SP)(R9*4) | ||
14127 | LEAL 1(CX), R9 | ||
14128 | MOVL R9, 24(SP)(R10*4) | ||
14129 | MOVQ SI, R9 | ||
14130 | SHRQ $0x10, R9 | ||
14131 | SHLQ $0x20, R9 | ||
14132 | IMULQ R8, R9 | ||
14133 | SHRQ $0x38, R9 | ||
14134 | MOVL CX, R8 | ||
14135 | SUBL 16(SP), R8 | ||
14136 | MOVL 1(DX)(R8*1), R10 | ||
14137 | MOVQ SI, R8 | ||
14138 | SHRQ $0x08, R8 | ||
14139 | CMPL R8, R10 | ||
14140 | JNE no_repeat_found_encodeSnappyBlockAsm8B | ||
14141 | LEAL 1(CX), SI | ||
14142 | MOVL 12(SP), BX | ||
14143 | MOVL SI, DI | ||
14144 | SUBL 16(SP), DI | ||
14145 | JZ repeat_extend_back_end_encodeSnappyBlockAsm8B | ||
14146 | |||
// Extend the repeat match backwards. SI = start of the match in src, DI = start of the
// earlier copy, BX = 12(SP). While SI > BX and src[DI-1] == src[SI-1], both indices step
// back by one; the loop also stops when DI reaches zero.
14147 | repeat_extend_back_loop_encodeSnappyBlockAsm8B: | ||
14148 | CMPL SI, BX | ||
14149 | JBE repeat_extend_back_end_encodeSnappyBlockAsm8B | ||
14150 | MOVB -1(DX)(DI*1), R8 | ||
14151 | MOVB -1(DX)(SI*1), R9 | ||
14152 | CMPB R8, R9 | ||
14153 | JNE repeat_extend_back_end_encodeSnappyBlockAsm8B | ||
14154 | LEAL -1(SI), SI | ||
14155 | DECL DI | ||
14156 | JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B | ||
14157 | |||
// Emit src[12(SP):SI] as a literal ahead of the repeat copy; nothing is written when
// 12(SP) == SI. 12(SP) is advanced to SI. R8 = source pointer, DI = literal length,
// BX = length-1, which selects the header written at AX:
//   BX < 0x3c  -> 1 byte : BX<<2
//   BX < 0x100 -> 2 bytes: 0xf0, BX
//   otherwise  -> 3 bytes: 0xf4, 16-bit BX
// Payloads with BX < 0x40 go through genMemMoveShort, all others through genMemMoveLong.
// Both copy paths finish with AX = header end + DI.
14158 | repeat_extend_back_end_encodeSnappyBlockAsm8B: | ||
14159 | MOVL 12(SP), BX | ||
14160 | CMPL BX, SI | ||
14161 | JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B | ||
14162 | MOVL SI, DI | ||
14163 | MOVL SI, 12(SP) | ||
14164 | LEAQ (DX)(BX*1), R8 | ||
14165 | SUBL BX, DI | ||
14166 | LEAL -1(DI), BX | ||
14167 | CMPL BX, $0x3c | ||
14168 | JB one_byte_repeat_emit_encodeSnappyBlockAsm8B | ||
14169 | CMPL BX, $0x00000100 | ||
14170 | JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B | ||
// NOTE(review): this JB tests the same flags as the JB above, so it is never taken;
// execution reaches the three-byte label by falling through.
14171 | JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B | ||
14172 | |||
14173 | three_bytes_repeat_emit_encodeSnappyBlockAsm8B: | ||
14174 | MOVB $0xf4, (AX) | ||
14175 | MOVW BX, 1(AX) | ||
14176 | ADDQ $0x03, AX | ||
14177 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B | ||
14178 | |||
14179 | two_bytes_repeat_emit_encodeSnappyBlockAsm8B: | ||
14180 | MOVB $0xf0, (AX) | ||
14181 | MOVB BL, 1(AX) | ||
14182 | ADDQ $0x02, AX | ||
14183 | CMPL BX, $0x40 | ||
14184 | JB memmove_repeat_emit_encodeSnappyBlockAsm8B | ||
14185 | JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B | ||
14186 | |||
14187 | one_byte_repeat_emit_encodeSnappyBlockAsm8B: | ||
14188 | SHLB $0x02, BL | ||
14189 | MOVB BL, (AX) | ||
14190 | ADDQ $0x01, AX | ||
14191 | |||
// Short copy: BX = AX + DI is the output cursor after the literal.
14192 | memmove_repeat_emit_encodeSnappyBlockAsm8B: | ||
14193 | LEAQ (AX)(DI*1), BX | ||
14194 | |||
14195 | // genMemMoveShort | ||
14196 | CMPQ DI, $0x08 | ||
14197 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 | ||
14198 | CMPQ DI, $0x10 | ||
14199 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 | ||
14200 | CMPQ DI, $0x20 | ||
14201 | JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 | ||
14202 | JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 | ||
14203 | |||
// DI <= 8: a full 8 bytes are loaded and stored regardless of DI.
// NOTE(review): presumably safe because of the src/dst margins set up in the prologue - confirm.
14204 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: | ||
14205 | MOVQ (R8), R9 | ||
14206 | MOVQ R9, (AX) | ||
14207 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B | ||
14208 | |||
// Larger sizes use a head load and a tail load that may overlap, then store both.
14209 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: | ||
14210 | MOVQ (R8), R9 | ||
14211 | MOVQ -8(R8)(DI*1), R8 | ||
14212 | MOVQ R9, (AX) | ||
14213 | MOVQ R8, -8(AX)(DI*1) | ||
14214 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B | ||
14215 | |||
14216 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: | ||
14217 | MOVOU (R8), X0 | ||
14218 | MOVOU -16(R8)(DI*1), X1 | ||
14219 | MOVOU X0, (AX) | ||
14220 | MOVOU X1, -16(AX)(DI*1) | ||
14221 | JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B | ||
14222 | |||
14223 | emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: | ||
14224 | MOVOU (R8), X0 | ||
14225 | MOVOU 16(R8), X1 | ||
14226 | MOVOU -32(R8)(DI*1), X2 | ||
14227 | MOVOU -16(R8)(DI*1), X3 | ||
14228 | MOVOU X0, (AX) | ||
14229 | MOVOU X1, 16(AX) | ||
14230 | MOVOU X2, -32(AX)(DI*1) | ||
14231 | MOVOU X3, -16(AX)(DI*1) | ||
14232 | |||
14233 | memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: | ||
14234 | MOVQ BX, AX | ||
14235 | JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B | ||
14236 | |||
// Long copy: the first and last 32 bytes are preloaded into X0-X3 and stored last with
// unaligned moves; the middle is copied 32 bytes per iteration with MOVOA stores at
// offsets derived from the low 5 bits of AX (R11 = 64 - (AX & 31)).
14237 | memmove_long_repeat_emit_encodeSnappyBlockAsm8B: | ||
14238 | LEAQ (AX)(DI*1), BX | ||
14239 | |||
14240 | // genMemMoveLong | ||
14241 | MOVOU (R8), X0 | ||
14242 | MOVOU 16(R8), X1 | ||
14243 | MOVOU -32(R8)(DI*1), X2 | ||
14244 | MOVOU -16(R8)(DI*1), X3 | ||
14245 | MOVQ DI, R10 | ||
14246 | SHRQ $0x05, R10 | ||
14247 | MOVQ AX, R9 | ||
14248 | ANDL $0x0000001f, R9 | ||
14249 | MOVQ $0x00000040, R11 | ||
14250 | SUBQ R9, R11 | ||
14251 | DECQ R10 | ||
14252 | JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 | ||
14253 | LEAQ -32(R8)(R11*1), R9 | ||
14254 | LEAQ -32(AX)(R11*1), R12 | ||
14255 | |||
14256 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: | ||
14257 | MOVOU (R9), X4 | ||
14258 | MOVOU 16(R9), X5 | ||
14259 | MOVOA X4, (R12) | ||
14260 | MOVOA X5, 16(R12) | ||
14261 | ADDQ $0x20, R12 | ||
14262 | ADDQ $0x20, R9 | ||
14263 | ADDQ $0x20, R11 | ||
14264 | DECQ R10 | ||
14265 | JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back | ||
14266 | |||
14267 | emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: | ||
14268 | MOVOU -32(R8)(R11*1), X4 | ||
14269 | MOVOU -16(R8)(R11*1), X5 | ||
14270 | MOVOA X4, -32(AX)(R11*1) | ||
14271 | MOVOA X5, -16(AX)(R11*1) | ||
14272 | ADDQ $0x20, R11 | ||
14273 | CMPQ DI, R11 | ||
14274 | JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 | ||
14275 | MOVOU X0, (AX) | ||
14276 | MOVOU X1, 16(AX) | ||
14277 | MOVOU X2, -32(AX)(DI*1) | ||
14278 | MOVOU X3, -16(AX)(DI*1) | ||
14279 | MOVQ BX, AX | ||
14280 | |||
// Extend the repeat match forwards. CX is advanced by 5 (the match was probed at CX+1 and
// 4 bytes were already compared in the search step). DI = src_len - CX bytes remain,
// R8 = &src[CX], BX = &src[CX - 16(SP)], and R10 accumulates the number of matching bytes.
// matchLen compares 16 bytes per loop pass, then 8, 4, 2 and 1 bytes. When a quadword
// differs, the bit index of the lowest set bit of the XOR, shifted right by 3, is the count
// of equal leading bytes in that quadword.
14281 | emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: | ||
14282 | ADDL $0x05, CX | ||
14283 | MOVL CX, BX | ||
14284 | SUBL 16(SP), BX | ||
14285 | MOVQ src_len+32(FP), DI | ||
14286 | SUBL CX, DI | ||
14287 | LEAQ (DX)(CX*1), R8 | ||
14288 | LEAQ (DX)(BX*1), BX | ||
14289 | |||
14290 | // matchLen | ||
14291 | XORL R10, R10 | ||
14292 | |||
14293 | matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B: | ||
14294 | CMPL DI, $0x10 | ||
14295 | JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B | ||
14296 | MOVQ (R8)(R10*1), R9 | ||
14297 | MOVQ 8(R8)(R10*1), R11 | ||
14298 | XORQ (BX)(R10*1), R9 | ||
14299 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B | ||
14300 | XORQ 8(BX)(R10*1), R11 | ||
14301 | JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B | ||
14302 | LEAL -16(DI), DI | ||
14303 | LEAL 16(R10), R10 | ||
14304 | JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B | ||
14305 | |||
// Mismatch in the second quadword: result = R10 + 8 + (first differing bit >> 3).
14306 | matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B: | ||
14307 | #ifdef GOAMD64_v3 | ||
14308 | TZCNTQ R11, R11 | ||
14309 | |||
14310 | #else | ||
14311 | BSFQ R11, R11 | ||
14312 | |||
14313 | #endif | ||
14314 | SARQ $0x03, R11 | ||
14315 | LEAL 8(R10)(R11*1), R10 | ||
14316 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B | ||
14317 | |||
14318 | matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B: | ||
14319 | CMPL DI, $0x08 | ||
14320 | JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B | ||
14321 | MOVQ (R8)(R10*1), R9 | ||
14322 | XORQ (BX)(R10*1), R9 | ||
14323 | JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B | ||
14324 | LEAL -8(DI), DI | ||
14325 | LEAL 8(R10), R10 | ||
14326 | JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B | ||
14327 | |||
// Mismatch in the first quadword: result = R10 + (first differing bit >> 3).
14328 | matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B: | ||
14329 | #ifdef GOAMD64_v3 | ||
14330 | TZCNTQ R9, R9 | ||
14331 | |||
14332 | #else | ||
14333 | BSFQ R9, R9 | ||
14334 | |||
14335 | #endif | ||
14336 | SARQ $0x03, R9 | ||
14337 | LEAL (R10)(R9*1), R10 | ||
14338 | JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B | ||
14339 | |||
14340 | matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: | ||
14341 | CMPL DI, $0x04 | ||
14342 | JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B | ||
14343 | MOVL (R8)(R10*1), R9 | ||
14344 | CMPL (BX)(R10*1), R9 | ||
14345 | JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B | ||
14346 | LEAL -4(DI), DI | ||
14347 | LEAL 4(R10), R10 | ||
14348 | |||
14349 | matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: | ||
14350 | CMPL DI, $0x01 | ||
14351 | JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B | ||
14352 | JB repeat_extend_forward_end_encodeSnappyBlockAsm8B | ||
14353 | MOVW (R8)(R10*1), R9 | ||
14354 | CMPW (BX)(R10*1), R9 | ||
14355 | JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B | ||
14356 | LEAL 2(R10), R10 | ||
14357 | SUBL $0x02, DI | ||
14358 | JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B | ||
14359 | |||
14360 | matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: | ||
14361 | MOVB (R8)(R10*1), R9 | ||
14362 | CMPB (BX)(R10*1), R9 | ||
14363 | JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B | ||
14364 | LEAL 1(R10), R10 | ||
14365 | |||
// Emit the repeat as a copy. CX += R10 (end of match), BX = CX - SI (copy length, SI being
// the back-extended match start), SI = 16(SP) (offset).
// emitCopy:
//   while length > 0x40: write 0xee + 16-bit offset (3 bytes) and subtract 60 from the length
//   length >= 12       : write (length<<2)-2 + 16-bit offset (3 bytes)
//   otherwise          : write ((length<<2)-15) | ((offset>>8)<<5), then the low offset byte (2 bytes)
// NOTE(review): the 2-byte form has no explicit range check on the offset here; presumably
// the input size limit of this variant guarantees it fits - confirm against the caller.
// Afterwards 12(SP) = CX and control returns to the search loop.
14366 | repeat_extend_forward_end_encodeSnappyBlockAsm8B: | ||
14367 | ADDL R10, CX | ||
14368 | MOVL CX, BX | ||
14369 | SUBL SI, BX | ||
14370 | MOVL 16(SP), SI | ||
14371 | |||
14372 | // emitCopy | ||
14373 | two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: | ||
14374 | CMPL BX, $0x40 | ||
14375 | JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B | ||
14376 | MOVB $0xee, (AX) | ||
14377 | MOVW SI, 1(AX) | ||
14378 | LEAL -60(BX), BX | ||
14379 | ADDQ $0x03, AX | ||
14380 | JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B | ||
14381 | |||
14382 | two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: | ||
14383 | MOVL BX, DI | ||
14384 | SHLL $0x02, DI | ||
14385 | CMPL BX, $0x0c | ||
14386 | JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B | ||
14387 | LEAL -15(DI), DI | ||
14388 | MOVB SI, 1(AX) | ||
14389 | SHRL $0x08, SI | ||
14390 | SHLL $0x05, SI | ||
14391 | ORL SI, DI | ||
14392 | MOVB DI, (AX) | ||
14393 | ADDQ $0x02, AX | ||
14394 | JMP repeat_end_emit_encodeSnappyBlockAsm8B | ||
14395 | |||
14396 | emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: | ||
14397 | LEAL -2(DI), DI | ||
14398 | MOVB DI, (AX) | ||
14399 | MOVW SI, 1(AX) | ||
14400 | ADDQ $0x03, AX | ||
14401 | |||
14402 | repeat_end_emit_encodeSnappyBlockAsm8B: | ||
14403 | MOVL CX, 12(SP) | ||
14404 | JMP search_loop_encodeSnappyBlockAsm8B | ||
14405 | |||
// No repeat at CX+1: test the three hash-table candidates in turn.
// On entry SI = 8 bytes at src[CX], BX = candidate for CX, DI = candidate for CX+1,
// R9 = table index for CX+2.
//   candidate 1: 4 bytes at src[BX] == low 4 bytes of SI          -> match at CX
//   candidate 2: 4 bytes at src[DI] == bytes at CX+1 (SI >> 8)    -> match at CX+1, BX = DI
//   candidate 3: 4 bytes at src[table[R9]] == bytes at CX+2       -> match at CX+2
// The table entry at R9 is set to CX+2 on both the candidate-2 path and the fall-through path.
// If nothing matches, CX is reloaded from 20(SP) and the search continues.
14406 | no_repeat_found_encodeSnappyBlockAsm8B: | ||
14407 | CMPL (DX)(BX*1), SI | ||
14408 | JEQ candidate_match_encodeSnappyBlockAsm8B | ||
14409 | SHRQ $0x08, SI | ||
14410 | MOVL 24(SP)(R9*4), BX | ||
14411 | LEAL 2(CX), R8 | ||
14412 | CMPL (DX)(DI*1), SI | ||
14413 | JEQ candidate2_match_encodeSnappyBlockAsm8B | ||
14414 | MOVL R8, 24(SP)(R9*4) | ||
14415 | SHRQ $0x08, SI | ||
14416 | CMPL (DX)(BX*1), SI | ||
14417 | JEQ candidate3_match_encodeSnappyBlockAsm8B | ||
14418 | MOVL 20(SP), CX | ||
14419 | JMP search_loop_encodeSnappyBlockAsm8B | ||
14420 | |||
14421 | candidate3_match_encodeSnappyBlockAsm8B: | ||
14422 | ADDL $0x02, CX | ||
14423 | JMP candidate_match_encodeSnappyBlockAsm8B | ||
14424 | |||
14425 | candidate2_match_encodeSnappyBlockAsm8B: | ||
14426 | MOVL R8, 24(SP)(R9*4) | ||
14427 | INCL CX | ||
14428 | MOVL DI, BX | ||
14429 | |||
// A candidate matched: CX = match position in src, BX = earlier position it matches.
// Extend backwards while CX > 12(SP), BX != 0 and src[BX-1] == src[CX-1].
// Then check output space: if AX + 3 + (CX - 12(SP)) is not below the limit in (SP),
// the function stores 0 in ret and returns.
14430 | candidate_match_encodeSnappyBlockAsm8B: | ||
14431 | MOVL 12(SP), SI | ||
14432 | TESTL BX, BX | ||
14433 | JZ match_extend_back_end_encodeSnappyBlockAsm8B | ||
14434 | |||
14435 | match_extend_back_loop_encodeSnappyBlockAsm8B: | ||
14436 | CMPL CX, SI | ||
14437 | JBE match_extend_back_end_encodeSnappyBlockAsm8B | ||
14438 | MOVB -1(DX)(BX*1), DI | ||
14439 | MOVB -1(DX)(CX*1), R8 | ||
14440 | CMPB DI, R8 | ||
14441 | JNE match_extend_back_end_encodeSnappyBlockAsm8B | ||
14442 | LEAL -1(CX), CX | ||
14443 | DECL BX | ||
14444 | JZ match_extend_back_end_encodeSnappyBlockAsm8B | ||
14445 | JMP match_extend_back_loop_encodeSnappyBlockAsm8B | ||
14446 | |||
14447 | match_extend_back_end_encodeSnappyBlockAsm8B: | ||
14448 | MOVL CX, SI | ||
14449 | SUBL 12(SP), SI | ||
14450 | LEAQ 3(AX)(SI*1), SI | ||
14451 | CMPQ SI, (SP) | ||
14452 | JB match_dst_size_check_encodeSnappyBlockAsm8B | ||
14453 | MOVQ $0x00000000, ret+48(FP) | ||
14454 | RET | ||
14455 | |||
// Emit src[12(SP):CX] as a literal ahead of the match copy; nothing is written when
// 12(SP) == CX. 12(SP) is advanced to CX. SI = source pointer, R8 = literal length,
// DI = length-1, which selects the header written at AX:
//   DI < 0x3c  -> 1 byte : DI<<2
//   DI < 0x100 -> 2 bytes: 0xf0, DI
//   otherwise  -> 3 bytes: 0xf4, 16-bit DI
// Payloads with DI < 0x40 go through genMemMoveShort, all others through genMemMoveLong.
// Both copy paths finish with AX = header end + R8. BX and CX are left untouched.
14456 | match_dst_size_check_encodeSnappyBlockAsm8B: | ||
14457 | MOVL CX, SI | ||
14458 | MOVL 12(SP), DI | ||
14459 | CMPL DI, SI | ||
14460 | JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B | ||
14461 | MOVL SI, R8 | ||
14462 | MOVL SI, 12(SP) | ||
14463 | LEAQ (DX)(DI*1), SI | ||
14464 | SUBL DI, R8 | ||
14465 | LEAL -1(R8), DI | ||
14466 | CMPL DI, $0x3c | ||
14467 | JB one_byte_match_emit_encodeSnappyBlockAsm8B | ||
14468 | CMPL DI, $0x00000100 | ||
14469 | JB two_bytes_match_emit_encodeSnappyBlockAsm8B | ||
// NOTE(review): this JB tests the same flags as the JB above, so it is never taken;
// execution reaches the three-byte label by falling through.
14470 | JB three_bytes_match_emit_encodeSnappyBlockAsm8B | ||
14471 | |||
14472 | three_bytes_match_emit_encodeSnappyBlockAsm8B: | ||
14473 | MOVB $0xf4, (AX) | ||
14474 | MOVW DI, 1(AX) | ||
14475 | ADDQ $0x03, AX | ||
14476 | JMP memmove_long_match_emit_encodeSnappyBlockAsm8B | ||
14477 | |||
14478 | two_bytes_match_emit_encodeSnappyBlockAsm8B: | ||
14479 | MOVB $0xf0, (AX) | ||
14480 | MOVB DI, 1(AX) | ||
14481 | ADDQ $0x02, AX | ||
14482 | CMPL DI, $0x40 | ||
14483 | JB memmove_match_emit_encodeSnappyBlockAsm8B | ||
14484 | JMP memmove_long_match_emit_encodeSnappyBlockAsm8B | ||
14485 | |||
14486 | one_byte_match_emit_encodeSnappyBlockAsm8B: | ||
14487 | SHLB $0x02, DI | ||
14488 | MOVB DI, (AX) | ||
14489 | ADDQ $0x01, AX | ||
14490 | |||
// Short copy: DI = AX + R8 is the output cursor after the literal.
14491 | memmove_match_emit_encodeSnappyBlockAsm8B: | ||
14492 | LEAQ (AX)(R8*1), DI | ||
14493 | |||
14494 | // genMemMoveShort | ||
14495 | CMPQ R8, $0x08 | ||
14496 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 | ||
14497 | CMPQ R8, $0x10 | ||
14498 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 | ||
14499 | CMPQ R8, $0x20 | ||
14500 | JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 | ||
14501 | JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 | ||
14502 | |||
// R8 <= 8: a full 8 bytes are loaded and stored regardless of R8.
// NOTE(review): presumably safe because of the src/dst margins set up in the prologue - confirm.
14503 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: | ||
14504 | MOVQ (SI), R9 | ||
14505 | MOVQ R9, (AX) | ||
14506 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B | ||
14507 | |||
// Larger sizes use a head load and a tail load that may overlap, then store both.
14508 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: | ||
14509 | MOVQ (SI), R9 | ||
14510 | MOVQ -8(SI)(R8*1), SI | ||
14511 | MOVQ R9, (AX) | ||
14512 | MOVQ SI, -8(AX)(R8*1) | ||
14513 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B | ||
14514 | |||
14515 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: | ||
14516 | MOVOU (SI), X0 | ||
14517 | MOVOU -16(SI)(R8*1), X1 | ||
14518 | MOVOU X0, (AX) | ||
14519 | MOVOU X1, -16(AX)(R8*1) | ||
14520 | JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B | ||
14521 | |||
14522 | emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: | ||
14523 | MOVOU (SI), X0 | ||
14524 | MOVOU 16(SI), X1 | ||
14525 | MOVOU -32(SI)(R8*1), X2 | ||
14526 | MOVOU -16(SI)(R8*1), X3 | ||
14527 | MOVOU X0, (AX) | ||
14528 | MOVOU X1, 16(AX) | ||
14529 | MOVOU X2, -32(AX)(R8*1) | ||
14530 | MOVOU X3, -16(AX)(R8*1) | ||
14531 | |||
14532 | memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: | ||
14533 | MOVQ DI, AX | ||
14534 | JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B | ||
14535 | |||
// Long copy: the first and last 32 bytes are preloaded into X0-X3 and stored last with
// unaligned moves; the middle is copied 32 bytes per iteration with MOVOA stores at
// offsets derived from the low 5 bits of AX (R11 = 64 - (AX & 31)).
14536 | memmove_long_match_emit_encodeSnappyBlockAsm8B: | ||
14537 | LEAQ (AX)(R8*1), DI | ||
14538 | |||
14539 | // genMemMoveLong | ||
14540 | MOVOU (SI), X0 | ||
14541 | MOVOU 16(SI), X1 | ||
14542 | MOVOU -32(SI)(R8*1), X2 | ||
14543 | MOVOU -16(SI)(R8*1), X3 | ||
14544 | MOVQ R8, R10 | ||
14545 | SHRQ $0x05, R10 | ||
14546 | MOVQ AX, R9 | ||
14547 | ANDL $0x0000001f, R9 | ||
14548 | MOVQ $0x00000040, R11 | ||
14549 | SUBQ R9, R11 | ||
14550 | DECQ R10 | ||
14551 | JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 | ||
14552 | LEAQ -32(SI)(R11*1), R9 | ||
14553 | LEAQ -32(AX)(R11*1), R12 | ||
14554 | |||
14555 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: | ||
14556 | MOVOU (R9), X4 | ||
14557 | MOVOU 16(R9), X5 | ||
14558 | MOVOA X4, (R12) | ||
14559 | MOVOA X5, 16(R12) | ||
14560 | ADDQ $0x20, R12 | ||
14561 | ADDQ $0x20, R9 | ||
14562 | ADDQ $0x20, R11 | ||
14563 | DECQ R10 | ||
14564 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back | ||
14565 | |||
14566 | emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: | ||
14567 | MOVOU -32(SI)(R11*1), X4 | ||
14568 | MOVOU -16(SI)(R11*1), X5 | ||
14569 | MOVOA X4, -32(AX)(R11*1) | ||
14570 | MOVOA X5, -16(AX)(R11*1) | ||
14571 | ADDQ $0x20, R11 | ||
14572 | CMPQ R8, R11 | ||
14573 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 | ||
14574 | MOVOU X0, (AX) | ||
14575 | MOVOU X1, 16(AX) | ||
14576 | MOVOU X2, -32(AX)(R8*1) | ||
14577 | MOVOU X3, -16(AX)(R8*1) | ||
14578 | MOVQ DI, AX | ||
14579 | |||
// Measure a match with no pending literal. CX = match position, BX = earlier position.
// The offset CX - BX is stored in 16(SP) as the new repeat offset. Both positions skip the
// 4 bytes already known to match, then SI = src_len - CX bytes remain, DI = &src[CX],
// BX = &src[candidate], and R9 accumulates the additional match length.
// matchLen compares 16 bytes per loop pass, then 8, 4, 2 and 1 bytes. When a quadword
// differs, the bit index of the lowest set bit of the XOR, shifted right by 3, is the count
// of equal leading bytes in that quadword.
14580 | emit_literal_done_match_emit_encodeSnappyBlockAsm8B: | ||
14581 | match_nolit_loop_encodeSnappyBlockAsm8B: | ||
14582 | MOVL CX, SI | ||
14583 | SUBL BX, SI | ||
14584 | MOVL SI, 16(SP) | ||
14585 | ADDL $0x04, CX | ||
14586 | ADDL $0x04, BX | ||
14587 | MOVQ src_len+32(FP), SI | ||
14588 | SUBL CX, SI | ||
14589 | LEAQ (DX)(CX*1), DI | ||
14590 | LEAQ (DX)(BX*1), BX | ||
14591 | |||
14592 | // matchLen | ||
14593 | XORL R9, R9 | ||
14594 | |||
14595 | matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B: | ||
14596 | CMPL SI, $0x10 | ||
14597 | JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B | ||
14598 | MOVQ (DI)(R9*1), R8 | ||
14599 | MOVQ 8(DI)(R9*1), R10 | ||
14600 | XORQ (BX)(R9*1), R8 | ||
14601 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B | ||
14602 | XORQ 8(BX)(R9*1), R10 | ||
14603 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B | ||
14604 | LEAL -16(SI), SI | ||
14605 | LEAL 16(R9), R9 | ||
14606 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B | ||
14607 | |||
// Mismatch in the second quadword: result = R9 + 8 + (first differing bit >> 3).
14608 | matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B: | ||
14609 | #ifdef GOAMD64_v3 | ||
14610 | TZCNTQ R10, R10 | ||
14611 | |||
14612 | #else | ||
14613 | BSFQ R10, R10 | ||
14614 | |||
14615 | #endif | ||
14616 | SARQ $0x03, R10 | ||
14617 | LEAL 8(R9)(R10*1), R9 | ||
14618 | JMP match_nolit_end_encodeSnappyBlockAsm8B | ||
14619 | |||
14620 | matchlen_match8_match_nolit_encodeSnappyBlockAsm8B: | ||
14621 | CMPL SI, $0x08 | ||
14622 | JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B | ||
14623 | MOVQ (DI)(R9*1), R8 | ||
14624 | XORQ (BX)(R9*1), R8 | ||
14625 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B | ||
14626 | LEAL -8(SI), SI | ||
14627 | LEAL 8(R9), R9 | ||
14628 | JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B | ||
14629 | |||
// Mismatch in the first quadword: result = R9 + (first differing bit >> 3).
14630 | matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B: | ||
14631 | #ifdef GOAMD64_v3 | ||
14632 | TZCNTQ R8, R8 | ||
14633 | |||
14634 | #else | ||
14635 | BSFQ R8, R8 | ||
14636 | |||
14637 | #endif | ||
14638 | SARQ $0x03, R8 | ||
14639 | LEAL (R9)(R8*1), R9 | ||
14640 | JMP match_nolit_end_encodeSnappyBlockAsm8B | ||
14641 | |||
14642 | matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: | ||
14643 | CMPL SI, $0x04 | ||
14644 | JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B | ||
14645 | MOVL (DI)(R9*1), R8 | ||
14646 | CMPL (BX)(R9*1), R8 | ||
14647 | JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B | ||
14648 | LEAL -4(SI), SI | ||
14649 | LEAL 4(R9), R9 | ||
14650 | |||
14651 | matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: | ||
14652 | CMPL SI, $0x01 | ||
14653 | JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B | ||
14654 | JB match_nolit_end_encodeSnappyBlockAsm8B | ||
14655 | MOVW (DI)(R9*1), R8 | ||
14656 | CMPW (BX)(R9*1), R8 | ||
14657 | JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B | ||
14658 | LEAL 2(R9), R9 | ||
14659 | SUBL $0x02, SI | ||
14660 | JZ match_nolit_end_encodeSnappyBlockAsm8B | ||
14661 | |||
14662 | matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: | ||
14663 | MOVB (DI)(R9*1), R8 | ||
14664 | CMPB (BX)(R9*1), R8 | ||
14665 | JNE match_nolit_end_encodeSnappyBlockAsm8B | ||
14666 | LEAL 1(R9), R9 | ||
14667 | |||
// Emit the match as a copy and prepare the next step.
// CX += R9 (end of match), BX = 16(SP) (offset), R9 += 4 (total copy length), 12(SP) = CX.
// emitCopy:
//   while length > 0x40: write 0xee + 16-bit offset (3 bytes) and subtract 60 from the length
//   length >= 12       : write (length<<2)-2 + 16-bit offset (3 bytes)
//   otherwise          : write ((length<<2)-15) | ((offset>>8)<<5), then the low offset byte (2 bytes)
// NOTE(review): the 2-byte form has no explicit range check on the offset here; presumably
// the input size limit of this variant guarantees it fits - confirm against the caller.
14668 | match_nolit_end_encodeSnappyBlockAsm8B: | ||
14669 | ADDL R9, CX | ||
14670 | MOVL 16(SP), BX | ||
14671 | ADDL $0x04, R9 | ||
14672 | MOVL CX, 12(SP) | ||
14673 | |||
14674 | // emitCopy | ||
14675 | two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: | ||
14676 | CMPL R9, $0x40 | ||
14677 | JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B | ||
14678 | MOVB $0xee, (AX) | ||
14679 | MOVW BX, 1(AX) | ||
14680 | LEAL -60(R9), R9 | ||
14681 | ADDQ $0x03, AX | ||
14682 | JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B | ||
14683 | |||
14684 | two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: | ||
14685 | MOVL R9, SI | ||
14686 | SHLL $0x02, SI | ||
14687 | CMPL R9, $0x0c | ||
14688 | JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B | ||
14689 | LEAL -15(SI), SI | ||
14690 | MOVB BL, 1(AX) | ||
14691 | SHRL $0x08, BX | ||
14692 | SHLL $0x05, BX | ||
14693 | ORL BX, SI | ||
14694 | MOVB SI, (AX) | ||
14695 | ADDQ $0x02, AX | ||
14696 | JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B | ||
14697 | |||
14698 | emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: | ||
14699 | LEAL -2(SI), SI | ||
14700 | MOVB SI, (AX) | ||
14701 | MOVW BX, 1(AX) | ||
14702 | ADDQ $0x03, AX | ||
14703 | |||
// After the copy: stop searching once CX >= 8(SP); otherwise load 8 bytes at src[CX-2]
// and return 0 if the output cursor has reached the limit in (SP).
14704 | match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: | ||
14705 | CMPL CX, 8(SP) | ||
14706 | JAE emit_remainder_encodeSnappyBlockAsm8B | ||
14707 | MOVQ -2(DX)(CX*1), SI | ||
14708 | CMPQ AX, (SP) | ||
14709 | JB match_nolit_dst_ok_encodeSnappyBlockAsm8B | ||
14710 | MOVQ $0x00000000, ret+48(FP) | ||
14711 | RET | ||
14712 | |||
// Hash the 4-byte windows at CX-2 (index in DI) and CX (index in BX), read the old table
// entry for CX into BX, then store CX-2 and CX into their slots. If the 4 bytes at the old
// entry equal the 4 bytes at CX, another match is encoded immediately; otherwise CX is
// incremented and the search loop resumes.
14713 | match_nolit_dst_ok_encodeSnappyBlockAsm8B: | ||
14714 | MOVQ $0x9e3779b1, R8 | ||
14715 | MOVQ SI, DI | ||
14716 | SHRQ $0x10, SI | ||
14717 | MOVQ SI, BX | ||
14718 | SHLQ $0x20, DI | ||
14719 | IMULQ R8, DI | ||
14720 | SHRQ $0x38, DI | ||
14721 | SHLQ $0x20, BX | ||
14722 | IMULQ R8, BX | ||
14723 | SHRQ $0x38, BX | ||
14724 | LEAL -2(CX), R8 | ||
14725 | LEAQ 24(SP)(BX*4), R9 | ||
14726 | MOVL (R9), BX | ||
14727 | MOVL R8, 24(SP)(DI*4) | ||
14728 | MOVL CX, (R9) | ||
14729 | CMPL (DX)(BX*1), SI | ||
14730 | JEQ match_nolit_loop_encodeSnappyBlockAsm8B | ||
14731 | INCL CX | ||
14732 | JMP search_loop_encodeSnappyBlockAsm8B | ||
14733 | |||
// Emit the trailing literal src[12(SP):src_len] and return the number of bytes written.
// First check output space: if AX + 3 + (src_len - 12(SP)) is not below the limit in (SP),
// the function stores 0 in ret and returns.
14734 | emit_remainder_encodeSnappyBlockAsm8B: | ||
14735 | MOVQ src_len+32(FP), CX | ||
14736 | SUBL 12(SP), CX | ||
14737 | LEAQ 3(AX)(CX*1), CX | ||
14738 | CMPQ CX, (SP) | ||
14739 | JB emit_remainder_ok_encodeSnappyBlockAsm8B | ||
14740 | MOVQ $0x00000000, ret+48(FP) | ||
14741 | RET | ||
14742 | |||
// Nothing is written when 12(SP) == src_len. Otherwise CX = source pointer, SI = literal
// length, DX = length-1 (DX no longer holds the src base from here on), selecting the header:
//   DX < 0x3c  -> 1 byte : DX<<2
//   DX < 0x100 -> 2 bytes: 0xf0, DX
//   otherwise  -> 3 bytes: 0xf4, 16-bit DX
14743 | emit_remainder_ok_encodeSnappyBlockAsm8B: | ||
14744 | MOVQ src_len+32(FP), CX | ||
14745 | MOVL 12(SP), BX | ||
14746 | CMPL BX, CX | ||
14747 | JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B | ||
14748 | MOVL CX, SI | ||
14749 | MOVL CX, 12(SP) | ||
14750 | LEAQ (DX)(BX*1), CX | ||
14751 | SUBL BX, SI | ||
14752 | LEAL -1(SI), DX | ||
14753 | CMPL DX, $0x3c | ||
14754 | JB one_byte_emit_remainder_encodeSnappyBlockAsm8B | ||
14755 | CMPL DX, $0x00000100 | ||
14756 | JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B | ||
// NOTE(review): this JB tests the same flags as the JB above, so it is never taken;
// execution reaches the three-byte label by falling through.
14757 | JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B | ||
14758 | |||
14759 | three_bytes_emit_remainder_encodeSnappyBlockAsm8B: | ||
14760 | MOVB $0xf4, (AX) | ||
14761 | MOVW DX, 1(AX) | ||
14762 | ADDQ $0x03, AX | ||
14763 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B | ||
14764 | |||
14765 | two_bytes_emit_remainder_encodeSnappyBlockAsm8B: | ||
14766 | MOVB $0xf0, (AX) | ||
14767 | MOVB DL, 1(AX) | ||
14768 | ADDQ $0x02, AX | ||
14769 | CMPL DX, $0x40 | ||
14770 | JB memmove_emit_remainder_encodeSnappyBlockAsm8B | ||
14771 | JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B | ||
14772 | |||
14773 | one_byte_emit_remainder_encodeSnappyBlockAsm8B: | ||
14774 | SHLB $0x02, DL | ||
14775 | MOVB DL, (AX) | ||
14776 | ADDQ $0x01, AX | ||
14777 | |||
// Short copy: DX = AX + SI is the output cursor after the literal, BX = length.
// Unlike the short copies earlier in this function, this one has size classes for
// 1-2, 3 and 4-7 bytes, so every load and store stays within the BX bytes being copied.
14778 | memmove_emit_remainder_encodeSnappyBlockAsm8B: | ||
14779 | LEAQ (AX)(SI*1), DX | ||
14780 | MOVL SI, BX | ||
14781 | |||
14782 | // genMemMoveShort | ||
14783 | CMPQ BX, $0x03 | ||
14784 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 | ||
14785 | JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 | ||
14786 | CMPQ BX, $0x08 | ||
14787 | JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 | ||
14788 | CMPQ BX, $0x10 | ||
14789 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 | ||
14790 | CMPQ BX, $0x20 | ||
14791 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 | ||
14792 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 | ||
14793 | |||
14794 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: | ||
14795 | MOVB (CX), SI | ||
14796 | MOVB -1(CX)(BX*1), CL | ||
14797 | MOVB SI, (AX) | ||
14798 | MOVB CL, -1(AX)(BX*1) | ||
14799 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B | ||
14800 | |||
14801 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: | ||
14802 | MOVW (CX), SI | ||
14803 | MOVB 2(CX), CL | ||
14804 | MOVW SI, (AX) | ||
14805 | MOVB CL, 2(AX) | ||
14806 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B | ||
14807 | |||
14808 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: | ||
14809 | MOVL (CX), SI | ||
14810 | MOVL -4(CX)(BX*1), CX | ||
14811 | MOVL SI, (AX) | ||
14812 | MOVL CX, -4(AX)(BX*1) | ||
14813 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B | ||
14814 | |||
14815 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: | ||
14816 | MOVQ (CX), SI | ||
14817 | MOVQ -8(CX)(BX*1), CX | ||
14818 | MOVQ SI, (AX) | ||
14819 | MOVQ CX, -8(AX)(BX*1) | ||
14820 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B | ||
14821 | |||
14822 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: | ||
14823 | MOVOU (CX), X0 | ||
14824 | MOVOU -16(CX)(BX*1), X1 | ||
14825 | MOVOU X0, (AX) | ||
14826 | MOVOU X1, -16(AX)(BX*1) | ||
14827 | JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B | ||
14828 | |||
14829 | emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: | ||
14830 | MOVOU (CX), X0 | ||
14831 | MOVOU 16(CX), X1 | ||
14832 | MOVOU -32(CX)(BX*1), X2 | ||
14833 | MOVOU -16(CX)(BX*1), X3 | ||
14834 | MOVOU X0, (AX) | ||
14835 | MOVOU X1, 16(AX) | ||
14836 | MOVOU X2, -32(AX)(BX*1) | ||
14837 | MOVOU X3, -16(AX)(BX*1) | ||
14838 | |||
14839 | memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: | ||
14840 | MOVQ DX, AX | ||
14841 | JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B | ||
14842 | |||
// Long copy: the first and last 32 bytes are preloaded into X0-X3 and stored last with
// unaligned moves; the middle is copied 32 bytes per iteration with MOVOA stores at
// offsets derived from the low 5 bits of AX (R8 = 64 - (AX & 31)).
14843 | memmove_long_emit_remainder_encodeSnappyBlockAsm8B: | ||
14844 | LEAQ (AX)(SI*1), DX | ||
14845 | MOVL SI, BX | ||
14846 | |||
14847 | // genMemMoveLong | ||
14848 | MOVOU (CX), X0 | ||
14849 | MOVOU 16(CX), X1 | ||
14850 | MOVOU -32(CX)(BX*1), X2 | ||
14851 | MOVOU -16(CX)(BX*1), X3 | ||
14852 | MOVQ BX, DI | ||
14853 | SHRQ $0x05, DI | ||
14854 | MOVQ AX, SI | ||
14855 | ANDL $0x0000001f, SI | ||
14856 | MOVQ $0x00000040, R8 | ||
14857 | SUBQ SI, R8 | ||
14858 | DECQ DI | ||
14859 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 | ||
14860 | LEAQ -32(CX)(R8*1), SI | ||
14861 | LEAQ -32(AX)(R8*1), R9 | ||
14862 | |||
14863 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: | ||
14864 | MOVOU (SI), X4 | ||
14865 | MOVOU 16(SI), X5 | ||
14866 | MOVOA X4, (R9) | ||
14867 | MOVOA X5, 16(R9) | ||
14868 | ADDQ $0x20, R9 | ||
14869 | ADDQ $0x20, SI | ||
14870 | ADDQ $0x20, R8 | ||
14871 | DECQ DI | ||
14872 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back | ||
14873 | |||
14874 | emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: | ||
14875 | MOVOU -32(CX)(R8*1), X4 | ||
14876 | MOVOU -16(CX)(R8*1), X5 | ||
14877 | MOVOA X4, -32(AX)(R8*1) | ||
14878 | MOVOA X5, -16(AX)(R8*1) | ||
14879 | ADDQ $0x20, R8 | ||
14880 | CMPQ BX, R8 | ||
14881 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 | ||
14882 | MOVOU X0, (AX) | ||
14883 | MOVOU X1, 16(AX) | ||
14884 | MOVOU X2, -32(AX)(BX*1) | ||
14885 | MOVOU X3, -16(AX)(BX*1) | ||
14886 | MOVQ DX, AX | ||
14887 | |||
// Return value: bytes written = output cursor AX minus dst_base.
14888 | emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: | ||
14889 | MOVQ dst_base+0(FP), CX | ||
14890 | SUBQ CX, AX | ||
14891 | MOVQ AX, ret+48(FP) | ||
14892 | RET | ||
14893 | |||
14894 | // func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int | ||
14895 | // Requires: BMI, SSE2 | ||
//
// Review notes (derived from the instructions in this function):
//   AX          current write pointer into dst (starts at dst_base).
//   DX          src base pointer (reused as scratch once emit_remainder is reached).
//   CX          current source position s (starts at 1).
//   (SP)        dst limit = dst_base + len(src) - 9 - len(src)/32. Every check against it
//               that fails stores 0 in ret and returns.
//   8(SP)       len(src) - 8; positions >= this value go to emit_remainder.
//   12(SP)      start of the literal run that has not been emitted yet (initially 0).
//   16(SP)      offset of the most recent match; written here but never read in this function.
//   20(SP)      next position to try when the current one yields no match.
//   24(SP)      table of 1<<17 32-bit positions, indexed by a 17-bit hash of 7 source bytes.
//   524312(SP)  table of 1<<14 32-bit positions, indexed by a 14-bit hash of 4 source bytes.
// On the normal exit path ret = AX - dst_base, i.e. the number of bytes written to dst.
// NOTE(review): ret == 0 presumably tells the Go caller the block did not fit / is not
// compressible - confirm against the caller.
14896 | TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56 | ||
14897 | MOVQ dst_base+0(FP), AX | ||
// Zero 0x1200 iterations x 128 bytes = 589824 bytes starting at 24(SP): both tables.
14898 | MOVQ $0x00001200, CX | ||
14899 | LEAQ 24(SP), DX | ||
14900 | PXOR X0, X0 | ||
14901 | |||
14902 | zero_loop_encodeSnappyBetterBlockAsm: | ||
14903 | MOVOU X0, (DX) | ||
14904 | MOVOU X0, 16(DX) | ||
14905 | MOVOU X0, 32(DX) | ||
14906 | MOVOU X0, 48(DX) | ||
14907 | MOVOU X0, 64(DX) | ||
14908 | MOVOU X0, 80(DX) | ||
14909 | MOVOU X0, 96(DX) | ||
14910 | MOVOU X0, 112(DX) | ||
14911 | ADDQ $0x80, DX | ||
14912 | DECQ CX | ||
14913 | JNZ zero_loop_encodeSnappyBetterBlockAsm | ||
// Initialise the stack slots listed in the header notes.
14914 | MOVL $0x00000000, 12(SP) | ||
14915 | MOVQ src_len+32(FP), CX | ||
14916 | LEAQ -9(CX), DX | ||
14917 | LEAQ -8(CX), BX | ||
14918 | MOVL BX, 8(SP) | ||
14919 | SHRQ $0x05, CX | ||
14920 | SUBL CX, DX | ||
14921 | LEAQ (AX)(DX*1), DX | ||
14922 | MOVQ DX, (SP) | ||
14923 | MOVL $0x00000001, CX | ||
14924 | MOVL $0x00000000, 16(SP) | ||
14925 | MOVQ src_base+24(FP), DX | ||
14926 | |||
// Main search loop. The step grows with the distance from the pending-literal start:
// BX = (s - 12(SP)) >> 7; next = s + 1 + BX when BX <= 99, otherwise next = s + 100.
14927 | search_loop_encodeSnappyBetterBlockAsm: | ||
14928 | MOVL CX, BX | ||
14929 | SUBL 12(SP), BX | ||
14930 | SHRL $0x07, BX | ||
14931 | CMPL BX, $0x63 | ||
14932 | JBE check_maxskip_ok_encodeSnappyBetterBlockAsm | ||
14933 | LEAL 100(CX), BX | ||
14934 | JMP check_maxskip_cont_encodeSnappyBetterBlockAsm | ||
14935 | |||
14936 | check_maxskip_ok_encodeSnappyBetterBlockAsm: | ||
14937 | LEAL 1(CX)(BX*1), BX | ||
14938 | |||
14939 | check_maxskip_cont_encodeSnappyBetterBlockAsm: | ||
// Stop searching once the next position reaches the limit in 8(SP).
14940 | CMPL BX, 8(SP) | ||
14941 | JAE emit_remainder_encodeSnappyBetterBlockAsm | ||
// SI = 8 source bytes at s; remember the next position in 20(SP).
14942 | MOVQ (DX)(CX*1), SI | ||
14943 | MOVL BX, 20(SP) | ||
// R9  = ((SI << 8)  * 0x00cf1bbcdcbfa563) >> 47 : 17-bit hash of the low 7 bytes.
// R10 = ((SI << 32) * 0x9e3779b1)         >> 50 : 14-bit hash of the low 4 bytes.
14944 | MOVQ $0x00cf1bbcdcbfa563, R8 | ||
14945 | MOVQ $0x9e3779b1, BX | ||
14946 | MOVQ SI, R9 | ||
14947 | MOVQ SI, R10 | ||
14948 | SHLQ $0x08, R9 | ||
14949 | IMULQ R8, R9 | ||
14950 | SHRQ $0x2f, R9 | ||
14951 | SHLQ $0x20, R10 | ||
14952 | IMULQ BX, R10 | ||
14953 | SHRQ $0x32, R10 | ||
// Fetch the previous position from each table (BX: 7-byte table, DI: 4-byte table)
// and replace both entries with s.
14954 | MOVL 24(SP)(R9*4), BX | ||
14955 | MOVL 524312(SP)(R10*4), DI | ||
14956 | MOVL CX, 24(SP)(R9*4) | ||
14957 | MOVL CX, 524312(SP)(R10*4) | ||
// Compare all 8 bytes at each candidate with the 8 bytes at s; BX wins over DI.
14958 | MOVQ (DX)(BX*1), R9 | ||
14959 | MOVQ (DX)(DI*1), R10 | ||
14960 | CMPQ R9, SI | ||
14961 | JEQ candidate_match_encodeSnappyBetterBlockAsm | ||
14962 | CMPQ R10, SI | ||
14963 | JNE no_short_found_encodeSnappyBetterBlockAsm | ||
14964 | MOVL DI, BX | ||
14965 | JMP candidate_match_encodeSnappyBetterBlockAsm | ||
14966 | |||
// Neither candidate matched on 8 bytes: retry comparing only the low 4 bytes.
// If both fail, continue the search at the position saved in 20(SP).
14967 | no_short_found_encodeSnappyBetterBlockAsm: | ||
14968 | CMPL R9, SI | ||
14969 | JEQ candidate_match_encodeSnappyBetterBlockAsm | ||
14970 | CMPL R10, SI | ||
14971 | JEQ candidateS_match_encodeSnappyBetterBlockAsm | ||
14972 | MOVL 20(SP), CX | ||
14973 | JMP search_loop_encodeSnappyBetterBlockAsm | ||
14974 | |||
// Only the 4-byte-table candidate (DI) matched. Before using it, look up s+1 in the
// 7-byte table (SI >> 8 is the data at s+1) and store s+1 there; if that entry matches
// on 4 bytes use it with s advanced by one, otherwise restore s and use DI.
14975 | candidateS_match_encodeSnappyBetterBlockAsm: | ||
14976 | SHRQ $0x08, SI | ||
14977 | MOVQ SI, R9 | ||
14978 | SHLQ $0x08, R9 | ||
14979 | IMULQ R8, R9 | ||
14980 | SHRQ $0x2f, R9 | ||
14981 | MOVL 24(SP)(R9*4), BX | ||
14982 | INCL CX | ||
14983 | MOVL CX, 24(SP)(R9*4) | ||
14984 | CMPL (DX)(BX*1), SI | ||
14985 | JEQ candidate_match_encodeSnappyBetterBlockAsm | ||
14986 | DECL CX | ||
14987 | MOVL DI, BX | ||
14988 | |||
// BX = candidate position, CX = s. Extend the match backwards one byte at a time while
// s is above the pending-literal start (SI = 12(SP)), the candidate is above 0 and the
// preceding bytes are equal.
14989 | candidate_match_encodeSnappyBetterBlockAsm: | ||
14990 | MOVL 12(SP), SI | ||
14991 | TESTL BX, BX | ||
14992 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm | ||
14993 | |||
14994 | match_extend_back_loop_encodeSnappyBetterBlockAsm: | ||
14995 | CMPL CX, SI | ||
14996 | JBE match_extend_back_end_encodeSnappyBetterBlockAsm | ||
14997 | MOVB -1(DX)(BX*1), DI | ||
14998 | MOVB -1(DX)(CX*1), R8 | ||
14999 | CMPB DI, R8 | ||
15000 | JNE match_extend_back_end_encodeSnappyBetterBlockAsm | ||
15001 | LEAL -1(CX), CX | ||
15002 | DECL BX | ||
15003 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm | ||
15004 | JMP match_extend_back_loop_encodeSnappyBetterBlockAsm | ||
15005 | |||
// Output-space check: return 0 if AX + (s - 12(SP)) + 5 would reach the dst limit.
15006 | match_extend_back_end_encodeSnappyBetterBlockAsm: | ||
15007 | MOVL CX, SI | ||
15008 | SUBL 12(SP), SI | ||
15009 | LEAQ 5(AX)(SI*1), SI | ||
15010 | CMPQ SI, (SP) | ||
15011 | JB match_dst_size_check_encodeSnappyBetterBlockAsm | ||
15012 | MOVQ $0x00000000, ret+48(FP) | ||
15013 | RET | ||
15014 | |||
// SI = match start. Step both positions over the 4 bytes already compared equal, then
// count further equal bytes into R11. DI = bytes of src remaining after s, R8/R9 point
// at the two byte streams being compared.
15015 | match_dst_size_check_encodeSnappyBetterBlockAsm: | ||
15016 | MOVL CX, SI | ||
15017 | ADDL $0x04, CX | ||
15018 | ADDL $0x04, BX | ||
15019 | MOVQ src_len+32(FP), DI | ||
15020 | SUBL CX, DI | ||
15021 | LEAQ (DX)(CX*1), R8 | ||
15022 | LEAQ (DX)(BX*1), R9 | ||
15023 | |||
15024 | // matchLen | ||
15025 | XORL R11, R11 | ||
15026 | |||
// 16 bytes per iteration: XOR two 8-byte words; a non-zero result locates the mismatch.
15027 | matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm: | ||
15028 | CMPL DI, $0x10 | ||
15029 | JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm | ||
15030 | MOVQ (R8)(R11*1), R10 | ||
15031 | MOVQ 8(R8)(R11*1), R12 | ||
15032 | XORQ (R9)(R11*1), R10 | ||
15033 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm | ||
15034 | XORQ 8(R9)(R11*1), R12 | ||
15035 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm | ||
15036 | LEAL -16(DI), DI | ||
15037 | LEAL 16(R11), R11 | ||
15038 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm | ||
15039 | |||
// Mismatch in the second word: index of the lowest set bit / 8 = equal bytes in that
// word; add the 8 bytes of the first word.
15040 | matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm: | ||
15041 | #ifdef GOAMD64_v3 | ||
15042 | TZCNTQ R12, R12 | ||
15043 | |||
15044 | #else | ||
15045 | BSFQ R12, R12 | ||
15046 | |||
15047 | #endif | ||
15048 | SARQ $0x03, R12 | ||
15049 | LEAL 8(R11)(R12*1), R11 | ||
15050 | JMP match_nolit_end_encodeSnappyBetterBlockAsm | ||
15051 | |||
// Fewer than 16 bytes left: try one 8-byte word.
15052 | matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm: | ||
15053 | CMPL DI, $0x08 | ||
15054 | JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm | ||
15055 | MOVQ (R8)(R11*1), R10 | ||
15056 | XORQ (R9)(R11*1), R10 | ||
15057 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm | ||
15058 | LEAL -8(DI), DI | ||
15059 | LEAL 8(R11), R11 | ||
15060 | JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm | ||
15061 | |||
// Mismatch inside the word held in R10: add (lowest set bit index / 8) equal bytes.
15062 | matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm: | ||
15063 | #ifdef GOAMD64_v3 | ||
15064 | TZCNTQ R10, R10 | ||
15065 | |||
15066 | #else | ||
15067 | BSFQ R10, R10 | ||
15068 | |||
15069 | #endif | ||
15070 | SARQ $0x03, R10 | ||
15071 | LEAL (R11)(R10*1), R11 | ||
15072 | JMP match_nolit_end_encodeSnappyBetterBlockAsm | ||
15073 | |||
// Tail handling: compare 4, then 2, then 1 byte(s) as far as DI allows.
15074 | matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: | ||
15075 | CMPL DI, $0x04 | ||
15076 | JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm | ||
15077 | MOVL (R8)(R11*1), R10 | ||
15078 | CMPL (R9)(R11*1), R10 | ||
15079 | JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm | ||
15080 | LEAL -4(DI), DI | ||
15081 | LEAL 4(R11), R11 | ||
15082 | |||
15083 | matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: | ||
15084 | CMPL DI, $0x01 | ||
15085 | JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm | ||
15086 | JB match_nolit_end_encodeSnappyBetterBlockAsm | ||
15087 | MOVW (R8)(R11*1), R10 | ||
15088 | CMPW (R9)(R11*1), R10 | ||
15089 | JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm | ||
15090 | LEAL 2(R11), R11 | ||
15091 | SUBL $0x02, DI | ||
15092 | JZ match_nolit_end_encodeSnappyBetterBlockAsm | ||
15093 | |||
15094 | matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: | ||
15095 | MOVB (R8)(R11*1), R10 | ||
15096 | CMPB (R9)(R11*1), R10 | ||
15097 | JNE match_nolit_end_encodeSnappyBetterBlockAsm | ||
15098 | LEAL 1(R11), R11 | ||
15099 | |||
// DI = match offset (s - candidate); R11 = equal bytes beyond the first 4.
15100 | match_nolit_end_encodeSnappyBetterBlockAsm: | ||
15101 | MOVL CX, DI | ||
15102 | SUBL BX, DI | ||
15103 | |||
15104 | // Check if repeat | ||
// What the instructions below do: when R11 <= 1 and the offset is above 0xffff the
// match is dropped and the search resumes at 20(SP) + 1.
15105 | CMPL R11, $0x01 | ||
15106 | JA match_length_ok_encodeSnappyBetterBlockAsm | ||
15107 | CMPL DI, $0x0000ffff | ||
15108 | JBE match_length_ok_encodeSnappyBetterBlockAsm | ||
15109 | MOVL 20(SP), CX | ||
15110 | INCL CX | ||
15111 | JMP search_loop_encodeSnappyBetterBlockAsm | ||
15112 | |||
// Match accepted: record the offset, then emit the pending literals [12(SP), SI) if any.
// R9 = literal source, R8 = literal length, BX = length - 1 selects the header form:
//   < 60       one tag byte (length-1) << 2
//   < 1<<8     0xf0 + 1 length byte
//   < 1<<16    0xf4 + 2 length bytes
//   < 1<<24    0xf8 + 3 length bytes
//   otherwise  0xfc + 4 length bytes
15113 | match_length_ok_encodeSnappyBetterBlockAsm: | ||
15114 | MOVL DI, 16(SP) | ||
15115 | MOVL 12(SP), BX | ||
15116 | CMPL BX, SI | ||
15117 | JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm | ||
15118 | MOVL SI, R8 | ||
15119 | MOVL SI, 12(SP) | ||
15120 | LEAQ (DX)(BX*1), R9 | ||
15121 | SUBL BX, R8 | ||
15122 | LEAL -1(R8), BX | ||
15123 | CMPL BX, $0x3c | ||
15124 | JB one_byte_match_emit_encodeSnappyBetterBlockAsm | ||
15125 | CMPL BX, $0x00000100 | ||
15126 | JB two_bytes_match_emit_encodeSnappyBetterBlockAsm | ||
15127 | CMPL BX, $0x00010000 | ||
15128 | JB three_bytes_match_emit_encodeSnappyBetterBlockAsm | ||
15129 | CMPL BX, $0x01000000 | ||
15130 | JB four_bytes_match_emit_encodeSnappyBetterBlockAsm | ||
15131 | MOVB $0xfc, (AX) | ||
15132 | MOVL BX, 1(AX) | ||
15133 | ADDQ $0x05, AX | ||
15134 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm | ||
15135 | |||
15136 | four_bytes_match_emit_encodeSnappyBetterBlockAsm: | ||
15137 | MOVL BX, R10 | ||
15138 | SHRL $0x10, R10 | ||
15139 | MOVB $0xf8, (AX) | ||
15140 | MOVW BX, 1(AX) | ||
15141 | MOVB R10, 3(AX) | ||
15142 | ADDQ $0x04, AX | ||
15143 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm | ||
15144 | |||
15145 | three_bytes_match_emit_encodeSnappyBetterBlockAsm: | ||
15146 | MOVB $0xf4, (AX) | ||
15147 | MOVW BX, 1(AX) | ||
15148 | ADDQ $0x03, AX | ||
15149 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm | ||
15150 | |||
// With a one-byte length, literals whose length-1 is below 64 still use the short copy.
15151 | two_bytes_match_emit_encodeSnappyBetterBlockAsm: | ||
15152 | MOVB $0xf0, (AX) | ||
15153 | MOVB BL, 1(AX) | ||
15154 | ADDQ $0x02, AX | ||
15155 | CMPL BX, $0x40 | ||
15156 | JB memmove_match_emit_encodeSnappyBetterBlockAsm | ||
15157 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm | ||
15158 | |||
15159 | one_byte_match_emit_encodeSnappyBetterBlockAsm: | ||
15160 | SHLB $0x02, BL | ||
15161 | MOVB BL, (AX) | ||
15162 | ADDQ $0x01, AX | ||
15163 | |||
// Short literal copy (R8 bytes from R9 to AX); BX = dst pointer after the literal.
// Lengths <= 8 are copied with one unconditional 8-byte load/store; larger lengths use
// overlapping head/tail moves of 8 or 16 bytes.
// NOTE(review): the 8-byte move touches up to 7 bytes past the literal in both src and
// dst - presumably covered by the len(src)-8 search limit and the dst margin; confirm.
15164 | memmove_match_emit_encodeSnappyBetterBlockAsm: | ||
15165 | LEAQ (AX)(R8*1), BX | ||
15166 | |||
15167 | // genMemMoveShort | ||
15168 | CMPQ R8, $0x08 | ||
15169 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 | ||
15170 | CMPQ R8, $0x10 | ||
15171 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 | ||
15172 | CMPQ R8, $0x20 | ||
15173 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 | ||
15174 | JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 | ||
15175 | |||
15176 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: | ||
15177 | MOVQ (R9), R10 | ||
15178 | MOVQ R10, (AX) | ||
15179 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm | ||
15180 | |||
15181 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: | ||
15182 | MOVQ (R9), R10 | ||
15183 | MOVQ -8(R9)(R8*1), R9 | ||
15184 | MOVQ R10, (AX) | ||
15185 | MOVQ R9, -8(AX)(R8*1) | ||
15186 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm | ||
15187 | |||
15188 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: | ||
15189 | MOVOU (R9), X0 | ||
15190 | MOVOU -16(R9)(R8*1), X1 | ||
15191 | MOVOU X0, (AX) | ||
15192 | MOVOU X1, -16(AX)(R8*1) | ||
15193 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm | ||
15194 | |||
15195 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: | ||
15196 | MOVOU (R9), X0 | ||
15197 | MOVOU 16(R9), X1 | ||
15198 | MOVOU -32(R9)(R8*1), X2 | ||
15199 | MOVOU -16(R9)(R8*1), X3 | ||
15200 | MOVOU X0, (AX) | ||
15201 | MOVOU X1, 16(AX) | ||
15202 | MOVOU X2, -32(AX)(R8*1) | ||
15203 | MOVOU X3, -16(AX)(R8*1) | ||
15204 | |||
15205 | memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: | ||
15206 | MOVQ BX, AX | ||
15207 | JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm | ||
15208 | |||
// Long literal copy. The first and last 32 bytes are loaded into X0-X3 up front and
// stored last with unaligned stores; the middle is copied in 32-byte steps with aligned
// stores (MOVOA). R13 = 64 - (AX & 31), so AX + R13 - 32 is a multiple of 32.
// NOTE(review): DECQ leaves CF untouched, so the JA/JNA after it combine ZF from DECQ
// with CF from the preceding SUBQ/ADDQ. Generated code - verify before hand-editing.
15209 | memmove_long_match_emit_encodeSnappyBetterBlockAsm: | ||
15210 | LEAQ (AX)(R8*1), BX | ||
15211 | |||
15212 | // genMemMoveLong | ||
15213 | MOVOU (R9), X0 | ||
15214 | MOVOU 16(R9), X1 | ||
15215 | MOVOU -32(R9)(R8*1), X2 | ||
15216 | MOVOU -16(R9)(R8*1), X3 | ||
15217 | MOVQ R8, R12 | ||
15218 | SHRQ $0x05, R12 | ||
15219 | MOVQ AX, R10 | ||
15220 | ANDL $0x0000001f, R10 | ||
15221 | MOVQ $0x00000040, R13 | ||
15222 | SUBQ R10, R13 | ||
15223 | DECQ R12 | ||
15224 | JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 | ||
15225 | LEAQ -32(R9)(R13*1), R10 | ||
15226 | LEAQ -32(AX)(R13*1), R14 | ||
15227 | |||
15228 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: | ||
15229 | MOVOU (R10), X4 | ||
15230 | MOVOU 16(R10), X5 | ||
15231 | MOVOA X4, (R14) | ||
15232 | MOVOA X5, 16(R14) | ||
15233 | ADDQ $0x20, R14 | ||
15234 | ADDQ $0x20, R10 | ||
15235 | ADDQ $0x20, R13 | ||
15236 | DECQ R12 | ||
15237 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back | ||
15238 | |||
// Copy 32 bytes per iteration while the running offset R13 does not exceed the length R8.
15239 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: | ||
15240 | MOVOU -32(R9)(R13*1), X4 | ||
15241 | MOVOU -16(R9)(R13*1), X5 | ||
15242 | MOVOA X4, -32(AX)(R13*1) | ||
15243 | MOVOA X5, -16(AX)(R13*1) | ||
15244 | ADDQ $0x20, R13 | ||
15245 | CMPQ R8, R13 | ||
15246 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 | ||
15247 | MOVOU X0, (AX) | ||
15248 | MOVOU X1, 16(AX) | ||
15249 | MOVOU X2, -32(AX)(R8*1) | ||
15250 | MOVOU X3, -16(AX)(R8*1) | ||
15251 | MOVQ BX, AX | ||
15252 | |||
// Literals done. Advance s to the end of the match, turn R11 into the full match length
// (extra bytes + 4) and mark everything up to s as emitted.
15253 | emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: | ||
15254 | ADDL R11, CX | ||
15255 | ADDL $0x04, R11 | ||
15256 | MOVL CX, 12(SP) | ||
15257 | |||
15258 | // emitCopy | ||
// Offsets >= 1<<16 use 5-byte copies (tag + 4 offset bytes); smaller offsets use the
// 2- and 3-byte forms below.
15259 | CMPL DI, $0x00010000 | ||
15260 | JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm | ||
15261 | |||
// While more than 64 bytes remain, emit a 64-byte copy (tag 0xff) and subtract 64.
15262 | four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: | ||
15263 | CMPL R11, $0x40 | ||
15264 | JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm | ||
15265 | MOVB $0xff, (AX) | ||
15266 | MOVL DI, 1(AX) | ||
15267 | LEAL -64(R11), R11 | ||
15268 | ADDQ $0x05, AX | ||
15269 | CMPL R11, $0x04 | ||
15270 | JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm | ||
15271 | JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm | ||
15272 | |||
// Final 5-byte copy, skipped when nothing remains: tag = 4*length - 1 = (length-1)<<2 | 3.
15273 | four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: | ||
15274 | TESTL R11, R11 | ||
15275 | JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm | ||
15276 | XORL BX, BX | ||
15277 | LEAL -1(BX)(R11*4), R11 | ||
15278 | MOVB R11, (AX) | ||
15279 | MOVL DI, 1(AX) | ||
15280 | ADDQ $0x05, AX | ||
15281 | JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm | ||
15282 | |||
// While more than 64 bytes remain, emit a 3-byte copy with tag 0xee (= (60-1)<<2 | 2)
// and subtract 60 from the remaining length.
15283 | two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: | ||
15284 | CMPL R11, $0x40 | ||
15285 | JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm | ||
15286 | MOVB $0xee, (AX) | ||
15287 | MOVW DI, 1(AX) | ||
15288 | LEAL -60(R11), R11 | ||
15289 | ADDQ $0x03, AX | ||
15290 | JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm | ||
15291 | |||
// Length < 12 and offset < 2048: 2-byte copy, tag = (length-4)<<2 | 1 | (offset>>8)<<5,
// followed by the low offset byte. Anything else takes the 3-byte form.
15292 | two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: | ||
15293 | MOVL R11, BX | ||
15294 | SHLL $0x02, BX | ||
15295 | CMPL R11, $0x0c | ||
15296 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm | ||
15297 | CMPL DI, $0x00000800 | ||
15298 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm | ||
15299 | LEAL -15(BX), BX | ||
15300 | MOVB DI, 1(AX) | ||
15301 | SHRL $0x08, DI | ||
15302 | SHLL $0x05, DI | ||
15303 | ORL DI, BX | ||
15304 | MOVB BL, (AX) | ||
15305 | ADDQ $0x02, AX | ||
15306 | JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm | ||
15307 | |||
// 3-byte copy: tag = 4*length - 2 = (length-1)<<2 | 2, then the 16-bit offset.
15308 | emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: | ||
15309 | LEAL -2(BX), BX | ||
15310 | MOVB BL, (AX) | ||
15311 | MOVW DI, 1(AX) | ||
15312 | ADDQ $0x03, AX | ||
15313 | |||
// After the copy: finish if s reached the search limit, fail (ret = 0) if the write
// pointer reached the dst limit.
15314 | match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: | ||
15315 | CMPL CX, 8(SP) | ||
15316 | JAE emit_remainder_encodeSnappyBetterBlockAsm | ||
15317 | CMPQ AX, (SP) | ||
15318 | JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm | ||
15319 | MOVQ $0x00000000, ret+48(FP) | ||
15320 | RET | ||
15321 | |||
// Index positions covered by the match. With SI = match start + 1 and R8 = s - 2:
//   7-byte table gets SI and R8, 4-byte table gets SI + 1 and R8 + 1.
15322 | match_nolit_dst_ok_encodeSnappyBetterBlockAsm: | ||
15323 | MOVQ $0x00cf1bbcdcbfa563, BX | ||
15324 | MOVQ $0x9e3779b1, DI | ||
15325 | LEAQ 1(SI), SI | ||
15326 | LEAQ -2(CX), R8 | ||
15327 | MOVQ (DX)(SI*1), R9 | ||
15328 | MOVQ 1(DX)(SI*1), R10 | ||
15329 | MOVQ (DX)(R8*1), R11 | ||
15330 | MOVQ 1(DX)(R8*1), R12 | ||
15331 | SHLQ $0x08, R9 | ||
15332 | IMULQ BX, R9 | ||
15333 | SHRQ $0x2f, R9 | ||
15334 | SHLQ $0x20, R10 | ||
15335 | IMULQ DI, R10 | ||
15336 | SHRQ $0x32, R10 | ||
15337 | SHLQ $0x08, R11 | ||
15338 | IMULQ BX, R11 | ||
15339 | SHRQ $0x2f, R11 | ||
15340 | SHLQ $0x20, R12 | ||
15341 | IMULQ DI, R12 | ||
15342 | SHRQ $0x32, R12 | ||
15343 | LEAQ 1(SI), DI | ||
15344 | LEAQ 1(R8), R13 | ||
15345 | MOVL SI, 24(SP)(R9*4) | ||
15346 | MOVL R8, 24(SP)(R11*4) | ||
15347 | MOVL DI, 524312(SP)(R10*4) | ||
15348 | MOVL R13, 524312(SP)(R12*4) | ||
// Set up two cursors for the loop below: SI continues from the front, DI starts at the
// midpoint (SI + R8 + 1) / 2; R8 becomes the upper bound for DI.
15349 | LEAQ 1(R8)(SI*1), DI | ||
15350 | SHRQ $0x01, DI | ||
15351 | ADDQ $0x01, SI | ||
15352 | SUBQ $0x01, R8 | ||
15353 | |||
// Insert every second position from both cursors into the 7-byte table until DI reaches
// R8, then go back to searching.
15354 | index_loop_encodeSnappyBetterBlockAsm: | ||
15355 | CMPQ DI, R8 | ||
15356 | JAE search_loop_encodeSnappyBetterBlockAsm | ||
15357 | MOVQ (DX)(SI*1), R9 | ||
15358 | MOVQ (DX)(DI*1), R10 | ||
15359 | SHLQ $0x08, R9 | ||
15360 | IMULQ BX, R9 | ||
15361 | SHRQ $0x2f, R9 | ||
15362 | SHLQ $0x08, R10 | ||
15363 | IMULQ BX, R10 | ||
15364 | SHRQ $0x2f, R10 | ||
15365 | MOVL SI, 24(SP)(R9*4) | ||
15366 | MOVL DI, 24(SP)(R10*4) | ||
15367 | ADDQ $0x02, SI | ||
15368 | ADDQ $0x02, DI | ||
15369 | JMP index_loop_encodeSnappyBetterBlockAsm | ||
15370 | |||
// End of search: the bytes from 12(SP) to len(src) are emitted as one literal.
// Fail (ret = 0) if AX + remaining length + 5 would reach the dst limit.
15371 | emit_remainder_encodeSnappyBetterBlockAsm: | ||
15372 | MOVQ src_len+32(FP), CX | ||
15373 | SUBL 12(SP), CX | ||
15374 | LEAQ 5(AX)(CX*1), CX | ||
15375 | CMPQ CX, (SP) | ||
15376 | JB emit_remainder_ok_encodeSnappyBetterBlockAsm | ||
15377 | MOVQ $0x00000000, ret+48(FP) | ||
15378 | RET | ||
15379 | |||
// Nothing to do when 12(SP) == len(src). Otherwise CX = literal source, SI = length and
// DX (no longer needed as src base) = length - 1, which selects the header form exactly
// as in match_length_ok above.
15380 | emit_remainder_ok_encodeSnappyBetterBlockAsm: | ||
15381 | MOVQ src_len+32(FP), CX | ||
15382 | MOVL 12(SP), BX | ||
15383 | CMPL BX, CX | ||
15384 | JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm | ||
15385 | MOVL CX, SI | ||
15386 | MOVL CX, 12(SP) | ||
15387 | LEAQ (DX)(BX*1), CX | ||
15388 | SUBL BX, SI | ||
15389 | LEAL -1(SI), DX | ||
15390 | CMPL DX, $0x3c | ||
15391 | JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm | ||
15392 | CMPL DX, $0x00000100 | ||
15393 | JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm | ||
15394 | CMPL DX, $0x00010000 | ||
15395 | JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm | ||
15396 | CMPL DX, $0x01000000 | ||
15397 | JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm | ||
15398 | MOVB $0xfc, (AX) | ||
15399 | MOVL DX, 1(AX) | ||
15400 | ADDQ $0x05, AX | ||
15401 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm | ||
15402 | |||
15403 | four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15404 | MOVL DX, BX | ||
15405 | SHRL $0x10, BX | ||
15406 | MOVB $0xf8, (AX) | ||
15407 | MOVW DX, 1(AX) | ||
15408 | MOVB BL, 3(AX) | ||
15409 | ADDQ $0x04, AX | ||
15410 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm | ||
15411 | |||
15412 | three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15413 | MOVB $0xf4, (AX) | ||
15414 | MOVW DX, 1(AX) | ||
15415 | ADDQ $0x03, AX | ||
15416 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm | ||
15417 | |||
15418 | two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15419 | MOVB $0xf0, (AX) | ||
15420 | MOVB DL, 1(AX) | ||
15421 | ADDQ $0x02, AX | ||
15422 | CMPL DX, $0x40 | ||
15423 | JB memmove_emit_remainder_encodeSnappyBetterBlockAsm | ||
15424 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm | ||
15425 | |||
15426 | one_byte_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15427 | SHLB $0x02, DL | ||
15428 | MOVB DL, (AX) | ||
15429 | ADDQ $0x01, AX | ||
15430 | |||
// Short copy of the final literal (BX bytes from CX to AX); DX = dst pointer after it.
// Unlike the in-loop variant, each size class here moves exactly BX bytes (1-2, 3, 4-7,
// 8-16, 17-32, 33-64) using overlapping head/tail moves, so nothing past the end is touched.
15431 | memmove_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15432 | LEAQ (AX)(SI*1), DX | ||
15433 | MOVL SI, BX | ||
15434 | |||
15435 | // genMemMoveShort | ||
15436 | CMPQ BX, $0x03 | ||
15437 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 | ||
15438 | JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 | ||
15439 | CMPQ BX, $0x08 | ||
15440 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 | ||
15441 | CMPQ BX, $0x10 | ||
15442 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 | ||
15443 | CMPQ BX, $0x20 | ||
15444 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 | ||
15445 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 | ||
15446 | |||
15447 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: | ||
15448 | MOVB (CX), SI | ||
15449 | MOVB -1(CX)(BX*1), CL | ||
15450 | MOVB SI, (AX) | ||
15451 | MOVB CL, -1(AX)(BX*1) | ||
15452 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm | ||
15453 | |||
15454 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: | ||
15455 | MOVW (CX), SI | ||
15456 | MOVB 2(CX), CL | ||
15457 | MOVW SI, (AX) | ||
15458 | MOVB CL, 2(AX) | ||
15459 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm | ||
15460 | |||
15461 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: | ||
15462 | MOVL (CX), SI | ||
15463 | MOVL -4(CX)(BX*1), CX | ||
15464 | MOVL SI, (AX) | ||
15465 | MOVL CX, -4(AX)(BX*1) | ||
15466 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm | ||
15467 | |||
15468 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: | ||
15469 | MOVQ (CX), SI | ||
15470 | MOVQ -8(CX)(BX*1), CX | ||
15471 | MOVQ SI, (AX) | ||
15472 | MOVQ CX, -8(AX)(BX*1) | ||
15473 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm | ||
15474 | |||
15475 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: | ||
15476 | MOVOU (CX), X0 | ||
15477 | MOVOU -16(CX)(BX*1), X1 | ||
15478 | MOVOU X0, (AX) | ||
15479 | MOVOU X1, -16(AX)(BX*1) | ||
15480 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm | ||
15481 | |||
15482 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: | ||
15483 | MOVOU (CX), X0 | ||
15484 | MOVOU 16(CX), X1 | ||
15485 | MOVOU -32(CX)(BX*1), X2 | ||
15486 | MOVOU -16(CX)(BX*1), X3 | ||
15487 | MOVOU X0, (AX) | ||
15488 | MOVOU X1, 16(AX) | ||
15489 | MOVOU X2, -32(AX)(BX*1) | ||
15490 | MOVOU X3, -16(AX)(BX*1) | ||
15491 | |||
15492 | memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15493 | MOVQ DX, AX | ||
15494 | JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm | ||
15495 | |||
// Long copy of the final literal: same scheme as the in-loop long copy (head/tail kept in
// X0-X3, middle copied in 32-byte steps with aligned stores), using CX/BX/DI/SI/R8/R9.
// NOTE(review): the JA/JNA after DECQ again depend on CF from the preceding SUBQ/ADDQ.
15496 | memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15497 | LEAQ (AX)(SI*1), DX | ||
15498 | MOVL SI, BX | ||
15499 | |||
15500 | // genMemMoveLong | ||
15501 | MOVOU (CX), X0 | ||
15502 | MOVOU 16(CX), X1 | ||
15503 | MOVOU -32(CX)(BX*1), X2 | ||
15504 | MOVOU -16(CX)(BX*1), X3 | ||
15505 | MOVQ BX, DI | ||
15506 | SHRQ $0x05, DI | ||
15507 | MOVQ AX, SI | ||
15508 | ANDL $0x0000001f, SI | ||
15509 | MOVQ $0x00000040, R8 | ||
15510 | SUBQ SI, R8 | ||
15511 | DECQ DI | ||
15512 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 | ||
15513 | LEAQ -32(CX)(R8*1), SI | ||
15514 | LEAQ -32(AX)(R8*1), R9 | ||
15515 | |||
15516 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: | ||
15517 | MOVOU (SI), X4 | ||
15518 | MOVOU 16(SI), X5 | ||
15519 | MOVOA X4, (R9) | ||
15520 | MOVOA X5, 16(R9) | ||
15521 | ADDQ $0x20, R9 | ||
15522 | ADDQ $0x20, SI | ||
15523 | ADDQ $0x20, R8 | ||
15524 | DECQ DI | ||
15525 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back | ||
15526 | |||
15527 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: | ||
15528 | MOVOU -32(CX)(R8*1), X4 | ||
15529 | MOVOU -16(CX)(R8*1), X5 | ||
15530 | MOVOA X4, -32(AX)(R8*1) | ||
15531 | MOVOA X5, -16(AX)(R8*1) | ||
15532 | ADDQ $0x20, R8 | ||
15533 | CMPQ BX, R8 | ||
15534 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 | ||
15535 | MOVOU X0, (AX) | ||
15536 | MOVOU X1, 16(AX) | ||
15537 | MOVOU X2, -32(AX)(BX*1) | ||
15538 | MOVOU X3, -16(AX)(BX*1) | ||
15539 | MOVQ DX, AX | ||
15540 | |||
// Return the number of bytes written: current write pointer minus dst_base.
15541 | emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: | ||
15542 | MOVQ dst_base+0(FP), CX | ||
15543 | SUBQ CX, AX | ||
15544 | MOVQ AX, ret+48(FP) | ||
15545 | RET | ||
15546 | |||
15547 | // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int | ||
15548 | // Requires: BMI, SSE2 | ||
15549 | TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56 | ||
15550 | MOVQ dst_base+0(FP), AX | ||
15551 | MOVQ $0x00000a00, CX | ||
15552 | LEAQ 24(SP), DX | ||
15553 | PXOR X0, X0 | ||
15554 | |||
15555 | zero_loop_encodeSnappyBetterBlockAsm64K: | ||
15556 | MOVOU X0, (DX) | ||
15557 | MOVOU X0, 16(DX) | ||
15558 | MOVOU X0, 32(DX) | ||
15559 | MOVOU X0, 48(DX) | ||
15560 | MOVOU X0, 64(DX) | ||
15561 | MOVOU X0, 80(DX) | ||
15562 | MOVOU X0, 96(DX) | ||
15563 | MOVOU X0, 112(DX) | ||
15564 | ADDQ $0x80, DX | ||
15565 | DECQ CX | ||
15566 | JNZ zero_loop_encodeSnappyBetterBlockAsm64K | ||
15567 | MOVL $0x00000000, 12(SP) | ||
15568 | MOVQ src_len+32(FP), CX | ||
15569 | LEAQ -9(CX), DX | ||
15570 | LEAQ -8(CX), BX | ||
15571 | MOVL BX, 8(SP) | ||
15572 | SHRQ $0x05, CX | ||
15573 | SUBL CX, DX | ||
15574 | LEAQ (AX)(DX*1), DX | ||
15575 | MOVQ DX, (SP) | ||
15576 | MOVL $0x00000001, CX | ||
15577 | MOVL $0x00000000, 16(SP) | ||
15578 | MOVQ src_base+24(FP), DX | ||
15579 | |||
15580 | search_loop_encodeSnappyBetterBlockAsm64K: | ||
15581 | MOVL CX, BX | ||
15582 | SUBL 12(SP), BX | ||
15583 | SHRL $0x07, BX | ||
15584 | LEAL 1(CX)(BX*1), BX | ||
15585 | CMPL BX, 8(SP) | ||
15586 | JAE emit_remainder_encodeSnappyBetterBlockAsm64K | ||
15587 | MOVQ (DX)(CX*1), SI | ||
15588 | MOVL BX, 20(SP) | ||
15589 | MOVQ $0x00cf1bbcdcbfa563, R8 | ||
15590 | MOVQ $0x9e3779b1, BX | ||
15591 | MOVQ SI, R9 | ||
15592 | MOVQ SI, R10 | ||
15593 | SHLQ $0x08, R9 | ||
15594 | IMULQ R8, R9 | ||
15595 | SHRQ $0x30, R9 | ||
15596 | SHLQ $0x20, R10 | ||
15597 | IMULQ BX, R10 | ||
15598 | SHRQ $0x32, R10 | ||
15599 | MOVL 24(SP)(R9*4), BX | ||
15600 | MOVL 262168(SP)(R10*4), DI | ||
15601 | MOVL CX, 24(SP)(R9*4) | ||
15602 | MOVL CX, 262168(SP)(R10*4) | ||
15603 | MOVQ (DX)(BX*1), R9 | ||
15604 | MOVQ (DX)(DI*1), R10 | ||
15605 | CMPQ R9, SI | ||
15606 | JEQ candidate_match_encodeSnappyBetterBlockAsm64K | ||
15607 | CMPQ R10, SI | ||
15608 | JNE no_short_found_encodeSnappyBetterBlockAsm64K | ||
15609 | MOVL DI, BX | ||
15610 | JMP candidate_match_encodeSnappyBetterBlockAsm64K | ||
15611 | |||
15612 | no_short_found_encodeSnappyBetterBlockAsm64K: | ||
15613 | CMPL R9, SI | ||
15614 | JEQ candidate_match_encodeSnappyBetterBlockAsm64K | ||
15615 | CMPL R10, SI | ||
15616 | JEQ candidateS_match_encodeSnappyBetterBlockAsm64K | ||
15617 | MOVL 20(SP), CX | ||
15618 | JMP search_loop_encodeSnappyBetterBlockAsm64K | ||
15619 | |||
15620 | candidateS_match_encodeSnappyBetterBlockAsm64K: | ||
15621 | SHRQ $0x08, SI | ||
15622 | MOVQ SI, R9 | ||
15623 | SHLQ $0x08, R9 | ||
15624 | IMULQ R8, R9 | ||
15625 | SHRQ $0x30, R9 | ||
15626 | MOVL 24(SP)(R9*4), BX | ||
15627 | INCL CX | ||
15628 | MOVL CX, 24(SP)(R9*4) | ||
15629 | CMPL (DX)(BX*1), SI | ||
15630 | JEQ candidate_match_encodeSnappyBetterBlockAsm64K | ||
15631 | DECL CX | ||
15632 | MOVL DI, BX | ||
15633 | |||
15634 | candidate_match_encodeSnappyBetterBlockAsm64K: | ||
15635 | MOVL 12(SP), SI | ||
15636 | TESTL BX, BX | ||
15637 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K | ||
15638 | |||
15639 | match_extend_back_loop_encodeSnappyBetterBlockAsm64K: | ||
15640 | CMPL CX, SI | ||
15641 | JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K | ||
15642 | MOVB -1(DX)(BX*1), DI | ||
15643 | MOVB -1(DX)(CX*1), R8 | ||
15644 | CMPB DI, R8 | ||
15645 | JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K | ||
15646 | LEAL -1(CX), CX | ||
15647 | DECL BX | ||
15648 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K | ||
15649 | JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K | ||
15650 | |||
15651 | match_extend_back_end_encodeSnappyBetterBlockAsm64K: | ||
15652 | MOVL CX, SI | ||
15653 | SUBL 12(SP), SI | ||
15654 | LEAQ 3(AX)(SI*1), SI | ||
15655 | CMPQ SI, (SP) | ||
15656 | JB match_dst_size_check_encodeSnappyBetterBlockAsm64K | ||
15657 | MOVQ $0x00000000, ret+48(FP) | ||
15658 | RET | ||
15659 | |||
15660 | match_dst_size_check_encodeSnappyBetterBlockAsm64K: | ||
15661 | MOVL CX, SI | ||
15662 | ADDL $0x04, CX | ||
15663 | ADDL $0x04, BX | ||
15664 | MOVQ src_len+32(FP), DI | ||
15665 | SUBL CX, DI | ||
15666 | LEAQ (DX)(CX*1), R8 | ||
15667 | LEAQ (DX)(BX*1), R9 | ||
15668 | |||
15669 | // matchLen | ||
15670 | XORL R11, R11 | ||
15671 | |||
15672 | matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15673 | CMPL DI, $0x10 | ||
15674 | JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15675 | MOVQ (R8)(R11*1), R10 | ||
15676 | MOVQ 8(R8)(R11*1), R12 | ||
15677 | XORQ (R9)(R11*1), R10 | ||
15678 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15679 | XORQ 8(R9)(R11*1), R12 | ||
15680 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K | ||
15681 | LEAL -16(DI), DI | ||
15682 | LEAL 16(R11), R11 | ||
15683 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15684 | |||
15685 | matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15686 | #ifdef GOAMD64_v3 | ||
15687 | TZCNTQ R12, R12 | ||
15688 | |||
15689 | #else | ||
15690 | BSFQ R12, R12 | ||
15691 | |||
15692 | #endif | ||
15693 | SARQ $0x03, R12 | ||
15694 | LEAL 8(R11)(R12*1), R11 | ||
15695 | JMP match_nolit_end_encodeSnappyBetterBlockAsm64K | ||
15696 | |||
15697 | matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15698 | CMPL DI, $0x08 | ||
15699 | JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15700 | MOVQ (R8)(R11*1), R10 | ||
15701 | XORQ (R9)(R11*1), R10 | ||
15702 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15703 | LEAL -8(DI), DI | ||
15704 | LEAL 8(R11), R11 | ||
15705 | JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15706 | |||
15707 | matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15708 | #ifdef GOAMD64_v3 | ||
15709 | TZCNTQ R10, R10 | ||
15710 | |||
15711 | #else | ||
15712 | BSFQ R10, R10 | ||
15713 | |||
15714 | #endif | ||
15715 | SARQ $0x03, R10 | ||
15716 | LEAL (R11)(R10*1), R11 | ||
15717 | JMP match_nolit_end_encodeSnappyBetterBlockAsm64K | ||
15718 | |||
15719 | matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15720 | CMPL DI, $0x04 | ||
15721 | JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15722 | MOVL (R8)(R11*1), R10 | ||
15723 | CMPL (R9)(R11*1), R10 | ||
15724 | JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15725 | LEAL -4(DI), DI | ||
15726 | LEAL 4(R11), R11 | ||
15727 | |||
15728 | matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15729 | CMPL DI, $0x01 | ||
15730 | JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15731 | JB match_nolit_end_encodeSnappyBetterBlockAsm64K | ||
15732 | MOVW (R8)(R11*1), R10 | ||
15733 | CMPW (R9)(R11*1), R10 | ||
15734 | JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15735 | LEAL 2(R11), R11 | ||
15736 | SUBL $0x02, DI | ||
15737 | JZ match_nolit_end_encodeSnappyBetterBlockAsm64K | ||
15738 | |||
15739 | matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15740 | MOVB (R8)(R11*1), R10 | ||
15741 | CMPB (R9)(R11*1), R10 | ||
15742 | JNE match_nolit_end_encodeSnappyBetterBlockAsm64K | ||
15743 | LEAL 1(R11), R11 | ||
15744 | |||
15745 | match_nolit_end_encodeSnappyBetterBlockAsm64K: | ||
15746 | MOVL CX, DI | ||
15747 | SUBL BX, DI | ||
15748 | |||
15749 | // Check if repeat | ||
15750 | MOVL DI, 16(SP) | ||
15751 | MOVL 12(SP), BX | ||
15752 | CMPL BX, SI | ||
15753 | JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K | ||
15754 | MOVL SI, R8 | ||
15755 | MOVL SI, 12(SP) | ||
15756 | LEAQ (DX)(BX*1), R9 | ||
15757 | SUBL BX, R8 | ||
15758 | LEAL -1(R8), BX | ||
15759 | CMPL BX, $0x3c | ||
15760 | JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K | ||
15761 | CMPL BX, $0x00000100 | ||
15762 | JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K | ||
15763 | JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K | ||
15764 | |||
15765 | three_bytes_match_emit_encodeSnappyBetterBlockAsm64K: | ||
15766 | MOVB $0xf4, (AX) | ||
15767 | MOVW BX, 1(AX) | ||
15768 | ADDQ $0x03, AX | ||
15769 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K | ||
15770 | |||
15771 | two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: | ||
15772 | MOVB $0xf0, (AX) | ||
15773 | MOVB BL, 1(AX) | ||
15774 | ADDQ $0x02, AX | ||
15775 | CMPL BX, $0x40 | ||
15776 | JB memmove_match_emit_encodeSnappyBetterBlockAsm64K | ||
15777 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K | ||
15778 | |||
15779 | one_byte_match_emit_encodeSnappyBetterBlockAsm64K: | ||
15780 | SHLB $0x02, BL | ||
15781 | MOVB BL, (AX) | ||
15782 | ADDQ $0x01, AX | ||
15783 | |||
15784 | memmove_match_emit_encodeSnappyBetterBlockAsm64K: | ||
15785 | LEAQ (AX)(R8*1), BX | ||
15786 | |||
15787 | // genMemMoveShort | ||
15788 | CMPQ R8, $0x08 | ||
15789 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 | ||
15790 | CMPQ R8, $0x10 | ||
15791 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 | ||
15792 | CMPQ R8, $0x20 | ||
15793 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 | ||
15794 | JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 | ||
15795 | |||
15796 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: | ||
15797 | MOVQ (R9), R10 | ||
15798 | MOVQ R10, (AX) | ||
15799 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K | ||
15800 | |||
15801 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: | ||
15802 | MOVQ (R9), R10 | ||
15803 | MOVQ -8(R9)(R8*1), R9 | ||
15804 | MOVQ R10, (AX) | ||
15805 | MOVQ R9, -8(AX)(R8*1) | ||
15806 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K | ||
15807 | |||
15808 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: | ||
15809 | MOVOU (R9), X0 | ||
15810 | MOVOU -16(R9)(R8*1), X1 | ||
15811 | MOVOU X0, (AX) | ||
15812 | MOVOU X1, -16(AX)(R8*1) | ||
15813 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K | ||
15814 | |||
15815 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: | ||
15816 | MOVOU (R9), X0 | ||
15817 | MOVOU 16(R9), X1 | ||
15818 | MOVOU -32(R9)(R8*1), X2 | ||
15819 | MOVOU -16(R9)(R8*1), X3 | ||
15820 | MOVOU X0, (AX) | ||
15821 | MOVOU X1, 16(AX) | ||
15822 | MOVOU X2, -32(AX)(R8*1) | ||
15823 | MOVOU X3, -16(AX)(R8*1) | ||
15824 | |||
15825 | memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: | ||
15826 | MOVQ BX, AX | ||
15827 | JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K | ||
15828 | |||
15829 | memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: | ||
15830 | LEAQ (AX)(R8*1), BX | ||
15831 | |||
15832 | // genMemMoveLong | ||
15833 | MOVOU (R9), X0 | ||
15834 | MOVOU 16(R9), X1 | ||
15835 | MOVOU -32(R9)(R8*1), X2 | ||
15836 | MOVOU -16(R9)(R8*1), X3 | ||
15837 | MOVQ R8, R12 | ||
15838 | SHRQ $0x05, R12 | ||
15839 | MOVQ AX, R10 | ||
15840 | ANDL $0x0000001f, R10 | ||
15841 | MOVQ $0x00000040, R13 | ||
15842 | SUBQ R10, R13 | ||
15843 | DECQ R12 | ||
15844 | JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 | ||
15845 | LEAQ -32(R9)(R13*1), R10 | ||
15846 | LEAQ -32(AX)(R13*1), R14 | ||
15847 | |||
15848 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: | ||
15849 | MOVOU (R10), X4 | ||
15850 | MOVOU 16(R10), X5 | ||
15851 | MOVOA X4, (R14) | ||
15852 | MOVOA X5, 16(R14) | ||
15853 | ADDQ $0x20, R14 | ||
15854 | ADDQ $0x20, R10 | ||
15855 | ADDQ $0x20, R13 | ||
15856 | DECQ R12 | ||
15857 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back | ||
15858 | |||
15859 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: | ||
15860 | MOVOU -32(R9)(R13*1), X4 | ||
15861 | MOVOU -16(R9)(R13*1), X5 | ||
15862 | MOVOA X4, -32(AX)(R13*1) | ||
15863 | MOVOA X5, -16(AX)(R13*1) | ||
15864 | ADDQ $0x20, R13 | ||
15865 | CMPQ R8, R13 | ||
15866 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 | ||
15867 | MOVOU X0, (AX) | ||
15868 | MOVOU X1, 16(AX) | ||
15869 | MOVOU X2, -32(AX)(R8*1) | ||
15870 | MOVOU X3, -16(AX)(R8*1) | ||
15871 | MOVQ BX, AX | ||
15872 | |||
15873 | emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: | ||
15874 | ADDL R11, CX | ||
15875 | ADDL $0x04, R11 | ||
15876 | MOVL CX, 12(SP) | ||
15877 | |||
15878 | // emitCopy | ||
15879 | two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15880 | CMPL R11, $0x40 | ||
15881 | JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15882 | MOVB $0xee, (AX) | ||
15883 | MOVW DI, 1(AX) | ||
15884 | LEAL -60(R11), R11 | ||
15885 | ADDQ $0x03, AX | ||
15886 | JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15887 | |||
15888 | two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15889 | MOVL R11, BX | ||
15890 | SHLL $0x02, BX | ||
15891 | CMPL R11, $0x0c | ||
15892 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15893 | CMPL DI, $0x00000800 | ||
15894 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K | ||
15895 | LEAL -15(BX), BX | ||
15896 | MOVB DI, 1(AX) | ||
15897 | SHRL $0x08, DI | ||
15898 | SHLL $0x05, DI | ||
15899 | ORL DI, BX | ||
15900 | MOVB BL, (AX) | ||
15901 | ADDQ $0x02, AX | ||
15902 | JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K | ||
15903 | |||
15904 | emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: | ||
15905 | LEAL -2(BX), BX | ||
15906 | MOVB BL, (AX) | ||
15907 | MOVW DI, 1(AX) | ||
15908 | ADDQ $0x03, AX | ||
15909 | |||
15910 | match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: | ||
15911 | CMPL CX, 8(SP) | ||
15912 | JAE emit_remainder_encodeSnappyBetterBlockAsm64K | ||
15913 | CMPQ AX, (SP) | ||
15914 | JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K | ||
15915 | MOVQ $0x00000000, ret+48(FP) | ||
15916 | RET | ||
15917 | |||
15918 | match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: | ||
15919 | MOVQ $0x00cf1bbcdcbfa563, BX | ||
15920 | MOVQ $0x9e3779b1, DI | ||
15921 | LEAQ 1(SI), SI | ||
15922 | LEAQ -2(CX), R8 | ||
15923 | MOVQ (DX)(SI*1), R9 | ||
15924 | MOVQ 1(DX)(SI*1), R10 | ||
15925 | MOVQ (DX)(R8*1), R11 | ||
15926 | MOVQ 1(DX)(R8*1), R12 | ||
15927 | SHLQ $0x08, R9 | ||
15928 | IMULQ BX, R9 | ||
15929 | SHRQ $0x30, R9 | ||
15930 | SHLQ $0x20, R10 | ||
15931 | IMULQ DI, R10 | ||
15932 | SHRQ $0x32, R10 | ||
15933 | SHLQ $0x08, R11 | ||
15934 | IMULQ BX, R11 | ||
15935 | SHRQ $0x30, R11 | ||
15936 | SHLQ $0x20, R12 | ||
15937 | IMULQ DI, R12 | ||
15938 | SHRQ $0x32, R12 | ||
15939 | LEAQ 1(SI), DI | ||
15940 | LEAQ 1(R8), R13 | ||
15941 | MOVL SI, 24(SP)(R9*4) | ||
15942 | MOVL R8, 24(SP)(R11*4) | ||
15943 | MOVL DI, 262168(SP)(R10*4) | ||
15944 | MOVL R13, 262168(SP)(R12*4) | ||
15945 | LEAQ 1(R8)(SI*1), DI | ||
15946 | SHRQ $0x01, DI | ||
15947 | ADDQ $0x01, SI | ||
15948 | SUBQ $0x01, R8 | ||
15949 | |||
15950 | index_loop_encodeSnappyBetterBlockAsm64K: | ||
15951 | CMPQ DI, R8 | ||
15952 | JAE search_loop_encodeSnappyBetterBlockAsm64K | ||
15953 | MOVQ (DX)(SI*1), R9 | ||
15954 | MOVQ (DX)(DI*1), R10 | ||
15955 | SHLQ $0x08, R9 | ||
15956 | IMULQ BX, R9 | ||
15957 | SHRQ $0x30, R9 | ||
15958 | SHLQ $0x08, R10 | ||
15959 | IMULQ BX, R10 | ||
15960 | SHRQ $0x30, R10 | ||
15961 | MOVL SI, 24(SP)(R9*4) | ||
15962 | MOVL DI, 24(SP)(R10*4) | ||
15963 | ADDQ $0x02, SI | ||
15964 | ADDQ $0x02, DI | ||
15965 | JMP index_loop_encodeSnappyBetterBlockAsm64K | ||
15966 | |||
15967 | emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
15968 | MOVQ src_len+32(FP), CX | ||
15969 | SUBL 12(SP), CX | ||
15970 | LEAQ 3(AX)(CX*1), CX | ||
15971 | CMPQ CX, (SP) | ||
15972 | JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K | ||
15973 | MOVQ $0x00000000, ret+48(FP) | ||
15974 | RET | ||
15975 | |||
15976 | emit_remainder_ok_encodeSnappyBetterBlockAsm64K: | ||
15977 | MOVQ src_len+32(FP), CX | ||
15978 | MOVL 12(SP), BX | ||
15979 | CMPL BX, CX | ||
15980 | JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
15981 | MOVL CX, SI | ||
15982 | MOVL CX, 12(SP) | ||
15983 | LEAQ (DX)(BX*1), CX | ||
15984 | SUBL BX, SI | ||
15985 | LEAL -1(SI), DX | ||
15986 | CMPL DX, $0x3c | ||
15987 | JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
15988 | CMPL DX, $0x00000100 | ||
15989 | JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
15990 | JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
15991 | |||
15992 | three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
15993 | MOVB $0xf4, (AX) | ||
15994 | MOVW DX, 1(AX) | ||
15995 | ADDQ $0x03, AX | ||
15996 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
15997 | |||
15998 | two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
15999 | MOVB $0xf0, (AX) | ||
16000 | MOVB DL, 1(AX) | ||
16001 | ADDQ $0x02, AX | ||
16002 | CMPL DX, $0x40 | ||
16003 | JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16004 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16005 | |||
16006 | one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
16007 | SHLB $0x02, DL | ||
16008 | MOVB DL, (AX) | ||
16009 | ADDQ $0x01, AX | ||
16010 | |||
16011 | memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
16012 | LEAQ (AX)(SI*1), DX | ||
16013 | MOVL SI, BX | ||
16014 | |||
16015 | // genMemMoveShort | ||
16016 | CMPQ BX, $0x03 | ||
16017 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 | ||
16018 | JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 | ||
16019 | CMPQ BX, $0x08 | ||
16020 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 | ||
16021 | CMPQ BX, $0x10 | ||
16022 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 | ||
16023 | CMPQ BX, $0x20 | ||
16024 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 | ||
16025 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 | ||
16026 | |||
16027 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: | ||
16028 | MOVB (CX), SI | ||
16029 | MOVB -1(CX)(BX*1), CL | ||
16030 | MOVB SI, (AX) | ||
16031 | MOVB CL, -1(AX)(BX*1) | ||
16032 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16033 | |||
16034 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: | ||
16035 | MOVW (CX), SI | ||
16036 | MOVB 2(CX), CL | ||
16037 | MOVW SI, (AX) | ||
16038 | MOVB CL, 2(AX) | ||
16039 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16040 | |||
16041 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: | ||
16042 | MOVL (CX), SI | ||
16043 | MOVL -4(CX)(BX*1), CX | ||
16044 | MOVL SI, (AX) | ||
16045 | MOVL CX, -4(AX)(BX*1) | ||
16046 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16047 | |||
16048 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: | ||
16049 | MOVQ (CX), SI | ||
16050 | MOVQ -8(CX)(BX*1), CX | ||
16051 | MOVQ SI, (AX) | ||
16052 | MOVQ CX, -8(AX)(BX*1) | ||
16053 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16054 | |||
16055 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: | ||
16056 | MOVOU (CX), X0 | ||
16057 | MOVOU -16(CX)(BX*1), X1 | ||
16058 | MOVOU X0, (AX) | ||
16059 | MOVOU X1, -16(AX)(BX*1) | ||
16060 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16061 | |||
16062 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: | ||
16063 | MOVOU (CX), X0 | ||
16064 | MOVOU 16(CX), X1 | ||
16065 | MOVOU -32(CX)(BX*1), X2 | ||
16066 | MOVOU -16(CX)(BX*1), X3 | ||
16067 | MOVOU X0, (AX) | ||
16068 | MOVOU X1, 16(AX) | ||
16069 | MOVOU X2, -32(AX)(BX*1) | ||
16070 | MOVOU X3, -16(AX)(BX*1) | ||
16071 | |||
16072 | memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
16073 | MOVQ DX, AX | ||
16074 | JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K | ||
16075 | |||
16076 | memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
16077 | LEAQ (AX)(SI*1), DX | ||
16078 | MOVL SI, BX | ||
16079 | |||
16080 | // genMemMoveLong | ||
16081 | MOVOU (CX), X0 | ||
16082 | MOVOU 16(CX), X1 | ||
16083 | MOVOU -32(CX)(BX*1), X2 | ||
16084 | MOVOU -16(CX)(BX*1), X3 | ||
16085 | MOVQ BX, DI | ||
16086 | SHRQ $0x05, DI | ||
16087 | MOVQ AX, SI | ||
16088 | ANDL $0x0000001f, SI | ||
16089 | MOVQ $0x00000040, R8 | ||
16090 | SUBQ SI, R8 | ||
16091 | DECQ DI | ||
16092 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 | ||
16093 | LEAQ -32(CX)(R8*1), SI | ||
16094 | LEAQ -32(AX)(R8*1), R9 | ||
16095 | |||
16096 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: | ||
16097 | MOVOU (SI), X4 | ||
16098 | MOVOU 16(SI), X5 | ||
16099 | MOVOA X4, (R9) | ||
16100 | MOVOA X5, 16(R9) | ||
16101 | ADDQ $0x20, R9 | ||
16102 | ADDQ $0x20, SI | ||
16103 | ADDQ $0x20, R8 | ||
16104 | DECQ DI | ||
16105 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back | ||
16106 | |||
16107 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: | ||
16108 | MOVOU -32(CX)(R8*1), X4 | ||
16109 | MOVOU -16(CX)(R8*1), X5 | ||
16110 | MOVOA X4, -32(AX)(R8*1) | ||
16111 | MOVOA X5, -16(AX)(R8*1) | ||
16112 | ADDQ $0x20, R8 | ||
16113 | CMPQ BX, R8 | ||
16114 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 | ||
16115 | MOVOU X0, (AX) | ||
16116 | MOVOU X1, 16(AX) | ||
16117 | MOVOU X2, -32(AX)(BX*1) | ||
16118 | MOVOU X3, -16(AX)(BX*1) | ||
16119 | MOVQ DX, AX | ||
16120 | |||
16121 | emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: | ||
16122 | MOVQ dst_base+0(FP), CX | ||
16123 | SUBQ CX, AX | ||
16124 | MOVQ AX, ret+48(FP) | ||
16125 | RET | ||
16126 | |||
16127 | // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int | ||
16128 | // Requires: BMI, SSE2 | ||
16129 | TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 | ||
16130 | MOVQ dst_base+0(FP), AX | ||
16131 | MOVQ $0x00000280, CX | ||
16132 | LEAQ 24(SP), DX | ||
16133 | PXOR X0, X0 | ||
16134 | |||
16135 | zero_loop_encodeSnappyBetterBlockAsm12B: | ||
16136 | MOVOU X0, (DX) | ||
16137 | MOVOU X0, 16(DX) | ||
16138 | MOVOU X0, 32(DX) | ||
16139 | MOVOU X0, 48(DX) | ||
16140 | MOVOU X0, 64(DX) | ||
16141 | MOVOU X0, 80(DX) | ||
16142 | MOVOU X0, 96(DX) | ||
16143 | MOVOU X0, 112(DX) | ||
16144 | ADDQ $0x80, DX | ||
16145 | DECQ CX | ||
16146 | JNZ zero_loop_encodeSnappyBetterBlockAsm12B | ||
16147 | MOVL $0x00000000, 12(SP) | ||
16148 | MOVQ src_len+32(FP), CX | ||
16149 | LEAQ -9(CX), DX | ||
16150 | LEAQ -8(CX), BX | ||
16151 | MOVL BX, 8(SP) | ||
16152 | SHRQ $0x05, CX | ||
16153 | SUBL CX, DX | ||
16154 | LEAQ (AX)(DX*1), DX | ||
16155 | MOVQ DX, (SP) | ||
16156 | MOVL $0x00000001, CX | ||
16157 | MOVL $0x00000000, 16(SP) | ||
16158 | MOVQ src_base+24(FP), DX | ||
16159 | |||
16160 | search_loop_encodeSnappyBetterBlockAsm12B: | ||
16161 | MOVL CX, BX | ||
16162 | SUBL 12(SP), BX | ||
16163 | SHRL $0x06, BX | ||
16164 | LEAL 1(CX)(BX*1), BX | ||
16165 | CMPL BX, 8(SP) | ||
16166 | JAE emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16167 | MOVQ (DX)(CX*1), SI | ||
16168 | MOVL BX, 20(SP) | ||
16169 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
16170 | MOVQ $0x9e3779b1, BX | ||
16171 | MOVQ SI, R9 | ||
16172 | MOVQ SI, R10 | ||
16173 | SHLQ $0x10, R9 | ||
16174 | IMULQ R8, R9 | ||
16175 | SHRQ $0x32, R9 | ||
16176 | SHLQ $0x20, R10 | ||
16177 | IMULQ BX, R10 | ||
16178 | SHRQ $0x34, R10 | ||
16179 | MOVL 24(SP)(R9*4), BX | ||
16180 | MOVL 65560(SP)(R10*4), DI | ||
16181 | MOVL CX, 24(SP)(R9*4) | ||
16182 | MOVL CX, 65560(SP)(R10*4) | ||
16183 | MOVQ (DX)(BX*1), R9 | ||
16184 | MOVQ (DX)(DI*1), R10 | ||
16185 | CMPQ R9, SI | ||
16186 | JEQ candidate_match_encodeSnappyBetterBlockAsm12B | ||
16187 | CMPQ R10, SI | ||
16188 | JNE no_short_found_encodeSnappyBetterBlockAsm12B | ||
16189 | MOVL DI, BX | ||
16190 | JMP candidate_match_encodeSnappyBetterBlockAsm12B | ||
16191 | |||
16192 | no_short_found_encodeSnappyBetterBlockAsm12B: | ||
16193 | CMPL R9, SI | ||
16194 | JEQ candidate_match_encodeSnappyBetterBlockAsm12B | ||
16195 | CMPL R10, SI | ||
16196 | JEQ candidateS_match_encodeSnappyBetterBlockAsm12B | ||
16197 | MOVL 20(SP), CX | ||
16198 | JMP search_loop_encodeSnappyBetterBlockAsm12B | ||
16199 | |||
16200 | candidateS_match_encodeSnappyBetterBlockAsm12B: | ||
16201 | SHRQ $0x08, SI | ||
16202 | MOVQ SI, R9 | ||
16203 | SHLQ $0x10, R9 | ||
16204 | IMULQ R8, R9 | ||
16205 | SHRQ $0x32, R9 | ||
16206 | MOVL 24(SP)(R9*4), BX | ||
16207 | INCL CX | ||
16208 | MOVL CX, 24(SP)(R9*4) | ||
16209 | CMPL (DX)(BX*1), SI | ||
16210 | JEQ candidate_match_encodeSnappyBetterBlockAsm12B | ||
16211 | DECL CX | ||
16212 | MOVL DI, BX | ||
16213 | |||
16214 | candidate_match_encodeSnappyBetterBlockAsm12B: | ||
16215 | MOVL 12(SP), SI | ||
16216 | TESTL BX, BX | ||
16217 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B | ||
16218 | |||
16219 | match_extend_back_loop_encodeSnappyBetterBlockAsm12B: | ||
16220 | CMPL CX, SI | ||
16221 | JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B | ||
16222 | MOVB -1(DX)(BX*1), DI | ||
16223 | MOVB -1(DX)(CX*1), R8 | ||
16224 | CMPB DI, R8 | ||
16225 | JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B | ||
16226 | LEAL -1(CX), CX | ||
16227 | DECL BX | ||
16228 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B | ||
16229 | JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B | ||
16230 | |||
16231 | match_extend_back_end_encodeSnappyBetterBlockAsm12B: | ||
16232 | MOVL CX, SI | ||
16233 | SUBL 12(SP), SI | ||
16234 | LEAQ 3(AX)(SI*1), SI | ||
16235 | CMPQ SI, (SP) | ||
16236 | JB match_dst_size_check_encodeSnappyBetterBlockAsm12B | ||
16237 | MOVQ $0x00000000, ret+48(FP) | ||
16238 | RET | ||
16239 | |||
16240 | match_dst_size_check_encodeSnappyBetterBlockAsm12B: | ||
16241 | MOVL CX, SI | ||
16242 | ADDL $0x04, CX | ||
16243 | ADDL $0x04, BX | ||
16244 | MOVQ src_len+32(FP), DI | ||
16245 | SUBL CX, DI | ||
16246 | LEAQ (DX)(CX*1), R8 | ||
16247 | LEAQ (DX)(BX*1), R9 | ||
16248 | |||
16249 | // matchLen | ||
16250 | XORL R11, R11 | ||
16251 | |||
16252 | matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16253 | CMPL DI, $0x10 | ||
16254 | JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16255 | MOVQ (R8)(R11*1), R10 | ||
16256 | MOVQ 8(R8)(R11*1), R12 | ||
16257 | XORQ (R9)(R11*1), R10 | ||
16258 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16259 | XORQ 8(R9)(R11*1), R12 | ||
16260 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B | ||
16261 | LEAL -16(DI), DI | ||
16262 | LEAL 16(R11), R11 | ||
16263 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16264 | |||
16265 | matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16266 | #ifdef GOAMD64_v3 | ||
16267 | TZCNTQ R12, R12 | ||
16268 | |||
16269 | #else | ||
16270 | BSFQ R12, R12 | ||
16271 | |||
16272 | #endif | ||
16273 | SARQ $0x03, R12 | ||
16274 | LEAL 8(R11)(R12*1), R11 | ||
16275 | JMP match_nolit_end_encodeSnappyBetterBlockAsm12B | ||
16276 | |||
16277 | matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16278 | CMPL DI, $0x08 | ||
16279 | JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16280 | MOVQ (R8)(R11*1), R10 | ||
16281 | XORQ (R9)(R11*1), R10 | ||
16282 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16283 | LEAL -8(DI), DI | ||
16284 | LEAL 8(R11), R11 | ||
16285 | JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16286 | |||
16287 | matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16288 | #ifdef GOAMD64_v3 | ||
16289 | TZCNTQ R10, R10 | ||
16290 | |||
16291 | #else | ||
16292 | BSFQ R10, R10 | ||
16293 | |||
16294 | #endif | ||
16295 | SARQ $0x03, R10 | ||
16296 | LEAL (R11)(R10*1), R11 | ||
16297 | JMP match_nolit_end_encodeSnappyBetterBlockAsm12B | ||
16298 | |||
16299 | matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16300 | CMPL DI, $0x04 | ||
16301 | JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16302 | MOVL (R8)(R11*1), R10 | ||
16303 | CMPL (R9)(R11*1), R10 | ||
16304 | JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16305 | LEAL -4(DI), DI | ||
16306 | LEAL 4(R11), R11 | ||
16307 | |||
16308 | matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16309 | CMPL DI, $0x01 | ||
16310 | JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16311 | JB match_nolit_end_encodeSnappyBetterBlockAsm12B | ||
16312 | MOVW (R8)(R11*1), R10 | ||
16313 | CMPW (R9)(R11*1), R10 | ||
16314 | JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16315 | LEAL 2(R11), R11 | ||
16316 | SUBL $0x02, DI | ||
16317 | JZ match_nolit_end_encodeSnappyBetterBlockAsm12B | ||
16318 | |||
16319 | matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16320 | MOVB (R8)(R11*1), R10 | ||
16321 | CMPB (R9)(R11*1), R10 | ||
16322 | JNE match_nolit_end_encodeSnappyBetterBlockAsm12B | ||
16323 | LEAL 1(R11), R11 | ||
16324 | |||
16325 | match_nolit_end_encodeSnappyBetterBlockAsm12B: | ||
16326 | MOVL CX, DI | ||
16327 | SUBL BX, DI | ||
16328 | |||
16329 | // Check if repeat | ||
16330 | MOVL DI, 16(SP) | ||
16331 | MOVL 12(SP), BX | ||
16332 | CMPL BX, SI | ||
16333 | JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B | ||
16334 | MOVL SI, R8 | ||
16335 | MOVL SI, 12(SP) | ||
16336 | LEAQ (DX)(BX*1), R9 | ||
16337 | SUBL BX, R8 | ||
16338 | LEAL -1(R8), BX | ||
16339 | CMPL BX, $0x3c | ||
16340 | JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B | ||
16341 | CMPL BX, $0x00000100 | ||
16342 | JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B | ||
16343 | JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B | ||
16344 | |||
16345 | three_bytes_match_emit_encodeSnappyBetterBlockAsm12B: | ||
16346 | MOVB $0xf4, (AX) | ||
16347 | MOVW BX, 1(AX) | ||
16348 | ADDQ $0x03, AX | ||
16349 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B | ||
16350 | |||
16351 | two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: | ||
16352 | MOVB $0xf0, (AX) | ||
16353 | MOVB BL, 1(AX) | ||
16354 | ADDQ $0x02, AX | ||
16355 | CMPL BX, $0x40 | ||
16356 | JB memmove_match_emit_encodeSnappyBetterBlockAsm12B | ||
16357 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B | ||
16358 | |||
16359 | one_byte_match_emit_encodeSnappyBetterBlockAsm12B: | ||
16360 | SHLB $0x02, BL | ||
16361 | MOVB BL, (AX) | ||
16362 | ADDQ $0x01, AX | ||
16363 | |||
16364 | memmove_match_emit_encodeSnappyBetterBlockAsm12B: | ||
16365 | LEAQ (AX)(R8*1), BX | ||
16366 | |||
16367 | // genMemMoveShort | ||
16368 | CMPQ R8, $0x08 | ||
16369 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 | ||
16370 | CMPQ R8, $0x10 | ||
16371 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 | ||
16372 | CMPQ R8, $0x20 | ||
16373 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 | ||
16374 | JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 | ||
16375 | |||
16376 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: | ||
16377 | MOVQ (R9), R10 | ||
16378 | MOVQ R10, (AX) | ||
16379 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B | ||
16380 | |||
16381 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: | ||
16382 | MOVQ (R9), R10 | ||
16383 | MOVQ -8(R9)(R8*1), R9 | ||
16384 | MOVQ R10, (AX) | ||
16385 | MOVQ R9, -8(AX)(R8*1) | ||
16386 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B | ||
16387 | |||
16388 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: | ||
16389 | MOVOU (R9), X0 | ||
16390 | MOVOU -16(R9)(R8*1), X1 | ||
16391 | MOVOU X0, (AX) | ||
16392 | MOVOU X1, -16(AX)(R8*1) | ||
16393 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B | ||
16394 | |||
16395 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: | ||
16396 | MOVOU (R9), X0 | ||
16397 | MOVOU 16(R9), X1 | ||
16398 | MOVOU -32(R9)(R8*1), X2 | ||
16399 | MOVOU -16(R9)(R8*1), X3 | ||
16400 | MOVOU X0, (AX) | ||
16401 | MOVOU X1, 16(AX) | ||
16402 | MOVOU X2, -32(AX)(R8*1) | ||
16403 | MOVOU X3, -16(AX)(R8*1) | ||
16404 | |||
16405 | memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: | ||
16406 | MOVQ BX, AX | ||
16407 | JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B | ||
16408 | |||
16409 | memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: | ||
16410 | LEAQ (AX)(R8*1), BX | ||
16411 | |||
16412 | // genMemMoveLong | ||
16413 | MOVOU (R9), X0 | ||
16414 | MOVOU 16(R9), X1 | ||
16415 | MOVOU -32(R9)(R8*1), X2 | ||
16416 | MOVOU -16(R9)(R8*1), X3 | ||
16417 | MOVQ R8, R12 | ||
16418 | SHRQ $0x05, R12 | ||
16419 | MOVQ AX, R10 | ||
16420 | ANDL $0x0000001f, R10 | ||
16421 | MOVQ $0x00000040, R13 | ||
16422 | SUBQ R10, R13 | ||
16423 | DECQ R12 | ||
16424 | JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
16425 | LEAQ -32(R9)(R13*1), R10 | ||
16426 | LEAQ -32(AX)(R13*1), R14 | ||
16427 | |||
16428 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: | ||
16429 | MOVOU (R10), X4 | ||
16430 | MOVOU 16(R10), X5 | ||
16431 | MOVOA X4, (R14) | ||
16432 | MOVOA X5, 16(R14) | ||
16433 | ADDQ $0x20, R14 | ||
16434 | ADDQ $0x20, R10 | ||
16435 | ADDQ $0x20, R13 | ||
16436 | DECQ R12 | ||
16437 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back | ||
16438 | |||
16439 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: | ||
16440 | MOVOU -32(R9)(R13*1), X4 | ||
16441 | MOVOU -16(R9)(R13*1), X5 | ||
16442 | MOVOA X4, -32(AX)(R13*1) | ||
16443 | MOVOA X5, -16(AX)(R13*1) | ||
16444 | ADDQ $0x20, R13 | ||
16445 | CMPQ R8, R13 | ||
16446 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
16447 | MOVOU X0, (AX) | ||
16448 | MOVOU X1, 16(AX) | ||
16449 | MOVOU X2, -32(AX)(R8*1) | ||
16450 | MOVOU X3, -16(AX)(R8*1) | ||
16451 | MOVQ BX, AX | ||
16452 | |||
16453 | emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: | ||
16454 | ADDL R11, CX | ||
16455 | ADDL $0x04, R11 | ||
16456 | MOVL CX, 12(SP) | ||
16457 | |||
16458 | // emitCopy | ||
16459 | two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16460 | CMPL R11, $0x40 | ||
16461 | JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16462 | MOVB $0xee, (AX) | ||
16463 | MOVW DI, 1(AX) | ||
16464 | LEAL -60(R11), R11 | ||
16465 | ADDQ $0x03, AX | ||
16466 | JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16467 | |||
16468 | two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16469 | MOVL R11, BX | ||
16470 | SHLL $0x02, BX | ||
16471 | CMPL R11, $0x0c | ||
16472 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16473 | CMPL DI, $0x00000800 | ||
16474 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B | ||
16475 | LEAL -15(BX), BX | ||
16476 | MOVB DI, 1(AX) | ||
16477 | SHRL $0x08, DI | ||
16478 | SHLL $0x05, DI | ||
16479 | ORL DI, BX | ||
16480 | MOVB BL, (AX) | ||
16481 | ADDQ $0x02, AX | ||
16482 | JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B | ||
16483 | |||
16484 | emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: | ||
16485 | LEAL -2(BX), BX | ||
16486 | MOVB BL, (AX) | ||
16487 | MOVW DI, 1(AX) | ||
16488 | ADDQ $0x03, AX | ||
16489 | |||
16490 | match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: | ||
16491 | CMPL CX, 8(SP) | ||
16492 | JAE emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16493 | CMPQ AX, (SP) | ||
16494 | JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B | ||
16495 | MOVQ $0x00000000, ret+48(FP) | ||
16496 | RET | ||
16497 | |||
16498 | match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: | ||
16499 | MOVQ $0x0000cf1bbcdcbf9b, BX | ||
16500 | MOVQ $0x9e3779b1, DI | ||
16501 | LEAQ 1(SI), SI | ||
16502 | LEAQ -2(CX), R8 | ||
16503 | MOVQ (DX)(SI*1), R9 | ||
16504 | MOVQ 1(DX)(SI*1), R10 | ||
16505 | MOVQ (DX)(R8*1), R11 | ||
16506 | MOVQ 1(DX)(R8*1), R12 | ||
16507 | SHLQ $0x10, R9 | ||
16508 | IMULQ BX, R9 | ||
16509 | SHRQ $0x32, R9 | ||
16510 | SHLQ $0x20, R10 | ||
16511 | IMULQ DI, R10 | ||
16512 | SHRQ $0x34, R10 | ||
16513 | SHLQ $0x10, R11 | ||
16514 | IMULQ BX, R11 | ||
16515 | SHRQ $0x32, R11 | ||
16516 | SHLQ $0x20, R12 | ||
16517 | IMULQ DI, R12 | ||
16518 | SHRQ $0x34, R12 | ||
16519 | LEAQ 1(SI), DI | ||
16520 | LEAQ 1(R8), R13 | ||
16521 | MOVL SI, 24(SP)(R9*4) | ||
16522 | MOVL R8, 24(SP)(R11*4) | ||
16523 | MOVL DI, 65560(SP)(R10*4) | ||
16524 | MOVL R13, 65560(SP)(R12*4) | ||
16525 | LEAQ 1(R8)(SI*1), DI | ||
16526 | SHRQ $0x01, DI | ||
16527 | ADDQ $0x01, SI | ||
16528 | SUBQ $0x01, R8 | ||
16529 | |||
16530 | index_loop_encodeSnappyBetterBlockAsm12B: | ||
16531 | CMPQ DI, R8 | ||
16532 | JAE search_loop_encodeSnappyBetterBlockAsm12B | ||
16533 | MOVQ (DX)(SI*1), R9 | ||
16534 | MOVQ (DX)(DI*1), R10 | ||
16535 | SHLQ $0x10, R9 | ||
16536 | IMULQ BX, R9 | ||
16537 | SHRQ $0x32, R9 | ||
16538 | SHLQ $0x10, R10 | ||
16539 | IMULQ BX, R10 | ||
16540 | SHRQ $0x32, R10 | ||
16541 | MOVL SI, 24(SP)(R9*4) | ||
16542 | MOVL DI, 24(SP)(R10*4) | ||
16543 | ADDQ $0x02, SI | ||
16544 | ADDQ $0x02, DI | ||
16545 | JMP index_loop_encodeSnappyBetterBlockAsm12B | ||
16546 | |||
16547 | emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16548 | MOVQ src_len+32(FP), CX | ||
16549 | SUBL 12(SP), CX | ||
16550 | LEAQ 3(AX)(CX*1), CX | ||
16551 | CMPQ CX, (SP) | ||
16552 | JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B | ||
16553 | MOVQ $0x00000000, ret+48(FP) | ||
16554 | RET | ||
16555 | |||
16556 | emit_remainder_ok_encodeSnappyBetterBlockAsm12B: | ||
16557 | MOVQ src_len+32(FP), CX | ||
16558 | MOVL 12(SP), BX | ||
16559 | CMPL BX, CX | ||
16560 | JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16561 | MOVL CX, SI | ||
16562 | MOVL CX, 12(SP) | ||
16563 | LEAQ (DX)(BX*1), CX | ||
16564 | SUBL BX, SI | ||
16565 | LEAL -1(SI), DX | ||
16566 | CMPL DX, $0x3c | ||
16567 | JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16568 | CMPL DX, $0x00000100 | ||
16569 | JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16570 | JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16571 | |||
16572 | three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16573 | MOVB $0xf4, (AX) | ||
16574 | MOVW DX, 1(AX) | ||
16575 | ADDQ $0x03, AX | ||
16576 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16577 | |||
16578 | two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16579 | MOVB $0xf0, (AX) | ||
16580 | MOVB DL, 1(AX) | ||
16581 | ADDQ $0x02, AX | ||
16582 | CMPL DX, $0x40 | ||
16583 | JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16584 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16585 | |||
16586 | one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16587 | SHLB $0x02, DL | ||
16588 | MOVB DL, (AX) | ||
16589 | ADDQ $0x01, AX | ||
16590 | |||
16591 | memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16592 | LEAQ (AX)(SI*1), DX | ||
16593 | MOVL SI, BX | ||
16594 | |||
16595 | // genMemMoveShort | ||
16596 | CMPQ BX, $0x03 | ||
16597 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 | ||
16598 | JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 | ||
16599 | CMPQ BX, $0x08 | ||
16600 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 | ||
16601 | CMPQ BX, $0x10 | ||
16602 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 | ||
16603 | CMPQ BX, $0x20 | ||
16604 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 | ||
16605 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 | ||
16606 | |||
16607 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: | ||
16608 | MOVB (CX), SI | ||
16609 | MOVB -1(CX)(BX*1), CL | ||
16610 | MOVB SI, (AX) | ||
16611 | MOVB CL, -1(AX)(BX*1) | ||
16612 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16613 | |||
16614 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: | ||
16615 | MOVW (CX), SI | ||
16616 | MOVB 2(CX), CL | ||
16617 | MOVW SI, (AX) | ||
16618 | MOVB CL, 2(AX) | ||
16619 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16620 | |||
16621 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: | ||
16622 | MOVL (CX), SI | ||
16623 | MOVL -4(CX)(BX*1), CX | ||
16624 | MOVL SI, (AX) | ||
16625 | MOVL CX, -4(AX)(BX*1) | ||
16626 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16627 | |||
16628 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: | ||
16629 | MOVQ (CX), SI | ||
16630 | MOVQ -8(CX)(BX*1), CX | ||
16631 | MOVQ SI, (AX) | ||
16632 | MOVQ CX, -8(AX)(BX*1) | ||
16633 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16634 | |||
16635 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: | ||
16636 | MOVOU (CX), X0 | ||
16637 | MOVOU -16(CX)(BX*1), X1 | ||
16638 | MOVOU X0, (AX) | ||
16639 | MOVOU X1, -16(AX)(BX*1) | ||
16640 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16641 | |||
16642 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: | ||
16643 | MOVOU (CX), X0 | ||
16644 | MOVOU 16(CX), X1 | ||
16645 | MOVOU -32(CX)(BX*1), X2 | ||
16646 | MOVOU -16(CX)(BX*1), X3 | ||
16647 | MOVOU X0, (AX) | ||
16648 | MOVOU X1, 16(AX) | ||
16649 | MOVOU X2, -32(AX)(BX*1) | ||
16650 | MOVOU X3, -16(AX)(BX*1) | ||
16651 | |||
16652 | memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16653 | MOVQ DX, AX | ||
16654 | JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B | ||
16655 | |||
16656 | memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16657 | LEAQ (AX)(SI*1), DX | ||
16658 | MOVL SI, BX | ||
16659 | |||
16660 | // genMemMoveLong | ||
16661 | MOVOU (CX), X0 | ||
16662 | MOVOU 16(CX), X1 | ||
16663 | MOVOU -32(CX)(BX*1), X2 | ||
16664 | MOVOU -16(CX)(BX*1), X3 | ||
16665 | MOVQ BX, DI | ||
16666 | SHRQ $0x05, DI | ||
16667 | MOVQ AX, SI | ||
16668 | ANDL $0x0000001f, SI | ||
16669 | MOVQ $0x00000040, R8 | ||
16670 | SUBQ SI, R8 | ||
16671 | DECQ DI | ||
16672 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
16673 | LEAQ -32(CX)(R8*1), SI | ||
16674 | LEAQ -32(AX)(R8*1), R9 | ||
16675 | |||
16676 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: | ||
16677 | MOVOU (SI), X4 | ||
16678 | MOVOU 16(SI), X5 | ||
16679 | MOVOA X4, (R9) | ||
16680 | MOVOA X5, 16(R9) | ||
16681 | ADDQ $0x20, R9 | ||
16682 | ADDQ $0x20, SI | ||
16683 | ADDQ $0x20, R8 | ||
16684 | DECQ DI | ||
16685 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back | ||
16686 | |||
16687 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: | ||
16688 | MOVOU -32(CX)(R8*1), X4 | ||
16689 | MOVOU -16(CX)(R8*1), X5 | ||
16690 | MOVOA X4, -32(AX)(R8*1) | ||
16691 | MOVOA X5, -16(AX)(R8*1) | ||
16692 | ADDQ $0x20, R8 | ||
16693 | CMPQ BX, R8 | ||
16694 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 | ||
16695 | MOVOU X0, (AX) | ||
16696 | MOVOU X1, 16(AX) | ||
16697 | MOVOU X2, -32(AX)(BX*1) | ||
16698 | MOVOU X3, -16(AX)(BX*1) | ||
16699 | MOVQ DX, AX | ||
16700 | |||
16701 | emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: | ||
16702 | MOVQ dst_base+0(FP), CX | ||
16703 | SUBQ CX, AX | ||
16704 | MOVQ AX, ret+48(FP) | ||
16705 | RET | ||
16706 | |||
16707 | // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int | |
16708 | // Requires: BMI, SSE2 | |
// Encodes src into dst and stores the number of bytes written in ret.
// Every output-limit check that fails stores 0 in ret and returns early.
// BMI (TZCNTQ) is only used when GOAMD64_v3 is defined; otherwise BSFQ is used.
//
// Stack frame usage, as established by the code below:
//   0(SP)      dst limit pointer: dst_base + (len(src) - 9 - len(src)>>5)
//   8(SP)      search limit: len(src) - 8
//   12(SP)     src offset of the first byte not yet emitted ("nextEmit" below)
//   16(SP)     offset of the most recent match (written here, never read in this function)
//   20(SP)     next search position, used when the current position yields no match
//   24(SP)     table indexed by a 12-bit hash of 6 bytes: 4096 entries * 4 bytes
//   16408(SP)  table indexed by a 10-bit hash of 4 bytes: 1024 entries * 4 bytes
//
// Long-lived registers: AX = dst write pointer, DX = src base, CX = current src offset ("s").
16709 | TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 | |
16710 | MOVQ dst_base+0(FP), AX | |
// 0xa0 iterations * 128 bytes = 20480 bytes, i.e. both hash tables (16384 + 4096).
16711 | MOVQ $0x000000a0, CX | |
16712 | LEAQ 24(SP), DX | |
16713 | PXOR X0, X0 | |
16714 | |
// Clear both hash tables, 128 bytes per iteration.
16715 | zero_loop_encodeSnappyBetterBlockAsm10B: | |
16716 | MOVOU X0, (DX) | |
16717 | MOVOU X0, 16(DX) | |
16718 | MOVOU X0, 32(DX) | |
16719 | MOVOU X0, 48(DX) | |
16720 | MOVOU X0, 64(DX) | |
16721 | MOVOU X0, 80(DX) | |
16722 | MOVOU X0, 96(DX) | |
16723 | MOVOU X0, 112(DX) | |
16724 | ADDQ $0x80, DX | |
16725 | DECQ CX | |
16726 | JNZ zero_loop_encodeSnappyBetterBlockAsm10B | |
// nextEmit = 0.
16727 | MOVL $0x00000000, 12(SP) | |
16728 | MOVQ src_len+32(FP), CX | |
16729 | LEAQ -9(CX), DX | |
16730 | LEAQ -8(CX), BX | |
// Search limit = len(src) - 8.
16731 | MOVL BX, 8(SP) | |
// dst limit = dst_base + (len(src) - 9) - (len(src) >> 5).
16732 | SHRQ $0x05, CX | |
16733 | SUBL CX, DX | |
16734 | LEAQ (AX)(DX*1), DX | |
16735 | MOVQ DX, (SP) | |
// Start searching at s = 1 with the stored match offset cleared.
16736 | MOVL $0x00000001, CX | |
16737 | MOVL $0x00000000, 16(SP) | |
16738 | MOVQ src_base+24(FP), DX | |
16739 | |
// Main search loop: look up candidates for the 8 bytes at src[s] in both tables.
16740 | search_loop_encodeSnappyBetterBlockAsm10B: | |
// nextS = s + 1 + ((s - nextEmit) >> 5): the step grows with the distance since the last emit.
16741 | MOVL CX, BX | |
16742 | SUBL 12(SP), BX | |
16743 | SHRL $0x05, BX | |
16744 | LEAL 1(CX)(BX*1), BX | |
16745 | CMPL BX, 8(SP) | |
16746 | JAE emit_remainder_encodeSnappyBetterBlockAsm10B | |
// SI = 8 bytes at src[s]; nextS is saved in 20(SP).
16747 | MOVQ (DX)(CX*1), SI | |
16748 | MOVL BX, 20(SP) | |
16749 | MOVQ $0x0000cf1bbcdcbf9b, R8 | |
16750 | MOVQ $0x9e3779b1, BX | |
16751 | MOVQ SI, R9 | |
16752 | MOVQ SI, R10 | |
// R9 = 12-bit hash of the low 6 bytes (the shift by 16 discards the top two bytes).
16753 | SHLQ $0x10, R9 | |
16754 | IMULQ R8, R9 | |
16755 | SHRQ $0x34, R9 | |
// R10 = 10-bit hash of the low 4 bytes.
16756 | SHLQ $0x20, R10 | |
16757 | IMULQ BX, R10 | |
16758 | SHRQ $0x36, R10 | |
// BX = candidate from the 6-byte-hash table, DI = candidate from the 4-byte-hash table;
// both table slots are then overwritten with s.
16759 | MOVL 24(SP)(R9*4), BX | |
16760 | MOVL 16408(SP)(R10*4), DI | |
16761 | MOVL CX, 24(SP)(R9*4) | |
16762 | MOVL CX, 16408(SP)(R10*4) | |
// Load 8 bytes at each candidate and prefer a full 8-byte match.
16763 | MOVQ (DX)(BX*1), R9 | |
16764 | MOVQ (DX)(DI*1), R10 | |
16765 | CMPQ R9, SI | |
16766 | JEQ candidate_match_encodeSnappyBetterBlockAsm10B | |
16767 | CMPQ R10, SI | |
16768 | JNE no_short_found_encodeSnappyBetterBlockAsm10B | |
16769 | MOVL DI, BX | |
16770 | JMP candidate_match_encodeSnappyBetterBlockAsm10B | |
16771 | |
// No 8-byte match: fall back to comparing only the low 4 bytes.
16772 | no_short_found_encodeSnappyBetterBlockAsm10B: | |
16773 | CMPL R9, SI | |
16774 | JEQ candidate_match_encodeSnappyBetterBlockAsm10B | |
16775 | CMPL R10, SI | |
16776 | JEQ candidateS_match_encodeSnappyBetterBlockAsm10B | |
// Nothing matched: continue at nextS.
16777 | MOVL 20(SP), CX | |
16778 | JMP search_loop_encodeSnappyBetterBlockAsm10B | |
16779 | |
// Only the 4-byte-hash candidate matched. Before accepting it, probe the
// 6-byte-hash table for the bytes at s+1 (SI >> 8) and prefer that candidate
// if its first 4 bytes match; otherwise restore s and use the DI candidate.
16780 | candidateS_match_encodeSnappyBetterBlockAsm10B: | |
16781 | SHRQ $0x08, SI | |
16782 | MOVQ SI, R9 | |
16783 | SHLQ $0x10, R9 | |
16784 | IMULQ R8, R9 | |
16785 | SHRQ $0x34, R9 | |
16786 | MOVL 24(SP)(R9*4), BX | |
16787 | INCL CX | |
16788 | MOVL CX, 24(SP)(R9*4) | |
16789 | CMPL (DX)(BX*1), SI | |
16790 | JEQ candidate_match_encodeSnappyBetterBlockAsm10B | |
16791 | DECL CX | |
16792 | MOVL DI, BX | |
16793 | |
// A candidate (BX) matches at s (CX). Extend the match backwards while the
// preceding bytes are equal, s stays above nextEmit and the candidate stays above 0.
16794 | candidate_match_encodeSnappyBetterBlockAsm10B: | |
16795 | MOVL 12(SP), SI | |
16796 | TESTL BX, BX | |
16797 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B | |
16798 | |
16799 | match_extend_back_loop_encodeSnappyBetterBlockAsm10B: | |
16800 | CMPL CX, SI | |
16801 | JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B | |
16802 | MOVB -1(DX)(BX*1), DI | |
16803 | MOVB -1(DX)(CX*1), R8 | |
16804 | CMPB DI, R8 | |
16805 | JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B | |
16806 | LEAL -1(CX), CX | |
16807 | DECL BX | |
16808 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B | |
16809 | JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B | |
16810 | |
// Output check: dst + (s - nextEmit) + 3 must stay below the dst limit, else return 0.
16811 | match_extend_back_end_encodeSnappyBetterBlockAsm10B: | |
16812 | MOVL CX, SI | |
16813 | SUBL 12(SP), SI | |
16814 | LEAQ 3(AX)(SI*1), SI | |
16815 | CMPQ SI, (SP) | |
16816 | JB match_dst_size_check_encodeSnappyBetterBlockAsm10B | |
16817 | MOVQ $0x00000000, ret+48(FP) | |
16818 | RET | |
16819 | |
// SI keeps the match start. s and the candidate are advanced by 4 before the
// match length is measured; DI = bytes of src remaining after s.
16820 | match_dst_size_check_encodeSnappyBetterBlockAsm10B: | |
16821 | MOVL CX, SI | |
16822 | ADDL $0x04, CX | |
16823 | ADDL $0x04, BX | |
16824 | MOVQ src_len+32(FP), DI | |
16825 | SUBL CX, DI | |
16826 | LEAQ (DX)(CX*1), R8 | |
16827 | LEAQ (DX)(BX*1), R9 | |
16828 | |
16829 | // matchLen | |
// R11 accumulates the number of equal bytes between (R8) and (R9), bounded by DI.
16830 | XORL R11, R11 | |
16831 | |
// Compare 16 bytes per iteration while at least 16 bytes remain.
16832 | matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B: | |
16833 | CMPL DI, $0x10 | |
16834 | JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B | |
16835 | MOVQ (R8)(R11*1), R10 | |
16836 | MOVQ 8(R8)(R11*1), R12 | |
16837 | XORQ (R9)(R11*1), R10 | |
16838 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B | |
16839 | XORQ 8(R9)(R11*1), R12 | |
16840 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B | |
16841 | LEAL -16(DI), DI | |
16842 | LEAL 16(R11), R11 | |
16843 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B | |
16844 | |
// Mismatch in the second 8-byte word: R12 is non-zero here, so the index of its
// lowest set bit divided by 8 is the count of equal bytes in that word; add 8 for the first word.
16845 | matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B: | |
16846 | #ifdef GOAMD64_v3 | |
16847 | TZCNTQ R12, R12 | |
16848 | |
16849 | #else | |
16850 | BSFQ R12, R12 | |
16851 | |
16852 | #endif | |
16853 | SARQ $0x03, R12 | |
16854 | LEAL 8(R11)(R12*1), R11 | |
16855 | JMP match_nolit_end_encodeSnappyBetterBlockAsm10B | |
16856 | |
// Fewer than 16 bytes remain: try a single 8-byte comparison.
16857 | matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B: | |
16858 | CMPL DI, $0x08 | |
16859 | JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B | |
16860 | MOVQ (R8)(R11*1), R10 | |
16861 | XORQ (R9)(R11*1), R10 | |
16862 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B | |
16863 | LEAL -8(DI), DI | |
16864 | LEAL 8(R11), R11 | |
16865 | JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B | |
16866 | |
// Mismatch in an 8-byte word: lowest set bit index / 8 = number of equal bytes.
16867 | matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B: | |
16868 | #ifdef GOAMD64_v3 | |
16869 | TZCNTQ R10, R10 | |
16870 | |
16871 | #else | |
16872 | BSFQ R10, R10 | |
16873 | |
16874 | #endif | |
16875 | SARQ $0x03, R10 | |
16876 | LEAL (R11)(R10*1), R11 | |
16877 | JMP match_nolit_end_encodeSnappyBetterBlockAsm10B | |
16878 | |
// Tail handling: 4 bytes, then 2 bytes, then 1 byte.
16879 | matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: | |
16880 | CMPL DI, $0x04 | |
16881 | JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B | |
16882 | MOVL (R8)(R11*1), R10 | |
16883 | CMPL (R9)(R11*1), R10 | |
16884 | JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B | |
16885 | LEAL -4(DI), DI | |
16886 | LEAL 4(R11), R11 | |
16887 | |
16888 | matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: | |
// DI == 1 goes to the single-byte test; DI == 0 ends the measurement.
16889 | CMPL DI, $0x01 | |
16890 | JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B | |
16891 | JB match_nolit_end_encodeSnappyBetterBlockAsm10B | |
16892 | MOVW (R8)(R11*1), R10 | |
16893 | CMPW (R9)(R11*1), R10 | |
16894 | JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B | |
16895 | LEAL 2(R11), R11 | |
16896 | SUBL $0x02, DI | |
16897 | JZ match_nolit_end_encodeSnappyBetterBlockAsm10B | |
16898 | |
16899 | matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: | |
16900 | MOVB (R8)(R11*1), R10 | |
16901 | CMPB (R9)(R11*1), R10 | |
16902 | JNE match_nolit_end_encodeSnappyBetterBlockAsm10B | |
16903 | LEAL 1(R11), R11 | |
16904 | |
// DI = match offset (s - candidate).
16905 | match_nolit_end_encodeSnappyBetterBlockAsm10B: | |
16906 | MOVL CX, DI | |
16907 | SUBL BX, DI | |
16908 | |
16909 | // Check if repeat | |
// The offset is only recorded in 16(SP); no comparison against it follows in this function.
16910 | MOVL DI, 16(SP) | |
// Emit pending literals src[nextEmit:matchStart] unless that range is empty.
16911 | MOVL 12(SP), BX | |
16912 | CMPL BX, SI | |
16913 | JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B | |
// R8 = literal length, R9 = literal source pointer, BX = length - 1 for the tag.
16914 | MOVL SI, R8 | |
16915 | MOVL SI, 12(SP) | |
16916 | LEAQ (DX)(BX*1), R9 | |
16917 | SUBL BX, R8 | |
16918 | LEAL -1(R8), BX | |
16919 | CMPL BX, $0x3c | |
16920 | JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B | |
16921 | CMPL BX, $0x00000100 | |
16922 | JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B | |
// This branch targets the label that immediately follows, so control reaches
// three_bytes_match_emit whether or not it is taken.
16923 | JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B | |
16924 | |
// Literal header: tag 0xf4 (61<<2) followed by a 16-bit (length - 1).
16925 | three_bytes_match_emit_encodeSnappyBetterBlockAsm10B: | |
16926 | MOVB $0xf4, (AX) | |
16927 | MOVW BX, 1(AX) | |
16928 | ADDQ $0x03, AX | |
16929 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B | |
16930 | |
// Literal header: tag 0xf0 (60<<2) followed by an 8-bit (length - 1).
16931 | two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: | |
16932 | MOVB $0xf0, (AX) | |
16933 | MOVB BL, 1(AX) | |
16934 | ADDQ $0x02, AX | |
// length - 1 < 64 uses the short copy, anything larger the long copy.
16935 | CMPL BX, $0x40 | |
16936 | JB memmove_match_emit_encodeSnappyBetterBlockAsm10B | |
16937 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B | |
16938 | |
// Literal header: single tag byte (length - 1) << 2.
16939 | one_byte_match_emit_encodeSnappyBetterBlockAsm10B: | |
16940 | SHLB $0x02, BL | |
16941 | MOVB BL, (AX) | |
16942 | ADDQ $0x01, AX | |
16943 | |
// BX = dst pointer after the literal bytes.
16944 | memmove_match_emit_encodeSnappyBetterBlockAsm10B: | |
16945 | LEAQ (AX)(R8*1), BX | |
16946 | |
16947 | // genMemMoveShort | |
// Dispatch on the literal length in R8 (at most 64 on this path).
16948 | CMPQ R8, $0x08 | |
16949 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 | |
16950 | CMPQ R8, $0x10 | |
16951 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 | |
16952 | CMPQ R8, $0x20 | |
16953 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 | |
16954 | JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 | |
16955 | |
// Lengths up to 8: a full 8 bytes are copied regardless of the exact length.
16956 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: | |
16957 | MOVQ (R9), R10 | |
16958 | MOVQ R10, (AX) | |
16959 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B | |
16960 | |
// Two possibly overlapping 8-byte moves: the first and the last 8 bytes.
16961 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: | |
16962 | MOVQ (R9), R10 | |
16963 | MOVQ -8(R9)(R8*1), R9 | |
16964 | MOVQ R10, (AX) | |
16965 | MOVQ R9, -8(AX)(R8*1) | |
16966 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B | |
16967 | |
// Two possibly overlapping 16-byte moves: the first and the last 16 bytes.
16968 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: | |
16969 | MOVOU (R9), X0 | |
16970 | MOVOU -16(R9)(R8*1), X1 | |
16971 | MOVOU X0, (AX) | |
16972 | MOVOU X1, -16(AX)(R8*1) | |
16973 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B | |
16974 | |
// Four 16-byte moves: the first 32 and the last 32 bytes, possibly overlapping.
16975 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: | |
16976 | MOVOU (R9), X0 | |
16977 | MOVOU 16(R9), X1 | |
16978 | MOVOU -32(R9)(R8*1), X2 | |
16979 | MOVOU -16(R9)(R8*1), X3 | |
16980 | MOVOU X0, (AX) | |
16981 | MOVOU X1, 16(AX) | |
16982 | MOVOU X2, -32(AX)(R8*1) | |
16983 | MOVOU X3, -16(AX)(R8*1) | |
16984 | |
// Advance the dst pointer past the copied literal.
16985 | memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: | |
16986 | MOVQ BX, AX | |
16987 | JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B | |
16988 | |
16989 | memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: | |
16990 | LEAQ (AX)(R8*1), BX | |
16991 | |
16992 | // genMemMoveLong | |
// The first and last 32 bytes are loaded into X0-X3 up front and stored last,
// so the aligned loop in between does not have to cover the unaligned edges.
16993 | MOVOU (R9), X0 | |
16994 | MOVOU 16(R9), X1 | |
16995 | MOVOU -32(R9)(R8*1), X2 | |
16996 | MOVOU -16(R9)(R8*1), X3 | |
// R12 = length / 32; R13 = 64 - (dst & 31), which makes dst + R13 - 32 a
// 32-byte aligned address for the MOVOA stores below.
16997 | MOVQ R8, R12 | |
16998 | SHRQ $0x05, R12 | |
16999 | MOVQ AX, R10 | |
17000 | ANDL $0x0000001f, R10 | |
17001 | MOVQ $0x00000040, R13 | |
17002 | SUBQ R10, R13 | |
17003 | DECQ R12 | |
17004 | JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 | |
17005 | LEAQ -32(R9)(R13*1), R10 | |
17006 | LEAQ -32(AX)(R13*1), R14 | |
17007 | |
// Copies 32 bytes per iteration through the R10 (src) and R14 (dst) pointers.
17008 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: | |
17009 | MOVOU (R10), X4 | |
17010 | MOVOU 16(R10), X5 | |
17011 | MOVOA X4, (R14) | |
17012 | MOVOA X5, 16(R14) | |
17013 | ADDQ $0x20, R14 | |
17014 | ADDQ $0x20, R10 | |
17015 | ADDQ $0x20, R13 | |
17016 | DECQ R12 | |
17017 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back | |
17018 | |
// Copies 32 bytes at offset R13 - 32 and advances R13 by 32 while length >= R13.
17019 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: | |
17020 | MOVOU -32(R9)(R13*1), X4 | |
17021 | MOVOU -16(R9)(R13*1), X5 | |
17022 | MOVOA X4, -32(AX)(R13*1) | |
17023 | MOVOA X5, -16(AX)(R13*1) | |
17024 | ADDQ $0x20, R13 | |
17025 | CMPQ R8, R13 | |
17026 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 | |
// Store the saved head and tail, then advance the dst pointer.
17027 | MOVOU X0, (AX) | |
17028 | MOVOU X1, 16(AX) | |
17029 | MOVOU X2, -32(AX)(R8*1) | |
17030 | MOVOU X3, -16(AX)(R8*1) | |
17031 | MOVQ BX, AX | |
17032 | |
// s moves to the end of the match, R11 becomes the full match length (the 4
// bytes skipped before matchLen are added back), and nextEmit = s.
17033 | emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: | |
17034 | ADDL R11, CX | |
17035 | ADDL $0x04, R11 | |
17036 | MOVL CX, 12(SP) | |
17037 | |
17038 | // emitCopy | |
// While the length exceeds 64, emit a 3-byte copy (tag 0xee plus the 16-bit
// offset in DI) and reduce the remaining length by 60.
17039 | two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: | |
17040 | CMPL R11, $0x40 | |
17041 | JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B | |
17042 | MOVB $0xee, (AX) | |
17043 | MOVW DI, 1(AX) | |
17044 | LEAL -60(R11), R11 | |
17045 | ADDQ $0x03, AX | |
17046 | JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B | |
17047 | |
// Remaining length <= 64. BX = length << 2. The 2-byte form is used only when
// length < 12 and offset < 2048; otherwise the 3-byte form is emitted.
17048 | two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: | |
17049 | MOVL R11, BX | |
17050 | SHLL $0x02, BX | |
17051 | CMPL R11, $0x0c | |
17052 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B | |
17053 | CMPL DI, $0x00000800 | |
17054 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B | |
// 2-byte copy: tag = ((length - 4) << 2 | 1) | (offset >> 8) << 5, then the low offset byte.
17055 | LEAL -15(BX), BX | |
17056 | MOVB DI, 1(AX) | |
17057 | SHRL $0x08, DI | |
17058 | SHLL $0x05, DI | |
17059 | ORL DI, BX | |
17060 | MOVB BL, (AX) | |
17061 | ADDQ $0x02, AX | |
17062 | JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B | |
17063 | |
// 3-byte copy: tag = (length - 1) << 2 | 2, followed by the 16-bit offset.
17064 | emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: | |
17065 | LEAL -2(BX), BX | |
17066 | MOVB BL, (AX) | |
17067 | MOVW DI, 1(AX) | |
17068 | ADDQ $0x03, AX | |
17069 | |
// Stop searching once s reaches the search limit; return 0 if dst reached its limit.
17070 | match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: | |
17071 | CMPL CX, 8(SP) | |
17072 | JAE emit_remainder_encodeSnappyBetterBlockAsm10B | |
17073 | CMPQ AX, (SP) | |
17074 | JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B | |
17075 | MOVQ $0x00000000, ret+48(FP) | |
17076 | RET | |
17077 | |
// Index positions inside the match just emitted. SI = matchStart + 1 and
// R8 = s - 2 go into the 6-byte-hash table; SI + 1 and R8 + 1 go into the
// 4-byte-hash table.
17078 | match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: | |
17079 | MOVQ $0x0000cf1bbcdcbf9b, BX | |
17080 | MOVQ $0x9e3779b1, DI | |
17081 | LEAQ 1(SI), SI | |
17082 | LEAQ -2(CX), R8 | |
17083 | MOVQ (DX)(SI*1), R9 | |
17084 | MOVQ 1(DX)(SI*1), R10 | |
17085 | MOVQ (DX)(R8*1), R11 | |
17086 | MOVQ 1(DX)(R8*1), R12 | |
17087 | SHLQ $0x10, R9 | |
17088 | IMULQ BX, R9 | |
17089 | SHRQ $0x34, R9 | |
17090 | SHLQ $0x20, R10 | |
17091 | IMULQ DI, R10 | |
17092 | SHRQ $0x36, R10 | |
17093 | SHLQ $0x10, R11 | |
17094 | IMULQ BX, R11 | |
17095 | SHRQ $0x34, R11 | |
17096 | SHLQ $0x20, R12 | |
17097 | IMULQ DI, R12 | |
17098 | SHRQ $0x36, R12 | |
17099 | LEAQ 1(SI), DI | |
17100 | LEAQ 1(R8), R13 | |
17101 | MOVL SI, 24(SP)(R9*4) | |
17102 | MOVL R8, 24(SP)(R11*4) | |
17103 | MOVL DI, 16408(SP)(R10*4) | |
17104 | MOVL R13, 16408(SP)(R12*4) | |
// DI = midpoint (SI + R8 + 1) >> 1; then SI moves forward by one and R8 back by one.
17105 | LEAQ 1(R8)(SI*1), DI | |
17106 | SHRQ $0x01, DI | |
17107 | ADDQ $0x01, SI | |
17108 | SUBQ $0x01, R8 | |
17109 | |
// While DI < R8, add every second position starting at SI and at DI to the
// 6-byte-hash table, then resume the search loop.
17110 | index_loop_encodeSnappyBetterBlockAsm10B: | |
17111 | CMPQ DI, R8 | |
17112 | JAE search_loop_encodeSnappyBetterBlockAsm10B | |
17113 | MOVQ (DX)(SI*1), R9 | |
17114 | MOVQ (DX)(DI*1), R10 | |
17115 | SHLQ $0x10, R9 | |
17116 | IMULQ BX, R9 | |
17117 | SHRQ $0x34, R9 | |
17118 | SHLQ $0x10, R10 | |
17119 | IMULQ BX, R10 | |
17120 | SHRQ $0x34, R10 | |
17121 | MOVL SI, 24(SP)(R9*4) | |
17122 | MOVL DI, 24(SP)(R10*4) | |
17123 | ADDQ $0x02, SI | |
17124 | ADDQ $0x02, DI | |
17125 | JMP index_loop_encodeSnappyBetterBlockAsm10B | |
17126 | |
// Emit src[nextEmit:] as one final literal. First check that
// dst + (len(src) - nextEmit) + 3 stays below the dst limit, else return 0.
17127 | emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17128 | MOVQ src_len+32(FP), CX | |
17129 | SUBL 12(SP), CX | |
17130 | LEAQ 3(AX)(CX*1), CX | |
17131 | CMPQ CX, (SP) | |
17132 | JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B | |
17133 | MOVQ $0x00000000, ret+48(FP) | |
17134 | RET | |
17135 | |
17136 | emit_remainder_ok_encodeSnappyBetterBlockAsm10B: | |
17137 | MOVQ src_len+32(FP), CX | |
17138 | MOVL 12(SP), BX | |
// Nothing left to emit when nextEmit == len(src).
17139 | CMPL BX, CX | |
17140 | JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B | |
// From here CX = literal source pointer, SI = literal length and DX = length - 1
// (the src base in DX is overwritten).
17141 | MOVL CX, SI | |
17142 | MOVL CX, 12(SP) | |
17143 | LEAQ (DX)(BX*1), CX | |
17144 | SUBL BX, SI | |
17145 | LEAL -1(SI), DX | |
17146 | CMPL DX, $0x3c | |
17147 | JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17148 | CMPL DX, $0x00000100 | |
17149 | JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B | |
// This branch targets the label that immediately follows, so control reaches
// three_bytes_emit_remainder whether or not it is taken.
17150 | JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17151 | |
// Literal header: tag 0xf4 (61<<2) followed by a 16-bit (length - 1).
17152 | three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17153 | MOVB $0xf4, (AX) | |
17154 | MOVW DX, 1(AX) | |
17155 | ADDQ $0x03, AX | |
17156 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17157 | |
// Literal header: tag 0xf0 (60<<2) followed by an 8-bit (length - 1).
17158 | two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17159 | MOVB $0xf0, (AX) | |
17160 | MOVB DL, 1(AX) | |
17161 | ADDQ $0x02, AX | |
17162 | CMPL DX, $0x40 | |
17163 | JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17164 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17165 | |
// Literal header: single tag byte (length - 1) << 2.
17166 | one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17167 | SHLB $0x02, DL | |
17168 | MOVB DL, (AX) | |
17169 | ADDQ $0x01, AX | |
17170 | |
// DX = dst pointer after the literal, BX = literal length.
17171 | memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17172 | LEAQ (AX)(SI*1), DX | |
17173 | MOVL SI, BX | |
17174 | |
17175 | // genMemMoveShort | |
// Unlike the match-emit copy above, every size class here copies exactly BX bytes.
17176 | CMPQ BX, $0x03 | |
17177 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 | |
17178 | JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 | |
17179 | CMPQ BX, $0x08 | |
17180 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 | |
17181 | CMPQ BX, $0x10 | |
17182 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 | |
17183 | CMPQ BX, $0x20 | |
17184 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 | |
17185 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 | |
17186 | |
// First byte and last byte (the same byte when the length is 1).
17187 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: | |
17188 | MOVB (CX), SI | |
17189 | MOVB -1(CX)(BX*1), CL | |
17190 | MOVB SI, (AX) | |
17191 | MOVB CL, -1(AX)(BX*1) | |
17192 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17193 | |
17194 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: | |
17195 | MOVW (CX), SI | |
17196 | MOVB 2(CX), CL | |
17197 | MOVW SI, (AX) | |
17198 | MOVB CL, 2(AX) | |
17199 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17200 | |
// First 4 and last 4 bytes, possibly overlapping.
17201 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: | |
17202 | MOVL (CX), SI | |
17203 | MOVL -4(CX)(BX*1), CX | |
17204 | MOVL SI, (AX) | |
17205 | MOVL CX, -4(AX)(BX*1) | |
17206 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17207 | |
// First 8 and last 8 bytes, possibly overlapping.
17208 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: | |
17209 | MOVQ (CX), SI | |
17210 | MOVQ -8(CX)(BX*1), CX | |
17211 | MOVQ SI, (AX) | |
17212 | MOVQ CX, -8(AX)(BX*1) | |
17213 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17214 | |
// First 16 and last 16 bytes, possibly overlapping.
17215 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: | |
17216 | MOVOU (CX), X0 | |
17217 | MOVOU -16(CX)(BX*1), X1 | |
17218 | MOVOU X0, (AX) | |
17219 | MOVOU X1, -16(AX)(BX*1) | |
17220 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17221 | |
// First 32 and last 32 bytes, possibly overlapping.
17222 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: | |
17223 | MOVOU (CX), X0 | |
17224 | MOVOU 16(CX), X1 | |
17225 | MOVOU -32(CX)(BX*1), X2 | |
17226 | MOVOU -16(CX)(BX*1), X3 | |
17227 | MOVOU X0, (AX) | |
17228 | MOVOU X1, 16(AX) | |
17229 | MOVOU X2, -32(AX)(BX*1) | |
17230 | MOVOU X3, -16(AX)(BX*1) | |
17231 | |
// Advance the dst pointer past the copied literal.
17232 | memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17233 | MOVQ DX, AX | |
17234 | JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B | |
17235 | |
17236 | memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17237 | LEAQ (AX)(SI*1), DX | |
17238 | MOVL SI, BX | |
17239 | |
17240 | // genMemMoveLong | |
// Same scheme as the long copy in the match-emit path: head and tail are
// loaded into X0-X3 first and stored last, with aligned 32-byte stores between.
17241 | MOVOU (CX), X0 | |
17242 | MOVOU 16(CX), X1 | |
17243 | MOVOU -32(CX)(BX*1), X2 | |
17244 | MOVOU -16(CX)(BX*1), X3 | |
// DI = length / 32; R8 = 64 - (dst & 31), so dst + R8 - 32 is 32-byte aligned.
17245 | MOVQ BX, DI | |
17246 | SHRQ $0x05, DI | |
17247 | MOVQ AX, SI | |
17248 | ANDL $0x0000001f, SI | |
17249 | MOVQ $0x00000040, R8 | |
17250 | SUBQ SI, R8 | |
17251 | DECQ DI | |
17252 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 | |
17253 | LEAQ -32(CX)(R8*1), SI | |
17254 | LEAQ -32(AX)(R8*1), R9 | |
17255 | |
// Copies 32 bytes per iteration through the SI (src) and R9 (dst) pointers.
17256 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: | |
17257 | MOVOU (SI), X4 | |
17258 | MOVOU 16(SI), X5 | |
17259 | MOVOA X4, (R9) | |
17260 | MOVOA X5, 16(R9) | |
17261 | ADDQ $0x20, R9 | |
17262 | ADDQ $0x20, SI | |
17263 | ADDQ $0x20, R8 | |
17264 | DECQ DI | |
17265 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back | |
17266 | |
// Copies 32 bytes at offset R8 - 32 and advances R8 by 32 while length >= R8.
17267 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: | |
17268 | MOVOU -32(CX)(R8*1), X4 | |
17269 | MOVOU -16(CX)(R8*1), X5 | |
17270 | MOVOA X4, -32(AX)(R8*1) | |
17271 | MOVOA X5, -16(AX)(R8*1) | |
17272 | ADDQ $0x20, R8 | |
17273 | CMPQ BX, R8 | |
17274 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 | |
// Store the saved head and tail, then advance the dst pointer.
17275 | MOVOU X0, (AX) | |
17276 | MOVOU X1, 16(AX) | |
17277 | MOVOU X2, -32(AX)(BX*1) | |
17278 | MOVOU X3, -16(AX)(BX*1) | |
17279 | MOVQ DX, AX | |
17280 | |
// Return value = number of bytes written: dst write pointer - dst_base.
17281 | emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: | |
17282 | MOVQ dst_base+0(FP), CX | |
17283 | SUBQ CX, AX | |
17284 | MOVQ AX, ret+48(FP) | |
17285 | RET | |
17286 | |||
17287 | // func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int | ||
17288 | // Requires: BMI, SSE2 | ||
17289 | TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 | ||
17290 | MOVQ dst_base+0(FP), AX | ||
17291 | MOVQ $0x00000028, CX | ||
17292 | LEAQ 24(SP), DX | ||
17293 | PXOR X0, X0 | ||
17294 | |||
17295 | zero_loop_encodeSnappyBetterBlockAsm8B: | ||
17296 | MOVOU X0, (DX) | ||
17297 | MOVOU X0, 16(DX) | ||
17298 | MOVOU X0, 32(DX) | ||
17299 | MOVOU X0, 48(DX) | ||
17300 | MOVOU X0, 64(DX) | ||
17301 | MOVOU X0, 80(DX) | ||
17302 | MOVOU X0, 96(DX) | ||
17303 | MOVOU X0, 112(DX) | ||
17304 | ADDQ $0x80, DX | ||
17305 | DECQ CX | ||
17306 | JNZ zero_loop_encodeSnappyBetterBlockAsm8B | ||
17307 | MOVL $0x00000000, 12(SP) | ||
17308 | MOVQ src_len+32(FP), CX | ||
17309 | LEAQ -9(CX), DX | ||
17310 | LEAQ -8(CX), BX | ||
17311 | MOVL BX, 8(SP) | ||
17312 | SHRQ $0x05, CX | ||
17313 | SUBL CX, DX | ||
17314 | LEAQ (AX)(DX*1), DX | ||
17315 | MOVQ DX, (SP) | ||
17316 | MOVL $0x00000001, CX | ||
17317 | MOVL $0x00000000, 16(SP) | ||
17318 | MOVQ src_base+24(FP), DX | ||
17319 | |||
17320 | search_loop_encodeSnappyBetterBlockAsm8B: | ||
17321 | MOVL CX, BX | ||
17322 | SUBL 12(SP), BX | ||
17323 | SHRL $0x04, BX | ||
17324 | LEAL 1(CX)(BX*1), BX | ||
17325 | CMPL BX, 8(SP) | ||
17326 | JAE emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17327 | MOVQ (DX)(CX*1), SI | ||
17328 | MOVL BX, 20(SP) | ||
17329 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
17330 | MOVQ $0x9e3779b1, BX | ||
17331 | MOVQ SI, R9 | ||
17332 | MOVQ SI, R10 | ||
17333 | SHLQ $0x10, R9 | ||
17334 | IMULQ R8, R9 | ||
17335 | SHRQ $0x36, R9 | ||
17336 | SHLQ $0x20, R10 | ||
17337 | IMULQ BX, R10 | ||
17338 | SHRQ $0x38, R10 | ||
17339 | MOVL 24(SP)(R9*4), BX | ||
17340 | MOVL 4120(SP)(R10*4), DI | ||
17341 | MOVL CX, 24(SP)(R9*4) | ||
17342 | MOVL CX, 4120(SP)(R10*4) | ||
17343 | MOVQ (DX)(BX*1), R9 | ||
17344 | MOVQ (DX)(DI*1), R10 | ||
17345 | CMPQ R9, SI | ||
17346 | JEQ candidate_match_encodeSnappyBetterBlockAsm8B | ||
17347 | CMPQ R10, SI | ||
17348 | JNE no_short_found_encodeSnappyBetterBlockAsm8B | ||
17349 | MOVL DI, BX | ||
17350 | JMP candidate_match_encodeSnappyBetterBlockAsm8B | ||
17351 | |||
17352 | no_short_found_encodeSnappyBetterBlockAsm8B: | ||
17353 | CMPL R9, SI | ||
17354 | JEQ candidate_match_encodeSnappyBetterBlockAsm8B | ||
17355 | CMPL R10, SI | ||
17356 | JEQ candidateS_match_encodeSnappyBetterBlockAsm8B | ||
17357 | MOVL 20(SP), CX | ||
17358 | JMP search_loop_encodeSnappyBetterBlockAsm8B | ||
17359 | |||
17360 | candidateS_match_encodeSnappyBetterBlockAsm8B: | ||
17361 | SHRQ $0x08, SI | ||
17362 | MOVQ SI, R9 | ||
17363 | SHLQ $0x10, R9 | ||
17364 | IMULQ R8, R9 | ||
17365 | SHRQ $0x36, R9 | ||
17366 | MOVL 24(SP)(R9*4), BX | ||
17367 | INCL CX | ||
17368 | MOVL CX, 24(SP)(R9*4) | ||
17369 | CMPL (DX)(BX*1), SI | ||
17370 | JEQ candidate_match_encodeSnappyBetterBlockAsm8B | ||
17371 | DECL CX | ||
17372 | MOVL DI, BX | ||
17373 | |||
17374 | candidate_match_encodeSnappyBetterBlockAsm8B: | ||
17375 | MOVL 12(SP), SI | ||
17376 | TESTL BX, BX | ||
17377 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B | ||
17378 | |||
17379 | match_extend_back_loop_encodeSnappyBetterBlockAsm8B: | ||
17380 | CMPL CX, SI | ||
17381 | JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B | ||
17382 | MOVB -1(DX)(BX*1), DI | ||
17383 | MOVB -1(DX)(CX*1), R8 | ||
17384 | CMPB DI, R8 | ||
17385 | JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B | ||
17386 | LEAL -1(CX), CX | ||
17387 | DECL BX | ||
17388 | JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B | ||
17389 | JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B | ||
17390 | |||
17391 | match_extend_back_end_encodeSnappyBetterBlockAsm8B: | ||
17392 | MOVL CX, SI | ||
17393 | SUBL 12(SP), SI | ||
17394 | LEAQ 3(AX)(SI*1), SI | ||
17395 | CMPQ SI, (SP) | ||
17396 | JB match_dst_size_check_encodeSnappyBetterBlockAsm8B | ||
17397 | MOVQ $0x00000000, ret+48(FP) | ||
17398 | RET | ||
17399 | |||
17400 | match_dst_size_check_encodeSnappyBetterBlockAsm8B: | ||
17401 | MOVL CX, SI | ||
17402 | ADDL $0x04, CX | ||
17403 | ADDL $0x04, BX | ||
17404 | MOVQ src_len+32(FP), DI | ||
17405 | SUBL CX, DI | ||
17406 | LEAQ (DX)(CX*1), R8 | ||
17407 | LEAQ (DX)(BX*1), R9 | ||
17408 | |||
17409 | // matchLen | ||
17410 | XORL R11, R11 | ||
17411 | |||
17412 | matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17413 | CMPL DI, $0x10 | ||
17414 | JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17415 | MOVQ (R8)(R11*1), R10 | ||
17416 | MOVQ 8(R8)(R11*1), R12 | ||
17417 | XORQ (R9)(R11*1), R10 | ||
17418 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17419 | XORQ 8(R9)(R11*1), R12 | ||
17420 | JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B | ||
17421 | LEAL -16(DI), DI | ||
17422 | LEAL 16(R11), R11 | ||
17423 | JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17424 | |||
17425 | matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17426 | #ifdef GOAMD64_v3 | ||
17427 | TZCNTQ R12, R12 | ||
17428 | |||
17429 | #else | ||
17430 | BSFQ R12, R12 | ||
17431 | |||
17432 | #endif | ||
17433 | SARQ $0x03, R12 | ||
17434 | LEAL 8(R11)(R12*1), R11 | ||
17435 | JMP match_nolit_end_encodeSnappyBetterBlockAsm8B | ||
17436 | |||
17437 | matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17438 | CMPL DI, $0x08 | ||
17439 | JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17440 | MOVQ (R8)(R11*1), R10 | ||
17441 | XORQ (R9)(R11*1), R10 | ||
17442 | JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17443 | LEAL -8(DI), DI | ||
17444 | LEAL 8(R11), R11 | ||
17445 | JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17446 | |||
17447 | matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17448 | #ifdef GOAMD64_v3 | ||
17449 | TZCNTQ R10, R10 | ||
17450 | |||
17451 | #else | ||
17452 | BSFQ R10, R10 | ||
17453 | |||
17454 | #endif | ||
17455 | SARQ $0x03, R10 | ||
17456 | LEAL (R11)(R10*1), R11 | ||
17457 | JMP match_nolit_end_encodeSnappyBetterBlockAsm8B | ||
17458 | |||
17459 | matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17460 | CMPL DI, $0x04 | ||
17461 | JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17462 | MOVL (R8)(R11*1), R10 | ||
17463 | CMPL (R9)(R11*1), R10 | ||
17464 | JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17465 | LEAL -4(DI), DI | ||
17466 | LEAL 4(R11), R11 | ||
17467 | |||
17468 | matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17469 | CMPL DI, $0x01 | ||
17470 | JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17471 | JB match_nolit_end_encodeSnappyBetterBlockAsm8B | ||
17472 | MOVW (R8)(R11*1), R10 | ||
17473 | CMPW (R9)(R11*1), R10 | ||
17474 | JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17475 | LEAL 2(R11), R11 | ||
17476 | SUBL $0x02, DI | ||
17477 | JZ match_nolit_end_encodeSnappyBetterBlockAsm8B | ||
17478 | |||
17479 | matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17480 | MOVB (R8)(R11*1), R10 | ||
17481 | CMPB (R9)(R11*1), R10 | ||
17482 | JNE match_nolit_end_encodeSnappyBetterBlockAsm8B | ||
17483 | LEAL 1(R11), R11 | ||
17484 | |||
17485 | match_nolit_end_encodeSnappyBetterBlockAsm8B: | ||
17486 | MOVL CX, DI | ||
17487 | SUBL BX, DI | ||
17488 | |||
17489 | // Check if repeat | ||
17490 | MOVL DI, 16(SP) | ||
17491 | MOVL 12(SP), BX | ||
17492 | CMPL BX, SI | ||
17493 | JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B | ||
17494 | MOVL SI, R8 | ||
17495 | MOVL SI, 12(SP) | ||
17496 | LEAQ (DX)(BX*1), R9 | ||
17497 | SUBL BX, R8 | ||
17498 | LEAL -1(R8), BX | ||
17499 | CMPL BX, $0x3c | ||
17500 | JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B | ||
17501 | CMPL BX, $0x00000100 | ||
17502 | JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B | ||
17503 | JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B | ||
17504 | |||
17505 | three_bytes_match_emit_encodeSnappyBetterBlockAsm8B: | ||
17506 | MOVB $0xf4, (AX) | ||
17507 | MOVW BX, 1(AX) | ||
17508 | ADDQ $0x03, AX | ||
17509 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B | ||
17510 | |||
17511 | two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: | ||
17512 | MOVB $0xf0, (AX) | ||
17513 | MOVB BL, 1(AX) | ||
17514 | ADDQ $0x02, AX | ||
17515 | CMPL BX, $0x40 | ||
17516 | JB memmove_match_emit_encodeSnappyBetterBlockAsm8B | ||
17517 | JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B | ||
17518 | |||
17519 | one_byte_match_emit_encodeSnappyBetterBlockAsm8B: | ||
17520 | SHLB $0x02, BL | ||
17521 | MOVB BL, (AX) | ||
17522 | ADDQ $0x01, AX | ||
17523 | |||
17524 | memmove_match_emit_encodeSnappyBetterBlockAsm8B: | ||
17525 | LEAQ (AX)(R8*1), BX | ||
17526 | |||
17527 | // genMemMoveShort | ||
17528 | CMPQ R8, $0x08 | ||
17529 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 | ||
17530 | CMPQ R8, $0x10 | ||
17531 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 | ||
17532 | CMPQ R8, $0x20 | ||
17533 | JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 | ||
17534 | JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 | ||
17535 | |||
17536 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: | ||
17537 | MOVQ (R9), R10 | ||
17538 | MOVQ R10, (AX) | ||
17539 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B | ||
17540 | |||
17541 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: | ||
17542 | MOVQ (R9), R10 | ||
17543 | MOVQ -8(R9)(R8*1), R9 | ||
17544 | MOVQ R10, (AX) | ||
17545 | MOVQ R9, -8(AX)(R8*1) | ||
17546 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B | ||
17547 | |||
17548 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: | ||
17549 | MOVOU (R9), X0 | ||
17550 | MOVOU -16(R9)(R8*1), X1 | ||
17551 | MOVOU X0, (AX) | ||
17552 | MOVOU X1, -16(AX)(R8*1) | ||
17553 | JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B | ||
17554 | |||
17555 | emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: | ||
17556 | MOVOU (R9), X0 | ||
17557 | MOVOU 16(R9), X1 | ||
17558 | MOVOU -32(R9)(R8*1), X2 | ||
17559 | MOVOU -16(R9)(R8*1), X3 | ||
17560 | MOVOU X0, (AX) | ||
17561 | MOVOU X1, 16(AX) | ||
17562 | MOVOU X2, -32(AX)(R8*1) | ||
17563 | MOVOU X3, -16(AX)(R8*1) | ||
17564 | |||
17565 | memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: | ||
17566 | MOVQ BX, AX | ||
17567 | JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B | ||
17568 | |||
17569 | memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: | ||
17570 | LEAQ (AX)(R8*1), BX | ||
17571 | |||
17572 | // genMemMoveLong | ||
17573 | MOVOU (R9), X0 | ||
17574 | MOVOU 16(R9), X1 | ||
17575 | MOVOU -32(R9)(R8*1), X2 | ||
17576 | MOVOU -16(R9)(R8*1), X3 | ||
17577 | MOVQ R8, R12 | ||
17578 | SHRQ $0x05, R12 | ||
17579 | MOVQ AX, R10 | ||
17580 | ANDL $0x0000001f, R10 | ||
17581 | MOVQ $0x00000040, R13 | ||
17582 | SUBQ R10, R13 | ||
17583 | DECQ R12 | ||
17584 | JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
17585 | LEAQ -32(R9)(R13*1), R10 | ||
17586 | LEAQ -32(AX)(R13*1), R14 | ||
17587 | |||
17588 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: | ||
17589 | MOVOU (R10), X4 | ||
17590 | MOVOU 16(R10), X5 | ||
17591 | MOVOA X4, (R14) | ||
17592 | MOVOA X5, 16(R14) | ||
17593 | ADDQ $0x20, R14 | ||
17594 | ADDQ $0x20, R10 | ||
17595 | ADDQ $0x20, R13 | ||
17596 | DECQ R12 | ||
17597 | JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back | ||
17598 | |||
17599 | emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: | ||
17600 | MOVOU -32(R9)(R13*1), X4 | ||
17601 | MOVOU -16(R9)(R13*1), X5 | ||
17602 | MOVOA X4, -32(AX)(R13*1) | ||
17603 | MOVOA X5, -16(AX)(R13*1) | ||
17604 | ADDQ $0x20, R13 | ||
17605 | CMPQ R8, R13 | ||
17606 | JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
17607 | MOVOU X0, (AX) | ||
17608 | MOVOU X1, 16(AX) | ||
17609 | MOVOU X2, -32(AX)(R8*1) | ||
17610 | MOVOU X3, -16(AX)(R8*1) | ||
17611 | MOVQ BX, AX | ||
17612 | |||
17613 | emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: | ||
17614 | ADDL R11, CX | ||
17615 | ADDL $0x04, R11 | ||
17616 | MOVL CX, 12(SP) | ||
17617 | |||
17618 | // emitCopy | ||
17619 | two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17620 | CMPL R11, $0x40 | ||
17621 | JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17622 | MOVB $0xee, (AX) | ||
17623 | MOVW DI, 1(AX) | ||
17624 | LEAL -60(R11), R11 | ||
17625 | ADDQ $0x03, AX | ||
17626 | JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17627 | |||
17628 | two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17629 | MOVL R11, BX | ||
17630 | SHLL $0x02, BX | ||
17631 | CMPL R11, $0x0c | ||
17632 | JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B | ||
17633 | LEAL -15(BX), BX | ||
17634 | MOVB DI, 1(AX) | ||
17635 | SHRL $0x08, DI | ||
17636 | SHLL $0x05, DI | ||
17637 | ORL DI, BX | ||
17638 | MOVB BL, (AX) | ||
17639 | ADDQ $0x02, AX | ||
17640 | JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B | ||
17641 | |||
17642 | emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: | ||
17643 | LEAL -2(BX), BX | ||
17644 | MOVB BL, (AX) | ||
17645 | MOVW DI, 1(AX) | ||
17646 | ADDQ $0x03, AX | ||
17647 | |||
17648 | match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: | ||
17649 | CMPL CX, 8(SP) | ||
17650 | JAE emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17651 | CMPQ AX, (SP) | ||
17652 | JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B | ||
17653 | MOVQ $0x00000000, ret+48(FP) | ||
17654 | RET | ||
17655 | |||
17656 | match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: | ||
17657 | MOVQ $0x0000cf1bbcdcbf9b, BX | ||
17658 | MOVQ $0x9e3779b1, DI | ||
17659 | LEAQ 1(SI), SI | ||
17660 | LEAQ -2(CX), R8 | ||
17661 | MOVQ (DX)(SI*1), R9 | ||
17662 | MOVQ 1(DX)(SI*1), R10 | ||
17663 | MOVQ (DX)(R8*1), R11 | ||
17664 | MOVQ 1(DX)(R8*1), R12 | ||
17665 | SHLQ $0x10, R9 | ||
17666 | IMULQ BX, R9 | ||
17667 | SHRQ $0x36, R9 | ||
17668 | SHLQ $0x20, R10 | ||
17669 | IMULQ DI, R10 | ||
17670 | SHRQ $0x38, R10 | ||
17671 | SHLQ $0x10, R11 | ||
17672 | IMULQ BX, R11 | ||
17673 | SHRQ $0x36, R11 | ||
17674 | SHLQ $0x20, R12 | ||
17675 | IMULQ DI, R12 | ||
17676 | SHRQ $0x38, R12 | ||
17677 | LEAQ 1(SI), DI | ||
17678 | LEAQ 1(R8), R13 | ||
17679 | MOVL SI, 24(SP)(R9*4) | ||
17680 | MOVL R8, 24(SP)(R11*4) | ||
17681 | MOVL DI, 4120(SP)(R10*4) | ||
17682 | MOVL R13, 4120(SP)(R12*4) | ||
17683 | LEAQ 1(R8)(SI*1), DI | ||
17684 | SHRQ $0x01, DI | ||
17685 | ADDQ $0x01, SI | ||
17686 | SUBQ $0x01, R8 | ||
17687 | |||
17688 | index_loop_encodeSnappyBetterBlockAsm8B: | ||
17689 | CMPQ DI, R8 | ||
17690 | JAE search_loop_encodeSnappyBetterBlockAsm8B | ||
17691 | MOVQ (DX)(SI*1), R9 | ||
17692 | MOVQ (DX)(DI*1), R10 | ||
17693 | SHLQ $0x10, R9 | ||
17694 | IMULQ BX, R9 | ||
17695 | SHRQ $0x36, R9 | ||
17696 | SHLQ $0x10, R10 | ||
17697 | IMULQ BX, R10 | ||
17698 | SHRQ $0x36, R10 | ||
17699 | MOVL SI, 24(SP)(R9*4) | ||
17700 | MOVL DI, 24(SP)(R10*4) | ||
17701 | ADDQ $0x02, SI | ||
17702 | ADDQ $0x02, DI | ||
17703 | JMP index_loop_encodeSnappyBetterBlockAsm8B | ||
17704 | |||
17705 | emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17706 | MOVQ src_len+32(FP), CX | ||
17707 | SUBL 12(SP), CX | ||
17708 | LEAQ 3(AX)(CX*1), CX | ||
17709 | CMPQ CX, (SP) | ||
17710 | JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B | ||
17711 | MOVQ $0x00000000, ret+48(FP) | ||
17712 | RET | ||
17713 | |||
17714 | emit_remainder_ok_encodeSnappyBetterBlockAsm8B: | ||
17715 | MOVQ src_len+32(FP), CX | ||
17716 | MOVL 12(SP), BX | ||
17717 | CMPL BX, CX | ||
17718 | JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17719 | MOVL CX, SI | ||
17720 | MOVL CX, 12(SP) | ||
17721 | LEAQ (DX)(BX*1), CX | ||
17722 | SUBL BX, SI | ||
17723 | LEAL -1(SI), DX | ||
17724 | CMPL DX, $0x3c | ||
17725 | JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17726 | CMPL DX, $0x00000100 | ||
17727 | JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17728 | JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17729 | |||
17730 | three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17731 | MOVB $0xf4, (AX) | ||
17732 | MOVW DX, 1(AX) | ||
17733 | ADDQ $0x03, AX | ||
17734 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17735 | |||
17736 | two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17737 | MOVB $0xf0, (AX) | ||
17738 | MOVB DL, 1(AX) | ||
17739 | ADDQ $0x02, AX | ||
17740 | CMPL DX, $0x40 | ||
17741 | JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17742 | JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17743 | |||
17744 | one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17745 | SHLB $0x02, DL | ||
17746 | MOVB DL, (AX) | ||
17747 | ADDQ $0x01, AX | ||
17748 | |||
17749 | memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17750 | LEAQ (AX)(SI*1), DX | ||
17751 | MOVL SI, BX | ||
17752 | |||
17753 | // genMemMoveShort | ||
17754 | CMPQ BX, $0x03 | ||
17755 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 | ||
17756 | JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 | ||
17757 | CMPQ BX, $0x08 | ||
17758 | JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 | ||
17759 | CMPQ BX, $0x10 | ||
17760 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 | ||
17761 | CMPQ BX, $0x20 | ||
17762 | JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 | ||
17763 | JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 | ||
17764 | |||
17765 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: | ||
17766 | MOVB (CX), SI | ||
17767 | MOVB -1(CX)(BX*1), CL | ||
17768 | MOVB SI, (AX) | ||
17769 | MOVB CL, -1(AX)(BX*1) | ||
17770 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17771 | |||
17772 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: | ||
17773 | MOVW (CX), SI | ||
17774 | MOVB 2(CX), CL | ||
17775 | MOVW SI, (AX) | ||
17776 | MOVB CL, 2(AX) | ||
17777 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17778 | |||
17779 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: | ||
17780 | MOVL (CX), SI | ||
17781 | MOVL -4(CX)(BX*1), CX | ||
17782 | MOVL SI, (AX) | ||
17783 | MOVL CX, -4(AX)(BX*1) | ||
17784 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17785 | |||
17786 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: | ||
17787 | MOVQ (CX), SI | ||
17788 | MOVQ -8(CX)(BX*1), CX | ||
17789 | MOVQ SI, (AX) | ||
17790 | MOVQ CX, -8(AX)(BX*1) | ||
17791 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17792 | |||
17793 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: | ||
17794 | MOVOU (CX), X0 | ||
17795 | MOVOU -16(CX)(BX*1), X1 | ||
17796 | MOVOU X0, (AX) | ||
17797 | MOVOU X1, -16(AX)(BX*1) | ||
17798 | JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17799 | |||
17800 | emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: | ||
17801 | MOVOU (CX), X0 | ||
17802 | MOVOU 16(CX), X1 | ||
17803 | MOVOU -32(CX)(BX*1), X2 | ||
17804 | MOVOU -16(CX)(BX*1), X3 | ||
17805 | MOVOU X0, (AX) | ||
17806 | MOVOU X1, 16(AX) | ||
17807 | MOVOU X2, -32(AX)(BX*1) | ||
17808 | MOVOU X3, -16(AX)(BX*1) | ||
17809 | |||
17810 | memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17811 | MOVQ DX, AX | ||
17812 | JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B | ||
17813 | |||
17814 | memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17815 | LEAQ (AX)(SI*1), DX | ||
17816 | MOVL SI, BX | ||
17817 | |||
17818 | // genMemMoveLong | ||
17819 | MOVOU (CX), X0 | ||
17820 | MOVOU 16(CX), X1 | ||
17821 | MOVOU -32(CX)(BX*1), X2 | ||
17822 | MOVOU -16(CX)(BX*1), X3 | ||
17823 | MOVQ BX, DI | ||
17824 | SHRQ $0x05, DI | ||
17825 | MOVQ AX, SI | ||
17826 | ANDL $0x0000001f, SI | ||
17827 | MOVQ $0x00000040, R8 | ||
17828 | SUBQ SI, R8 | ||
17829 | DECQ DI | ||
17830 | JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
17831 | LEAQ -32(CX)(R8*1), SI | ||
17832 | LEAQ -32(AX)(R8*1), R9 | ||
17833 | |||
17834 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: | ||
17835 | MOVOU (SI), X4 | ||
17836 | MOVOU 16(SI), X5 | ||
17837 | MOVOA X4, (R9) | ||
17838 | MOVOA X5, 16(R9) | ||
17839 | ADDQ $0x20, R9 | ||
17840 | ADDQ $0x20, SI | ||
17841 | ADDQ $0x20, R8 | ||
17842 | DECQ DI | ||
17843 | JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back | ||
17844 | |||
17845 | emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: | ||
17846 | MOVOU -32(CX)(R8*1), X4 | ||
17847 | MOVOU -16(CX)(R8*1), X5 | ||
17848 | MOVOA X4, -32(AX)(R8*1) | ||
17849 | MOVOA X5, -16(AX)(R8*1) | ||
17850 | ADDQ $0x20, R8 | ||
17851 | CMPQ BX, R8 | ||
17852 | JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 | ||
17853 | MOVOU X0, (AX) | ||
17854 | MOVOU X1, 16(AX) | ||
17855 | MOVOU X2, -32(AX)(BX*1) | ||
17856 | MOVOU X3, -16(AX)(BX*1) | ||
17857 | MOVQ DX, AX | ||
17858 | |||
17859 | emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: | ||
17860 | MOVQ dst_base+0(FP), CX | ||
17861 | SUBQ CX, AX | ||
17862 | MOVQ AX, ret+48(FP) | ||
17863 | RET | ||
17864 | |||
17865 | // func calcBlockSize(src []byte) int | ||
17866 | // Requires: BMI, SSE2 | ||
17867 | TEXT ·calcBlockSize(SB), $32792-32 | ||
17868 | XORQ AX, AX | ||
17869 | MOVQ $0x00000100, CX | ||
17870 | LEAQ 24(SP), DX | ||
17871 | PXOR X0, X0 | ||
17872 | |||
17873 | zero_loop_calcBlockSize: | ||
17874 | MOVOU X0, (DX) | ||
17875 | MOVOU X0, 16(DX) | ||
17876 | MOVOU X0, 32(DX) | ||
17877 | MOVOU X0, 48(DX) | ||
17878 | MOVOU X0, 64(DX) | ||
17879 | MOVOU X0, 80(DX) | ||
17880 | MOVOU X0, 96(DX) | ||
17881 | MOVOU X0, 112(DX) | ||
17882 | ADDQ $0x80, DX | ||
17883 | DECQ CX | ||
17884 | JNZ zero_loop_calcBlockSize | ||
17885 | MOVL $0x00000000, 12(SP) | ||
17886 | MOVQ src_len+8(FP), CX | ||
17887 | LEAQ -9(CX), DX | ||
17888 | LEAQ -8(CX), BX | ||
17889 | MOVL BX, 8(SP) | ||
17890 | SHRQ $0x05, CX | ||
17891 | SUBL CX, DX | ||
17892 | LEAQ (AX)(DX*1), DX | ||
17893 | MOVQ DX, (SP) | ||
17894 | MOVL $0x00000001, CX | ||
17895 | MOVL CX, 16(SP) | ||
17896 | MOVQ src_base+0(FP), DX | ||
17897 | |||
17898 | search_loop_calcBlockSize: | ||
17899 | MOVL CX, BX | ||
17900 | SUBL 12(SP), BX | ||
17901 | SHRL $0x05, BX | ||
17902 | LEAL 4(CX)(BX*1), BX | ||
17903 | CMPL BX, 8(SP) | ||
17904 | JAE emit_remainder_calcBlockSize | ||
17905 | MOVQ (DX)(CX*1), SI | ||
17906 | MOVL BX, 20(SP) | ||
17907 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
17908 | MOVQ SI, R9 | ||
17909 | MOVQ SI, R10 | ||
17910 | SHRQ $0x08, R10 | ||
17911 | SHLQ $0x10, R9 | ||
17912 | IMULQ R8, R9 | ||
17913 | SHRQ $0x33, R9 | ||
17914 | SHLQ $0x10, R10 | ||
17915 | IMULQ R8, R10 | ||
17916 | SHRQ $0x33, R10 | ||
17917 | MOVL 24(SP)(R9*4), BX | ||
17918 | MOVL 24(SP)(R10*4), DI | ||
17919 | MOVL CX, 24(SP)(R9*4) | ||
17920 | LEAL 1(CX), R9 | ||
17921 | MOVL R9, 24(SP)(R10*4) | ||
17922 | MOVQ SI, R9 | ||
17923 | SHRQ $0x10, R9 | ||
17924 | SHLQ $0x10, R9 | ||
17925 | IMULQ R8, R9 | ||
17926 | SHRQ $0x33, R9 | ||
17927 | MOVL CX, R8 | ||
17928 | SUBL 16(SP), R8 | ||
17929 | MOVL 1(DX)(R8*1), R10 | ||
17930 | MOVQ SI, R8 | ||
17931 | SHRQ $0x08, R8 | ||
17932 | CMPL R8, R10 | ||
17933 | JNE no_repeat_found_calcBlockSize | ||
17934 | LEAL 1(CX), SI | ||
17935 | MOVL 12(SP), BX | ||
17936 | MOVL SI, DI | ||
17937 | SUBL 16(SP), DI | ||
17938 | JZ repeat_extend_back_end_calcBlockSize | ||
17939 | |||
17940 | repeat_extend_back_loop_calcBlockSize: | ||
17941 | CMPL SI, BX | ||
17942 | JBE repeat_extend_back_end_calcBlockSize | ||
17943 | MOVB -1(DX)(DI*1), R8 | ||
17944 | MOVB -1(DX)(SI*1), R9 | ||
17945 | CMPB R8, R9 | ||
17946 | JNE repeat_extend_back_end_calcBlockSize | ||
17947 | LEAL -1(SI), SI | ||
17948 | DECL DI | ||
17949 | JNZ repeat_extend_back_loop_calcBlockSize | ||
17950 | |||
17951 | repeat_extend_back_end_calcBlockSize: | ||
17952 | MOVL 12(SP), BX | ||
17953 | CMPL BX, SI | ||
17954 | JEQ emit_literal_done_repeat_emit_calcBlockSize | ||
17955 | MOVL SI, DI | ||
17956 | MOVL SI, 12(SP) | ||
17957 | LEAQ (DX)(BX*1), R8 | ||
17958 | SUBL BX, DI | ||
17959 | LEAL -1(DI), BX | ||
17960 | CMPL BX, $0x3c | ||
17961 | JB one_byte_repeat_emit_calcBlockSize | ||
17962 | CMPL BX, $0x00000100 | ||
17963 | JB two_bytes_repeat_emit_calcBlockSize | ||
17964 | CMPL BX, $0x00010000 | ||
17965 | JB three_bytes_repeat_emit_calcBlockSize | ||
17966 | CMPL BX, $0x01000000 | ||
17967 | JB four_bytes_repeat_emit_calcBlockSize | ||
17968 | ADDQ $0x05, AX | ||
17969 | JMP memmove_long_repeat_emit_calcBlockSize | ||
17970 | |||
17971 | four_bytes_repeat_emit_calcBlockSize: | ||
17972 | ADDQ $0x04, AX | ||
17973 | JMP memmove_long_repeat_emit_calcBlockSize | ||
17974 | |||
17975 | three_bytes_repeat_emit_calcBlockSize: | ||
17976 | ADDQ $0x03, AX | ||
17977 | JMP memmove_long_repeat_emit_calcBlockSize | ||
17978 | |||
17979 | two_bytes_repeat_emit_calcBlockSize: | ||
17980 | ADDQ $0x02, AX | ||
17981 | CMPL BX, $0x40 | ||
17982 | JB memmove_repeat_emit_calcBlockSize | ||
17983 | JMP memmove_long_repeat_emit_calcBlockSize | ||
17984 | |||
17985 | one_byte_repeat_emit_calcBlockSize: | ||
17986 | ADDQ $0x01, AX | ||
17987 | |||
17988 | memmove_repeat_emit_calcBlockSize: | ||
17989 | LEAQ (AX)(DI*1), AX | ||
17990 | JMP emit_literal_done_repeat_emit_calcBlockSize | ||
17991 | |||
17992 | memmove_long_repeat_emit_calcBlockSize: | ||
17993 | LEAQ (AX)(DI*1), AX | ||
17994 | |||
17995 | emit_literal_done_repeat_emit_calcBlockSize: | ||
17996 | ADDL $0x05, CX | ||
17997 | MOVL CX, BX | ||
17998 | SUBL 16(SP), BX | ||
17999 | MOVQ src_len+8(FP), DI | ||
18000 | SUBL CX, DI | ||
18001 | LEAQ (DX)(CX*1), R8 | ||
18002 | LEAQ (DX)(BX*1), BX | ||
18003 | |||
18004 | // matchLen | ||
18005 | XORL R10, R10 | ||
18006 | |||
18007 | matchlen_loopback_16_repeat_extend_calcBlockSize: | ||
18008 | CMPL DI, $0x10 | ||
18009 | JB matchlen_match8_repeat_extend_calcBlockSize | ||
18010 | MOVQ (R8)(R10*1), R9 | ||
18011 | MOVQ 8(R8)(R10*1), R11 | ||
18012 | XORQ (BX)(R10*1), R9 | ||
18013 | JNZ matchlen_bsf_8_repeat_extend_calcBlockSize | ||
18014 | XORQ 8(BX)(R10*1), R11 | ||
18015 | JNZ matchlen_bsf_16repeat_extend_calcBlockSize | ||
18016 | LEAL -16(DI), DI | ||
18017 | LEAL 16(R10), R10 | ||
18018 | JMP matchlen_loopback_16_repeat_extend_calcBlockSize | ||
18019 | |||
18020 | matchlen_bsf_16repeat_extend_calcBlockSize: | ||
18021 | #ifdef GOAMD64_v3 | ||
18022 | TZCNTQ R11, R11 | ||
18023 | |||
18024 | #else | ||
18025 | BSFQ R11, R11 | ||
18026 | |||
18027 | #endif | ||
18028 | SARQ $0x03, R11 | ||
18029 | LEAL 8(R10)(R11*1), R10 | ||
18030 | JMP repeat_extend_forward_end_calcBlockSize | ||
18031 | |||
18032 | matchlen_match8_repeat_extend_calcBlockSize: | ||
18033 | CMPL DI, $0x08 | ||
18034 | JB matchlen_match4_repeat_extend_calcBlockSize | ||
18035 | MOVQ (R8)(R10*1), R9 | ||
18036 | XORQ (BX)(R10*1), R9 | ||
18037 | JNZ matchlen_bsf_8_repeat_extend_calcBlockSize | ||
18038 | LEAL -8(DI), DI | ||
18039 | LEAL 8(R10), R10 | ||
18040 | JMP matchlen_match4_repeat_extend_calcBlockSize | ||
18041 | |||
18042 | matchlen_bsf_8_repeat_extend_calcBlockSize: | ||
18043 | #ifdef GOAMD64_v3 | ||
18044 | TZCNTQ R9, R9 | ||
18045 | |||
18046 | #else | ||
18047 | BSFQ R9, R9 | ||
18048 | |||
18049 | #endif | ||
18050 | SARQ $0x03, R9 | ||
18051 | LEAL (R10)(R9*1), R10 | ||
18052 | JMP repeat_extend_forward_end_calcBlockSize | ||
18053 | |||
18054 | matchlen_match4_repeat_extend_calcBlockSize: | ||
18055 | CMPL DI, $0x04 | ||
18056 | JB matchlen_match2_repeat_extend_calcBlockSize | ||
18057 | MOVL (R8)(R10*1), R9 | ||
18058 | CMPL (BX)(R10*1), R9 | ||
18059 | JNE matchlen_match2_repeat_extend_calcBlockSize | ||
18060 | LEAL -4(DI), DI | ||
18061 | LEAL 4(R10), R10 | ||
18062 | |||
18063 | matchlen_match2_repeat_extend_calcBlockSize: | ||
18064 | CMPL DI, $0x01 | ||
18065 | JE matchlen_match1_repeat_extend_calcBlockSize | ||
18066 | JB repeat_extend_forward_end_calcBlockSize | ||
18067 | MOVW (R8)(R10*1), R9 | ||
18068 | CMPW (BX)(R10*1), R9 | ||
18069 | JNE matchlen_match1_repeat_extend_calcBlockSize | ||
18070 | LEAL 2(R10), R10 | ||
18071 | SUBL $0x02, DI | ||
18072 | JZ repeat_extend_forward_end_calcBlockSize | ||
18073 | |||
18074 | matchlen_match1_repeat_extend_calcBlockSize: | ||
18075 | MOVB (R8)(R10*1), R9 | ||
18076 | CMPB (BX)(R10*1), R9 | ||
18077 | JNE repeat_extend_forward_end_calcBlockSize | ||
18078 | LEAL 1(R10), R10 | ||
18079 | |||
18080 | repeat_extend_forward_end_calcBlockSize: | ||
18081 | ADDL R10, CX | ||
18082 | MOVL CX, BX | ||
18083 | SUBL SI, BX | ||
18084 | MOVL 16(SP), SI | ||
18085 | |||
18086 | // emitCopy | ||
18087 | CMPL SI, $0x00010000 | ||
18088 | JB two_byte_offset_repeat_as_copy_calcBlockSize | ||
18089 | |||
18090 | four_bytes_loop_back_repeat_as_copy_calcBlockSize: | ||
18091 | CMPL BX, $0x40 | ||
18092 | JBE four_bytes_remain_repeat_as_copy_calcBlockSize | ||
18093 | LEAL -64(BX), BX | ||
18094 | ADDQ $0x05, AX | ||
18095 | CMPL BX, $0x04 | ||
18096 | JB four_bytes_remain_repeat_as_copy_calcBlockSize | ||
18097 | JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize | ||
18098 | |||
18099 | four_bytes_remain_repeat_as_copy_calcBlockSize: | ||
18100 | TESTL BX, BX | ||
18101 | JZ repeat_end_emit_calcBlockSize | ||
18102 | XORL BX, BX | ||
18103 | ADDQ $0x05, AX | ||
18104 | JMP repeat_end_emit_calcBlockSize | ||
18105 | |||
18106 | two_byte_offset_repeat_as_copy_calcBlockSize: | ||
18107 | CMPL BX, $0x40 | ||
18108 | JBE two_byte_offset_short_repeat_as_copy_calcBlockSize | ||
18109 | LEAL -60(BX), BX | ||
18110 | ADDQ $0x03, AX | ||
18111 | JMP two_byte_offset_repeat_as_copy_calcBlockSize | ||
18112 | |||
18113 | two_byte_offset_short_repeat_as_copy_calcBlockSize: | ||
18114 | MOVL BX, DI | ||
18115 | SHLL $0x02, DI | ||
18116 | CMPL BX, $0x0c | ||
18117 | JAE emit_copy_three_repeat_as_copy_calcBlockSize | ||
18118 | CMPL SI, $0x00000800 | ||
18119 | JAE emit_copy_three_repeat_as_copy_calcBlockSize | ||
18120 | ADDQ $0x02, AX | ||
18121 | JMP repeat_end_emit_calcBlockSize | ||
18122 | |||
18123 | emit_copy_three_repeat_as_copy_calcBlockSize: | ||
18124 | ADDQ $0x03, AX | ||
18125 | |||
18126 | repeat_end_emit_calcBlockSize: | ||
18127 | MOVL CX, 12(SP) | ||
18128 | JMP search_loop_calcBlockSize | ||
18129 | |||
18130 | no_repeat_found_calcBlockSize: | ||
18131 | CMPL (DX)(BX*1), SI | ||
18132 | JEQ candidate_match_calcBlockSize | ||
18133 | SHRQ $0x08, SI | ||
18134 | MOVL 24(SP)(R9*4), BX | ||
18135 | LEAL 2(CX), R8 | ||
18136 | CMPL (DX)(DI*1), SI | ||
18137 | JEQ candidate2_match_calcBlockSize | ||
18138 | MOVL R8, 24(SP)(R9*4) | ||
18139 | SHRQ $0x08, SI | ||
18140 | CMPL (DX)(BX*1), SI | ||
18141 | JEQ candidate3_match_calcBlockSize | ||
18142 | MOVL 20(SP), CX | ||
18143 | JMP search_loop_calcBlockSize | ||
18144 | |||
18145 | candidate3_match_calcBlockSize: | ||
18146 | ADDL $0x02, CX | ||
18147 | JMP candidate_match_calcBlockSize | ||
18148 | |||
18149 | candidate2_match_calcBlockSize: | ||
18150 | MOVL R8, 24(SP)(R9*4) | ||
18151 | INCL CX | ||
18152 | MOVL DI, BX | ||
18153 | |||
18154 | candidate_match_calcBlockSize: | ||
18155 | MOVL 12(SP), SI | ||
18156 | TESTL BX, BX | ||
18157 | JZ match_extend_back_end_calcBlockSize | ||
18158 | |||
18159 | match_extend_back_loop_calcBlockSize: | ||
18160 | CMPL CX, SI | ||
18161 | JBE match_extend_back_end_calcBlockSize | ||
18162 | MOVB -1(DX)(BX*1), DI | ||
18163 | MOVB -1(DX)(CX*1), R8 | ||
18164 | CMPB DI, R8 | ||
18165 | JNE match_extend_back_end_calcBlockSize | ||
18166 | LEAL -1(CX), CX | ||
18167 | DECL BX | ||
18168 | JZ match_extend_back_end_calcBlockSize | ||
18169 | JMP match_extend_back_loop_calcBlockSize | ||
18170 | |||
18171 | match_extend_back_end_calcBlockSize: | ||
18172 | MOVL CX, SI | ||
18173 | SUBL 12(SP), SI | ||
18174 | LEAQ 5(AX)(SI*1), SI | ||
18175 | CMPQ SI, (SP) | ||
18176 | JB match_dst_size_check_calcBlockSize | ||
18177 | MOVQ $0x00000000, ret+24(FP) | ||
18178 | RET | ||
18179 | |||
18180 | match_dst_size_check_calcBlockSize: | ||
18181 | MOVL CX, SI | ||
18182 | MOVL 12(SP), DI | ||
18183 | CMPL DI, SI | ||
18184 | JEQ emit_literal_done_match_emit_calcBlockSize | ||
18185 | MOVL SI, R8 | ||
18186 | MOVL SI, 12(SP) | ||
18187 | LEAQ (DX)(DI*1), SI | ||
18188 | SUBL DI, R8 | ||
18189 | LEAL -1(R8), SI | ||
18190 | CMPL SI, $0x3c | ||
18191 | JB one_byte_match_emit_calcBlockSize | ||
18192 | CMPL SI, $0x00000100 | ||
18193 | JB two_bytes_match_emit_calcBlockSize | ||
18194 | CMPL SI, $0x00010000 | ||
18195 | JB three_bytes_match_emit_calcBlockSize | ||
18196 | CMPL SI, $0x01000000 | ||
18197 | JB four_bytes_match_emit_calcBlockSize | ||
18198 | ADDQ $0x05, AX | ||
18199 | JMP memmove_long_match_emit_calcBlockSize | ||
18200 | |||
18201 | four_bytes_match_emit_calcBlockSize: | ||
18202 | ADDQ $0x04, AX | ||
18203 | JMP memmove_long_match_emit_calcBlockSize | ||
18204 | |||
18205 | three_bytes_match_emit_calcBlockSize: | ||
18206 | ADDQ $0x03, AX | ||
18207 | JMP memmove_long_match_emit_calcBlockSize | ||
18208 | |||
18209 | two_bytes_match_emit_calcBlockSize: | ||
18210 | ADDQ $0x02, AX | ||
18211 | CMPL SI, $0x40 | ||
18212 | JB memmove_match_emit_calcBlockSize | ||
18213 | JMP memmove_long_match_emit_calcBlockSize | ||
18214 | |||
18215 | one_byte_match_emit_calcBlockSize: | ||
18216 | ADDQ $0x01, AX | ||
18217 | |||
18218 | memmove_match_emit_calcBlockSize: | ||
18219 | LEAQ (AX)(R8*1), AX | ||
18220 | JMP emit_literal_done_match_emit_calcBlockSize | ||
18221 | |||
18222 | memmove_long_match_emit_calcBlockSize: | ||
18223 | LEAQ (AX)(R8*1), AX | ||
18224 | |||
18225 | emit_literal_done_match_emit_calcBlockSize: | ||
18226 | match_nolit_loop_calcBlockSize: | ||
18227 | MOVL CX, SI | ||
18228 | SUBL BX, SI | ||
18229 | MOVL SI, 16(SP) | ||
18230 | ADDL $0x04, CX | ||
18231 | ADDL $0x04, BX | ||
18232 | MOVQ src_len+8(FP), SI | ||
18233 | SUBL CX, SI | ||
18234 | LEAQ (DX)(CX*1), DI | ||
18235 | LEAQ (DX)(BX*1), BX | ||
18236 | |||
18237 | // matchLen | ||
18238 | XORL R9, R9 | ||
18239 | |||
18240 | matchlen_loopback_16_match_nolit_calcBlockSize: | ||
18241 | CMPL SI, $0x10 | ||
18242 | JB matchlen_match8_match_nolit_calcBlockSize | ||
18243 | MOVQ (DI)(R9*1), R8 | ||
18244 | MOVQ 8(DI)(R9*1), R10 | ||
18245 | XORQ (BX)(R9*1), R8 | ||
18246 | JNZ matchlen_bsf_8_match_nolit_calcBlockSize | ||
18247 | XORQ 8(BX)(R9*1), R10 | ||
18248 | JNZ matchlen_bsf_16match_nolit_calcBlockSize | ||
18249 | LEAL -16(SI), SI | ||
18250 | LEAL 16(R9), R9 | ||
18251 | JMP matchlen_loopback_16_match_nolit_calcBlockSize | ||
18252 | |||
18253 | matchlen_bsf_16match_nolit_calcBlockSize: | ||
18254 | #ifdef GOAMD64_v3 | ||
18255 | TZCNTQ R10, R10 | ||
18256 | |||
18257 | #else | ||
18258 | BSFQ R10, R10 | ||
18259 | |||
18260 | #endif | ||
18261 | SARQ $0x03, R10 | ||
18262 | LEAL 8(R9)(R10*1), R9 | ||
18263 | JMP match_nolit_end_calcBlockSize | ||
18264 | |||
18265 | matchlen_match8_match_nolit_calcBlockSize: | ||
18266 | CMPL SI, $0x08 | ||
18267 | JB matchlen_match4_match_nolit_calcBlockSize | ||
18268 | MOVQ (DI)(R9*1), R8 | ||
18269 | XORQ (BX)(R9*1), R8 | ||
18270 | JNZ matchlen_bsf_8_match_nolit_calcBlockSize | ||
18271 | LEAL -8(SI), SI | ||
18272 | LEAL 8(R9), R9 | ||
18273 | JMP matchlen_match4_match_nolit_calcBlockSize | ||
18274 | |||
18275 | matchlen_bsf_8_match_nolit_calcBlockSize: | ||
18276 | #ifdef GOAMD64_v3 | ||
18277 | TZCNTQ R8, R8 | ||
18278 | |||
18279 | #else | ||
18280 | BSFQ R8, R8 | ||
18281 | |||
18282 | #endif | ||
18283 | SARQ $0x03, R8 | ||
18284 | LEAL (R9)(R8*1), R9 | ||
18285 | JMP match_nolit_end_calcBlockSize | ||
18286 | |||
18287 | matchlen_match4_match_nolit_calcBlockSize: | ||
18288 | CMPL SI, $0x04 | ||
18289 | JB matchlen_match2_match_nolit_calcBlockSize | ||
18290 | MOVL (DI)(R9*1), R8 | ||
18291 | CMPL (BX)(R9*1), R8 | ||
18292 | JNE matchlen_match2_match_nolit_calcBlockSize | ||
18293 | LEAL -4(SI), SI | ||
18294 | LEAL 4(R9), R9 | ||
18295 | |||
18296 | matchlen_match2_match_nolit_calcBlockSize: | ||
18297 | CMPL SI, $0x01 | ||
18298 | JE matchlen_match1_match_nolit_calcBlockSize | ||
18299 | JB match_nolit_end_calcBlockSize | ||
18300 | MOVW (DI)(R9*1), R8 | ||
18301 | CMPW (BX)(R9*1), R8 | ||
18302 | JNE matchlen_match1_match_nolit_calcBlockSize | ||
18303 | LEAL 2(R9), R9 | ||
18304 | SUBL $0x02, SI | ||
18305 | JZ match_nolit_end_calcBlockSize | ||
18306 | |||
18307 | matchlen_match1_match_nolit_calcBlockSize: | ||
18308 | MOVB (DI)(R9*1), R8 | ||
18309 | CMPB (BX)(R9*1), R8 | ||
18310 | JNE match_nolit_end_calcBlockSize | ||
18311 | LEAL 1(R9), R9 | ||
18312 | |||
18313 | match_nolit_end_calcBlockSize: | ||
18314 | ADDL R9, CX | ||
18315 | MOVL 16(SP), BX | ||
18316 | ADDL $0x04, R9 | ||
18317 | MOVL CX, 12(SP) | ||
18318 | |||
18319 | // emitCopy | ||
18320 | CMPL BX, $0x00010000 | ||
18321 | JB two_byte_offset_match_nolit_calcBlockSize | ||
18322 | |||
18323 | four_bytes_loop_back_match_nolit_calcBlockSize: | ||
18324 | CMPL R9, $0x40 | ||
18325 | JBE four_bytes_remain_match_nolit_calcBlockSize | ||
18326 | LEAL -64(R9), R9 | ||
18327 | ADDQ $0x05, AX | ||
18328 | CMPL R9, $0x04 | ||
18329 | JB four_bytes_remain_match_nolit_calcBlockSize | ||
18330 | JMP four_bytes_loop_back_match_nolit_calcBlockSize | ||
18331 | |||
18332 | four_bytes_remain_match_nolit_calcBlockSize: | ||
18333 | TESTL R9, R9 | ||
18334 | JZ match_nolit_emitcopy_end_calcBlockSize | ||
18335 | XORL BX, BX | ||
18336 | ADDQ $0x05, AX | ||
18337 | JMP match_nolit_emitcopy_end_calcBlockSize | ||
18338 | |||
18339 | two_byte_offset_match_nolit_calcBlockSize: | ||
18340 | CMPL R9, $0x40 | ||
18341 | JBE two_byte_offset_short_match_nolit_calcBlockSize | ||
18342 | LEAL -60(R9), R9 | ||
18343 | ADDQ $0x03, AX | ||
18344 | JMP two_byte_offset_match_nolit_calcBlockSize | ||
18345 | |||
18346 | two_byte_offset_short_match_nolit_calcBlockSize: | ||
18347 | MOVL R9, SI | ||
18348 | SHLL $0x02, SI | ||
18349 | CMPL R9, $0x0c | ||
18350 | JAE emit_copy_three_match_nolit_calcBlockSize | ||
18351 | CMPL BX, $0x00000800 | ||
18352 | JAE emit_copy_three_match_nolit_calcBlockSize | ||
18353 | ADDQ $0x02, AX | ||
18354 | JMP match_nolit_emitcopy_end_calcBlockSize | ||
18355 | |||
18356 | emit_copy_three_match_nolit_calcBlockSize: | ||
18357 | ADDQ $0x03, AX | ||
18358 | |||
18359 | match_nolit_emitcopy_end_calcBlockSize: | ||
18360 | CMPL CX, 8(SP) | ||
18361 | JAE emit_remainder_calcBlockSize | ||
18362 | MOVQ -2(DX)(CX*1), SI | ||
18363 | CMPQ AX, (SP) | ||
18364 | JB match_nolit_dst_ok_calcBlockSize | ||
18365 | MOVQ $0x00000000, ret+24(FP) | ||
18366 | RET | ||
18367 | |||
18368 | match_nolit_dst_ok_calcBlockSize: | ||
18369 | MOVQ $0x0000cf1bbcdcbf9b, R8 | ||
18370 | MOVQ SI, DI | ||
18371 | SHRQ $0x10, SI | ||
18372 | MOVQ SI, BX | ||
18373 | SHLQ $0x10, DI | ||
18374 | IMULQ R8, DI | ||
18375 | SHRQ $0x33, DI | ||
18376 | SHLQ $0x10, BX | ||
18377 | IMULQ R8, BX | ||
18378 | SHRQ $0x33, BX | ||
18379 | LEAL -2(CX), R8 | ||
18380 | LEAQ 24(SP)(BX*4), R9 | ||
18381 | MOVL (R9), BX | ||
18382 | MOVL R8, 24(SP)(DI*4) | ||
18383 | MOVL CX, (R9) | ||
18384 | CMPL (DX)(BX*1), SI | ||
18385 | JEQ match_nolit_loop_calcBlockSize | ||
18386 | INCL CX | ||
18387 | JMP search_loop_calcBlockSize | ||
18388 | |||
18389 | emit_remainder_calcBlockSize: | ||
18390 | MOVQ src_len+8(FP), CX | ||
18391 | SUBL 12(SP), CX | ||
18392 | LEAQ 5(AX)(CX*1), CX | ||
18393 | CMPQ CX, (SP) | ||
18394 | JB emit_remainder_ok_calcBlockSize | ||
18395 | MOVQ $0x00000000, ret+24(FP) | ||
18396 | RET | ||
18397 | |||
18398 | emit_remainder_ok_calcBlockSize: | ||
18399 | MOVQ src_len+8(FP), CX | ||
18400 | MOVL 12(SP), BX | ||
18401 | CMPL BX, CX | ||
18402 | JEQ emit_literal_done_emit_remainder_calcBlockSize | ||
18403 | MOVL CX, SI | ||
18404 | MOVL CX, 12(SP) | ||
18405 | LEAQ (DX)(BX*1), CX | ||
18406 | SUBL BX, SI | ||
18407 | LEAL -1(SI), CX | ||
18408 | CMPL CX, $0x3c | ||
18409 | JB one_byte_emit_remainder_calcBlockSize | ||
18410 | CMPL CX, $0x00000100 | ||
18411 | JB two_bytes_emit_remainder_calcBlockSize | ||
18412 | CMPL CX, $0x00010000 | ||
18413 | JB three_bytes_emit_remainder_calcBlockSize | ||
18414 | CMPL CX, $0x01000000 | ||
18415 | JB four_bytes_emit_remainder_calcBlockSize | ||
18416 | ADDQ $0x05, AX | ||
18417 | JMP memmove_long_emit_remainder_calcBlockSize | ||
18418 | |||
18419 | four_bytes_emit_remainder_calcBlockSize: | ||
18420 | ADDQ $0x04, AX | ||
18421 | JMP memmove_long_emit_remainder_calcBlockSize | ||
18422 | |||
18423 | three_bytes_emit_remainder_calcBlockSize: | ||
18424 | ADDQ $0x03, AX | ||
18425 | JMP memmove_long_emit_remainder_calcBlockSize | ||
18426 | |||
18427 | two_bytes_emit_remainder_calcBlockSize: | ||
18428 | ADDQ $0x02, AX | ||
18429 | CMPL CX, $0x40 | ||
18430 | JB memmove_emit_remainder_calcBlockSize | ||
18431 | JMP memmove_long_emit_remainder_calcBlockSize | ||
18432 | |||
18433 | one_byte_emit_remainder_calcBlockSize: | ||
18434 | ADDQ $0x01, AX | ||
18435 | |||
18436 | memmove_emit_remainder_calcBlockSize: | ||
18437 | LEAQ (AX)(SI*1), AX | ||
18438 | JMP emit_literal_done_emit_remainder_calcBlockSize | ||
18439 | |||
18440 | memmove_long_emit_remainder_calcBlockSize: | ||
18441 | LEAQ (AX)(SI*1), AX | ||
18442 | |||
18443 | emit_literal_done_emit_remainder_calcBlockSize: | ||
18444 | MOVQ AX, ret+24(FP) | ||
18445 | RET | ||
18446 | |||
// calcBlockSizeSmall walks src with a hash-table match search and adds up, in AX,
// the header/literal/copy byte counts that each step would contribute; nothing is
// written to an output buffer. The final AX is returned, or 0 as soon as the
// running total reaches the limit kept at (SP).
// NOTE(review): judging by the labels (emit_literal, emitCopy, memmove) this mirrors
// the size accounting of a block encoder for small inputs — confirm against the generator.
//
// Stack frame layout as used below:
//   (SP)    8 bytes  size limit: src_len - 9 - src_len/32
//   8(SP)   4 bytes  src_len - 8, the position at which searching stops
//   12(SP)  4 bytes  position in src up to which bytes have been accounted for
//   16(SP)  4 bytes  offset of the most recent match (starts at 1)
//   20(SP)  4 bytes  next position to probe when no candidate matches
//   24(SP)  2048 bytes  table of 512 4-byte positions indexed by a 9-bit hash
18447 | // func calcBlockSizeSmall(src []byte) int | ||
18448 | // Requires: BMI, SSE2 | ||
18449 | TEXT ·calcBlockSizeSmall(SB), $2072-32 | ||
18450 | XORQ AX, AX | ||
18451 | MOVQ $0x00000010, CX | ||
18452 | LEAQ 24(SP), DX | ||
18453 | PXOR X0, X0 | ||
18454 | |||
// Clear the table: 16 iterations x 128 bytes = 2048 bytes starting at 24(SP).
18455 | zero_loop_calcBlockSizeSmall: | ||
18456 | MOVOU X0, (DX) | ||
18457 | MOVOU X0, 16(DX) | ||
18458 | MOVOU X0, 32(DX) | ||
18459 | MOVOU X0, 48(DX) | ||
18460 | MOVOU X0, 64(DX) | ||
18461 | MOVOU X0, 80(DX) | ||
18462 | MOVOU X0, 96(DX) | ||
18463 | MOVOU X0, 112(DX) | ||
18464 | ADDQ $0x80, DX | ||
18465 | DECQ CX | ||
18466 | JNZ zero_loop_calcBlockSizeSmall | ||
// Initialise the bookkeeping slots described in the header; AX is 0 here, so the
// LEAQ below stores src_len - 9 - src_len/32 as the limit. From here on CX is the
// current position in src and DX is the base address of src.
18467 | MOVL $0x00000000, 12(SP) | ||
18468 | MOVQ src_len+8(FP), CX | ||
18469 | LEAQ -9(CX), DX | ||
18470 | LEAQ -8(CX), BX | ||
18471 | MOVL BX, 8(SP) | ||
18472 | SHRQ $0x05, CX | ||
18473 | SUBL CX, DX | ||
18474 | LEAQ (AX)(DX*1), DX | ||
18475 | MOVQ DX, (SP) | ||
18476 | MOVL $0x00000001, CX | ||
18477 | MOVL CX, 16(SP) | ||
18478 | MOVQ src_base+0(FP), DX | ||
18479 | |||
// Main search loop. The next probe position is CX + 4 + (CX - 12(SP))/16, so the
// stride grows with the distance since the last accounted position. Once it would
// reach 8(SP) the rest of src is accounted for as one literal run.
18480 | search_loop_calcBlockSizeSmall: | ||
18481 | MOVL CX, BX | ||
18482 | SUBL 12(SP), BX | ||
18483 | SHRL $0x04, BX | ||
18484 | LEAL 4(CX)(BX*1), BX | ||
18485 | CMPL BX, 8(SP) | ||
18486 | JAE emit_remainder_calcBlockSizeSmall | ||
18487 | MOVQ (DX)(CX*1), SI | ||
18488 | MOVL BX, 20(SP) | ||
// Hash: ((x << 32) * 0x9e3779b1) >> 55, i.e. a 9-bit index computed from the low
// 4 bytes of x. R9 hashes the bytes at CX, R10 the bytes at CX+1.
18489 | MOVQ $0x9e3779b1, R8 | ||
18490 | MOVQ SI, R9 | ||
18491 | MOVQ SI, R10 | ||
18492 | SHRQ $0x08, R10 | ||
18493 | SHLQ $0x20, R9 | ||
18494 | IMULQ R8, R9 | ||
18495 | SHRQ $0x37, R9 | ||
18496 | SHLQ $0x20, R10 | ||
18497 | IMULQ R8, R10 | ||
18498 | SHRQ $0x37, R10 | ||
// Fetch the previous occupants of both slots (candidates BX and DI), then record
// positions CX and CX+1 in their place.
18499 | MOVL 24(SP)(R9*4), BX | ||
18500 | MOVL 24(SP)(R10*4), DI | ||
18501 | MOVL CX, 24(SP)(R9*4) | ||
18502 | LEAL 1(CX), R9 | ||
18503 | MOVL R9, 24(SP)(R10*4) | ||
// R9 = hash of the bytes at CX+2, used by the third candidate check further down.
18504 | MOVQ SI, R9 | ||
18505 | SHRQ $0x10, R9 | ||
18506 | SHLQ $0x20, R9 | ||
18507 | IMULQ R8, R9 | ||
18508 | SHRQ $0x37, R9 | ||
// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at CX+1-16(SP).
18509 | MOVL CX, R8 | ||
18510 | SUBL 16(SP), R8 | ||
18511 | MOVL 1(DX)(R8*1), R10 | ||
18512 | MOVQ SI, R8 | ||
18513 | SHRQ $0x08, R8 | ||
18514 | CMPL R8, R10 | ||
18515 | JNE no_repeat_found_calcBlockSizeSmall | ||
// Repeat found at SI = CX+1; DI is the matching earlier position. Skip the
// backward extension when DI is already 0.
18516 | LEAL 1(CX), SI | ||
18517 | MOVL 12(SP), BX | ||
18518 | MOVL SI, DI | ||
18519 | SUBL 16(SP), DI | ||
18520 | JZ repeat_extend_back_end_calcBlockSizeSmall | ||
18521 | |||
// Extend the repeat backwards one byte at a time while SI stays above 12(SP),
// the preceding bytes are equal, and DI has not reached 0.
18522 | repeat_extend_back_loop_calcBlockSizeSmall: | ||
18523 | CMPL SI, BX | ||
18524 | JBE repeat_extend_back_end_calcBlockSizeSmall | ||
18525 | MOVB -1(DX)(DI*1), R8 | ||
18526 | MOVB -1(DX)(SI*1), R9 | ||
18527 | CMPB R8, R9 | ||
18528 | JNE repeat_extend_back_end_calcBlockSizeSmall | ||
18529 | LEAL -1(SI), SI | ||
18530 | DECL DI | ||
18531 | JNZ repeat_extend_back_loop_calcBlockSizeSmall | ||
18532 | |||
// Account for the literal run [12(SP), SI): DI = run length, BX = length - 1.
// The header costs 1 byte when BX < 60, 2 bytes when BX < 256, otherwise 3 bytes;
// then the run length itself is added to AX. R8 (address of the run) is computed
// but not read before it is overwritten below.
18533 | repeat_extend_back_end_calcBlockSizeSmall: | ||
18534 | MOVL 12(SP), BX | ||
18535 | CMPL BX, SI | ||
18536 | JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall | ||
18537 | MOVL SI, DI | ||
18538 | MOVL SI, 12(SP) | ||
18539 | LEAQ (DX)(BX*1), R8 | ||
18540 | SUBL BX, DI | ||
18541 | LEAL -1(DI), BX | ||
18542 | CMPL BX, $0x3c | ||
18543 | JB one_byte_repeat_emit_calcBlockSizeSmall | ||
18544 | CMPL BX, $0x00000100 | ||
18545 | JB two_bytes_repeat_emit_calcBlockSizeSmall | ||
// The second JB tests the same flags and targets the label that immediately
// follows, so execution reaches three_bytes_* whether or not it is taken.
18546 | JB three_bytes_repeat_emit_calcBlockSizeSmall | ||
18547 | |||
18548 | three_bytes_repeat_emit_calcBlockSizeSmall: | ||
18549 | ADDQ $0x03, AX | ||
18550 | JMP memmove_long_repeat_emit_calcBlockSizeSmall | ||
18551 | |||
18552 | two_bytes_repeat_emit_calcBlockSizeSmall: | ||
18553 | ADDQ $0x02, AX | ||
18554 | CMPL BX, $0x40 | ||
18555 | JB memmove_repeat_emit_calcBlockSizeSmall | ||
18556 | JMP memmove_long_repeat_emit_calcBlockSizeSmall | ||
18557 | |||
18558 | one_byte_repeat_emit_calcBlockSizeSmall: | ||
18559 | ADDQ $0x01, AX | ||
18560 | |||
// Both "memmove" labels only add the literal length to AX; no bytes are copied.
18561 | memmove_repeat_emit_calcBlockSizeSmall: | ||
18562 | LEAQ (AX)(DI*1), AX | ||
18563 | JMP emit_literal_done_repeat_emit_calcBlockSizeSmall | ||
18564 | |||
18565 | memmove_long_repeat_emit_calcBlockSizeSmall: | ||
18566 | LEAQ (AX)(DI*1), AX | ||
18567 | |||
// Move CX past the 4 bytes already known to match (CX+1 .. CX+4) and measure how
// much further the repeat extends. DI = bytes left in src, R8/BX = the two
// addresses being compared.
18568 | emit_literal_done_repeat_emit_calcBlockSizeSmall: | ||
18569 | ADDL $0x05, CX | ||
18570 | MOVL CX, BX | ||
18571 | SUBL 16(SP), BX | ||
18572 | MOVQ src_len+8(FP), DI | ||
18573 | SUBL CX, DI | ||
18574 | LEAQ (DX)(CX*1), R8 | ||
18575 | LEAQ (DX)(BX*1), BX | ||
18576 | |||
// matchLen: R10 counts equal bytes, compared 16, 8, 4, 2 and 1 bytes at a time.
// On a mismatch in a 64-bit word the index of the lowest set bit of the XOR,
// shifted right by 3, gives the number of equal leading bytes in that word.
18577 | // matchLen | ||
18578 | XORL R10, R10 | ||
18579 | |||
18580 | matchlen_loopback_16_repeat_extend_calcBlockSizeSmall: | ||
18581 | CMPL DI, $0x10 | ||
18582 | JB matchlen_match8_repeat_extend_calcBlockSizeSmall | ||
18583 | MOVQ (R8)(R10*1), R9 | ||
18584 | MOVQ 8(R8)(R10*1), R11 | ||
18585 | XORQ (BX)(R10*1), R9 | ||
18586 | JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall | ||
18587 | XORQ 8(BX)(R10*1), R11 | ||
18588 | JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall | ||
18589 | LEAL -16(DI), DI | ||
18590 | LEAL 16(R10), R10 | ||
18591 | JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall | ||
18592 | |||
18593 | matchlen_bsf_16repeat_extend_calcBlockSizeSmall: | ||
18594 | #ifdef GOAMD64_v3 | ||
18595 | TZCNTQ R11, R11 | ||
18596 | |||
18597 | #else | ||
18598 | BSFQ R11, R11 | ||
18599 | |||
18600 | #endif | ||
18601 | SARQ $0x03, R11 | ||
18602 | LEAL 8(R10)(R11*1), R10 | ||
18603 | JMP repeat_extend_forward_end_calcBlockSizeSmall | ||
18604 | |||
18605 | matchlen_match8_repeat_extend_calcBlockSizeSmall: | ||
18606 | CMPL DI, $0x08 | ||
18607 | JB matchlen_match4_repeat_extend_calcBlockSizeSmall | ||
18608 | MOVQ (R8)(R10*1), R9 | ||
18609 | XORQ (BX)(R10*1), R9 | ||
18610 | JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall | ||
18611 | LEAL -8(DI), DI | ||
18612 | LEAL 8(R10), R10 | ||
18613 | JMP matchlen_match4_repeat_extend_calcBlockSizeSmall | ||
18614 | |||
18615 | matchlen_bsf_8_repeat_extend_calcBlockSizeSmall: | ||
18616 | #ifdef GOAMD64_v3 | ||
18617 | TZCNTQ R9, R9 | ||
18618 | |||
18619 | #else | ||
18620 | BSFQ R9, R9 | ||
18621 | |||
18622 | #endif | ||
18623 | SARQ $0x03, R9 | ||
18624 | LEAL (R10)(R9*1), R10 | ||
18625 | JMP repeat_extend_forward_end_calcBlockSizeSmall | ||
18626 | |||
18627 | matchlen_match4_repeat_extend_calcBlockSizeSmall: | ||
18628 | CMPL DI, $0x04 | ||
18629 | JB matchlen_match2_repeat_extend_calcBlockSizeSmall | ||
18630 | MOVL (R8)(R10*1), R9 | ||
18631 | CMPL (BX)(R10*1), R9 | ||
18632 | JNE matchlen_match2_repeat_extend_calcBlockSizeSmall | ||
18633 | LEAL -4(DI), DI | ||
18634 | LEAL 4(R10), R10 | ||
18635 | |||
18636 | matchlen_match2_repeat_extend_calcBlockSizeSmall: | ||
18637 | CMPL DI, $0x01 | ||
18638 | JE matchlen_match1_repeat_extend_calcBlockSizeSmall | ||
18639 | JB repeat_extend_forward_end_calcBlockSizeSmall | ||
18640 | MOVW (R8)(R10*1), R9 | ||
18641 | CMPW (BX)(R10*1), R9 | ||
18642 | JNE matchlen_match1_repeat_extend_calcBlockSizeSmall | ||
18643 | LEAL 2(R10), R10 | ||
18644 | SUBL $0x02, DI | ||
18645 | JZ repeat_extend_forward_end_calcBlockSizeSmall | ||
18646 | |||
18647 | matchlen_match1_repeat_extend_calcBlockSizeSmall: | ||
18648 | MOVB (R8)(R10*1), R9 | ||
18649 | CMPB (BX)(R10*1), R9 | ||
18650 | JNE repeat_extend_forward_end_calcBlockSizeSmall | ||
18651 | LEAL 1(R10), R10 | ||
18652 | |||
// CX = end of the repeat; BX = CX - SI is its total length measured from the
// (possibly backed-up) start SI.
18653 | repeat_extend_forward_end_calcBlockSizeSmall: | ||
18654 | ADDL R10, CX | ||
18655 | MOVL CX, BX | ||
18656 | SUBL SI, BX | ||
18657 | MOVL 16(SP), SI | ||
18658 | |||
// Copy cost: while the length exceeds 64, charge 3 bytes and take 60 off the
// length; the final piece costs 3 bytes when its length is >= 12, else 2 bytes.
18659 | // emitCopy | ||
18660 | two_byte_offset_repeat_as_copy_calcBlockSizeSmall: | ||
18661 | CMPL BX, $0x40 | ||
18662 | JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall | ||
18663 | LEAL -60(BX), BX | ||
18664 | ADDQ $0x03, AX | ||
18665 | JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall | ||
18666 | |||
18667 | two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: | ||
18668 | MOVL BX, SI | ||
18669 | SHLL $0x02, SI | ||
18670 | CMPL BX, $0x0c | ||
18671 | JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall | ||
18672 | ADDQ $0x02, AX | ||
18673 | JMP repeat_end_emit_calcBlockSizeSmall | ||
18674 | |||
18675 | emit_copy_three_repeat_as_copy_calcBlockSizeSmall: | ||
18676 | ADDQ $0x03, AX | ||
18677 | |||
// Everything up to CX is now accounted for; resume searching from CX.
18678 | repeat_end_emit_calcBlockSizeSmall: | ||
18679 | MOVL CX, 12(SP) | ||
18680 | JMP search_loop_calcBlockSizeSmall | ||
18681 | |||
// No repeat. Try three candidates in turn: table hit BX against the bytes at CX,
// table hit DI against the bytes at CX+1, and the slot for the CX+2 hash (R9)
// against the bytes at CX+2. If none match, continue at the position in 20(SP).
18682 | no_repeat_found_calcBlockSizeSmall: | ||
18683 | CMPL (DX)(BX*1), SI | ||
18684 | JEQ candidate_match_calcBlockSizeSmall | ||
18685 | SHRQ $0x08, SI | ||
18686 | MOVL 24(SP)(R9*4), BX | ||
18687 | LEAL 2(CX), R8 | ||
18688 | CMPL (DX)(DI*1), SI | ||
18689 | JEQ candidate2_match_calcBlockSizeSmall | ||
18690 | MOVL R8, 24(SP)(R9*4) | ||
18691 | SHRQ $0x08, SI | ||
18692 | CMPL (DX)(BX*1), SI | ||
18693 | JEQ candidate3_match_calcBlockSizeSmall | ||
18694 | MOVL 20(SP), CX | ||
18695 | JMP search_loop_calcBlockSizeSmall | ||
18696 | |||
18697 | candidate3_match_calcBlockSizeSmall: | ||
18698 | ADDL $0x02, CX | ||
18699 | JMP candidate_match_calcBlockSizeSmall | ||
18700 | |||
18701 | candidate2_match_calcBlockSizeSmall: | ||
18702 | MOVL R8, 24(SP)(R9*4) | ||
18703 | INCL CX | ||
18704 | MOVL DI, BX | ||
18705 | |||
// Here CX is the match position and BX the earlier candidate position. Extend the
// match backwards while CX stays above 12(SP), the preceding bytes are equal, and
// BX has not reached 0.
18706 | candidate_match_calcBlockSizeSmall: | ||
18707 | MOVL 12(SP), SI | ||
18708 | TESTL BX, BX | ||
18709 | JZ match_extend_back_end_calcBlockSizeSmall | ||
18710 | |||
18711 | match_extend_back_loop_calcBlockSizeSmall: | ||
18712 | CMPL CX, SI | ||
18713 | JBE match_extend_back_end_calcBlockSizeSmall | ||
18714 | MOVB -1(DX)(BX*1), DI | ||
18715 | MOVB -1(DX)(CX*1), R8 | ||
18716 | CMPB DI, R8 | ||
18717 | JNE match_extend_back_end_calcBlockSizeSmall | ||
18718 | LEAL -1(CX), CX | ||
18719 | DECL BX | ||
18720 | JZ match_extend_back_end_calcBlockSizeSmall | ||
18721 | JMP match_extend_back_loop_calcBlockSizeSmall | ||
18722 | |||
// Limit check before counting the pending literals: if AX + literal length + 3
// is not below the limit at (SP), give up and return 0.
18723 | match_extend_back_end_calcBlockSizeSmall: | ||
18724 | MOVL CX, SI | ||
18725 | SUBL 12(SP), SI | ||
18726 | LEAQ 3(AX)(SI*1), SI | ||
18727 | CMPQ SI, (SP) | ||
18728 | JB match_dst_size_check_calcBlockSizeSmall | ||
18729 | MOVQ $0x00000000, ret+24(FP) | ||
18730 | RET | ||
18731 | |||
// Account for the literal run [12(SP), CX) with the same 1/2/3-byte header rule
// as above; R8 = run length. The LEAQ into SI is overwritten by the next LEAL.
18732 | match_dst_size_check_calcBlockSizeSmall: | ||
18733 | MOVL CX, SI | ||
18734 | MOVL 12(SP), DI | ||
18735 | CMPL DI, SI | ||
18736 | JEQ emit_literal_done_match_emit_calcBlockSizeSmall | ||
18737 | MOVL SI, R8 | ||
18738 | MOVL SI, 12(SP) | ||
18739 | LEAQ (DX)(DI*1), SI | ||
18740 | SUBL DI, R8 | ||
18741 | LEAL -1(R8), SI | ||
18742 | CMPL SI, $0x3c | ||
18743 | JB one_byte_match_emit_calcBlockSizeSmall | ||
18744 | CMPL SI, $0x00000100 | ||
18745 | JB two_bytes_match_emit_calcBlockSizeSmall | ||
// Same flags, and the target is the fall-through label: three_bytes_* is reached
// either way.
18746 | JB three_bytes_match_emit_calcBlockSizeSmall | ||
18747 | |||
18748 | three_bytes_match_emit_calcBlockSizeSmall: | ||
18749 | ADDQ $0x03, AX | ||
18750 | JMP memmove_long_match_emit_calcBlockSizeSmall | ||
18751 | |||
18752 | two_bytes_match_emit_calcBlockSizeSmall: | ||
18753 | ADDQ $0x02, AX | ||
18754 | CMPL SI, $0x40 | ||
18755 | JB memmove_match_emit_calcBlockSizeSmall | ||
18756 | JMP memmove_long_match_emit_calcBlockSizeSmall | ||
18757 | |||
18758 | one_byte_match_emit_calcBlockSizeSmall: | ||
18759 | ADDQ $0x01, AX | ||
18760 | |||
18761 | memmove_match_emit_calcBlockSizeSmall: | ||
18762 | LEAQ (AX)(R8*1), AX | ||
18763 | JMP emit_literal_done_match_emit_calcBlockSizeSmall | ||
18764 | |||
18765 | memmove_long_match_emit_calcBlockSizeSmall: | ||
18766 | LEAQ (AX)(R8*1), AX | ||
18767 | |||
// Match with no pending literals. Record the offset CX - BX in 16(SP), step both
// positions past the 4 bytes already compared, and measure the rest of the match.
// SI = bytes left in src, DI/BX = the two addresses being compared.
18768 | emit_literal_done_match_emit_calcBlockSizeSmall: | ||
18769 | match_nolit_loop_calcBlockSizeSmall: | ||
18770 | MOVL CX, SI | ||
18771 | SUBL BX, SI | ||
18772 | MOVL SI, 16(SP) | ||
18773 | ADDL $0x04, CX | ||
18774 | ADDL $0x04, BX | ||
18775 | MOVQ src_len+8(FP), SI | ||
18776 | SUBL CX, SI | ||
18777 | LEAQ (DX)(CX*1), DI | ||
18778 | LEAQ (DX)(BX*1), BX | ||
18779 | |||
// matchLen: same scheme as in the repeat path, with the count kept in R9.
18780 | // matchLen | ||
18781 | XORL R9, R9 | ||
18782 | |||
18783 | matchlen_loopback_16_match_nolit_calcBlockSizeSmall: | ||
18784 | CMPL SI, $0x10 | ||
18785 | JB matchlen_match8_match_nolit_calcBlockSizeSmall | ||
18786 | MOVQ (DI)(R9*1), R8 | ||
18787 | MOVQ 8(DI)(R9*1), R10 | ||
18788 | XORQ (BX)(R9*1), R8 | ||
18789 | JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall | ||
18790 | XORQ 8(BX)(R9*1), R10 | ||
18791 | JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall | ||
18792 | LEAL -16(SI), SI | ||
18793 | LEAL 16(R9), R9 | ||
18794 | JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall | ||
18795 | |||
18796 | matchlen_bsf_16match_nolit_calcBlockSizeSmall: | ||
18797 | #ifdef GOAMD64_v3 | ||
18798 | TZCNTQ R10, R10 | ||
18799 | |||
18800 | #else | ||
18801 | BSFQ R10, R10 | ||
18802 | |||
18803 | #endif | ||
18804 | SARQ $0x03, R10 | ||
18805 | LEAL 8(R9)(R10*1), R9 | ||
18806 | JMP match_nolit_end_calcBlockSizeSmall | ||
18807 | |||
18808 | matchlen_match8_match_nolit_calcBlockSizeSmall: | ||
18809 | CMPL SI, $0x08 | ||
18810 | JB matchlen_match4_match_nolit_calcBlockSizeSmall | ||
18811 | MOVQ (DI)(R9*1), R8 | ||
18812 | XORQ (BX)(R9*1), R8 | ||
18813 | JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall | ||
18814 | LEAL -8(SI), SI | ||
18815 | LEAL 8(R9), R9 | ||
18816 | JMP matchlen_match4_match_nolit_calcBlockSizeSmall | ||
18817 | |||
18818 | matchlen_bsf_8_match_nolit_calcBlockSizeSmall: | ||
18819 | #ifdef GOAMD64_v3 | ||
18820 | TZCNTQ R8, R8 | ||
18821 | |||
18822 | #else | ||
18823 | BSFQ R8, R8 | ||
18824 | |||
18825 | #endif | ||
18826 | SARQ $0x03, R8 | ||
18827 | LEAL (R9)(R8*1), R9 | ||
18828 | JMP match_nolit_end_calcBlockSizeSmall | ||
18829 | |||
18830 | matchlen_match4_match_nolit_calcBlockSizeSmall: | ||
18831 | CMPL SI, $0x04 | ||
18832 | JB matchlen_match2_match_nolit_calcBlockSizeSmall | ||
18833 | MOVL (DI)(R9*1), R8 | ||
18834 | CMPL (BX)(R9*1), R8 | ||
18835 | JNE matchlen_match2_match_nolit_calcBlockSizeSmall | ||
18836 | LEAL -4(SI), SI | ||
18837 | LEAL 4(R9), R9 | ||
18838 | |||
18839 | matchlen_match2_match_nolit_calcBlockSizeSmall: | ||
18840 | CMPL SI, $0x01 | ||
18841 | JE matchlen_match1_match_nolit_calcBlockSizeSmall | ||
18842 | JB match_nolit_end_calcBlockSizeSmall | ||
18843 | MOVW (DI)(R9*1), R8 | ||
18844 | CMPW (BX)(R9*1), R8 | ||
18845 | JNE matchlen_match1_match_nolit_calcBlockSizeSmall | ||
18846 | LEAL 2(R9), R9 | ||
18847 | SUBL $0x02, SI | ||
18848 | JZ match_nolit_end_calcBlockSizeSmall | ||
18849 | |||
18850 | matchlen_match1_match_nolit_calcBlockSizeSmall: | ||
18851 | MOVB (DI)(R9*1), R8 | ||
18852 | CMPB (BX)(R9*1), R8 | ||
18853 | JNE match_nolit_end_calcBlockSizeSmall | ||
18854 | LEAL 1(R9), R9 | ||
18855 | |||
// CX = end of the match; R9 + 4 = total match length (the first 4 bytes were
// skipped before matchLen). Everything up to CX is now accounted for.
18856 | match_nolit_end_calcBlockSizeSmall: | ||
18857 | ADDL R9, CX | ||
18858 | MOVL 16(SP), BX | ||
18859 | ADDL $0x04, R9 | ||
18860 | MOVL CX, 12(SP) | ||
18861 | |||
// Copy cost, same rule as in the repeat path: 3 bytes per 60 while the length
// exceeds 64, then 3 bytes if the remainder is >= 12, else 2 bytes.
18862 | // emitCopy | ||
18863 | two_byte_offset_match_nolit_calcBlockSizeSmall: | ||
18864 | CMPL R9, $0x40 | ||
18865 | JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall | ||
18866 | LEAL -60(R9), R9 | ||
18867 | ADDQ $0x03, AX | ||
18868 | JMP two_byte_offset_match_nolit_calcBlockSizeSmall | ||
18869 | |||
18870 | two_byte_offset_short_match_nolit_calcBlockSizeSmall: | ||
18871 | MOVL R9, BX | ||
18872 | SHLL $0x02, BX | ||
18873 | CMPL R9, $0x0c | ||
18874 | JAE emit_copy_three_match_nolit_calcBlockSizeSmall | ||
18875 | ADDQ $0x02, AX | ||
18876 | JMP match_nolit_emitcopy_end_calcBlockSizeSmall | ||
18877 | |||
18878 | emit_copy_three_match_nolit_calcBlockSizeSmall: | ||
18879 | ADDQ $0x03, AX | ||
18880 | |||
// Stop searching once CX reaches 8(SP). Otherwise load the 8 bytes at CX-2 and
// return 0 if the running total in AX has reached the limit at (SP).
18881 | match_nolit_emitcopy_end_calcBlockSizeSmall: | ||
18882 | CMPL CX, 8(SP) | ||
18883 | JAE emit_remainder_calcBlockSizeSmall | ||
18884 | MOVQ -2(DX)(CX*1), SI | ||
18885 | CMPQ AX, (SP) | ||
18886 | JB match_nolit_dst_ok_calcBlockSizeSmall | ||
18887 | MOVQ $0x00000000, ret+24(FP) | ||
18888 | RET | ||
18889 | |||
// Hash the bytes at CX-2 (DI) and at CX (BX). Store CX-2 in the first slot; read
// the old occupant of the second slot into BX and replace it with CX. If the
// 4 bytes at that old position equal the 4 bytes at CX, there is an immediate
// follow-on match with no literals in between; otherwise advance one byte and
// go back to searching.
18890 | match_nolit_dst_ok_calcBlockSizeSmall: | ||
18891 | MOVQ $0x9e3779b1, R8 | ||
18892 | MOVQ SI, DI | ||
18893 | SHRQ $0x10, SI | ||
18894 | MOVQ SI, BX | ||
18895 | SHLQ $0x20, DI | ||
18896 | IMULQ R8, DI | ||
18897 | SHRQ $0x37, DI | ||
18898 | SHLQ $0x20, BX | ||
18899 | IMULQ R8, BX | ||
18900 | SHRQ $0x37, BX | ||
18901 | LEAL -2(CX), R8 | ||
18902 | LEAQ 24(SP)(BX*4), R9 | ||
18903 | MOVL (R9), BX | ||
18904 | MOVL R8, 24(SP)(DI*4) | ||
18905 | MOVL CX, (R9) | ||
18906 | CMPL (DX)(BX*1), SI | ||
18907 | JEQ match_nolit_loop_calcBlockSizeSmall | ||
18908 | INCL CX | ||
18909 | JMP search_loop_calcBlockSizeSmall | ||
18910 | |||
// Tail: the bytes from 12(SP) to the end of src form one last literal run.
// Return 0 if AX + that length + 3 is not below the limit at (SP).
18911 | emit_remainder_calcBlockSizeSmall: | ||
18912 | MOVQ src_len+8(FP), CX | ||
18913 | SUBL 12(SP), CX | ||
18914 | LEAQ 3(AX)(CX*1), CX | ||
18915 | CMPQ CX, (SP) | ||
18916 | JB emit_remainder_ok_calcBlockSizeSmall | ||
18917 | MOVQ $0x00000000, ret+24(FP) | ||
18918 | RET | ||
18919 | |||
// Account for the final literal run (SI = length, CX = length - 1) using the
// same 1/2/3-byte header rule. The LEAQ into CX is overwritten by the next LEAL.
18920 | emit_remainder_ok_calcBlockSizeSmall: | ||
18921 | MOVQ src_len+8(FP), CX | ||
18922 | MOVL 12(SP), BX | ||
18923 | CMPL BX, CX | ||
18924 | JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall | ||
18925 | MOVL CX, SI | ||
18926 | MOVL CX, 12(SP) | ||
18927 | LEAQ (DX)(BX*1), CX | ||
18928 | SUBL BX, SI | ||
18929 | LEAL -1(SI), CX | ||
18930 | CMPL CX, $0x3c | ||
18931 | JB one_byte_emit_remainder_calcBlockSizeSmall | ||
18932 | CMPL CX, $0x00000100 | ||
18933 | JB two_bytes_emit_remainder_calcBlockSizeSmall | ||
// Same flags, and the target is the fall-through label: three_bytes_* is reached
// either way.
18934 | JB three_bytes_emit_remainder_calcBlockSizeSmall | ||
18935 | |||
18936 | three_bytes_emit_remainder_calcBlockSizeSmall: | ||
18937 | ADDQ $0x03, AX | ||
18938 | JMP memmove_long_emit_remainder_calcBlockSizeSmall | ||
18939 | |||
18940 | two_bytes_emit_remainder_calcBlockSizeSmall: | ||
18941 | ADDQ $0x02, AX | ||
18942 | CMPL CX, $0x40 | ||
18943 | JB memmove_emit_remainder_calcBlockSizeSmall | ||
18944 | JMP memmove_long_emit_remainder_calcBlockSizeSmall | ||
18945 | |||
18946 | one_byte_emit_remainder_calcBlockSizeSmall: | ||
18947 | ADDQ $0x01, AX | ||
18948 | |||
18949 | memmove_emit_remainder_calcBlockSizeSmall: | ||
18950 | LEAQ (AX)(SI*1), AX | ||
18951 | JMP emit_literal_done_emit_remainder_calcBlockSizeSmall | ||
18952 | |||
18953 | memmove_long_emit_remainder_calcBlockSizeSmall: | ||
18954 | LEAQ (AX)(SI*1), AX | ||
18955 | |||
// Return the accumulated size.
18956 | emit_literal_done_emit_remainder_calcBlockSizeSmall: | ||
18957 | MOVQ AX, ret+24(FP) | ||
18958 | RET | ||
18959 | |||
// emitLiteral writes a length header followed by the lit bytes to dst and returns
// the total number of bytes written (header + len(lit)). An empty lit writes
// nothing and returns 0.
//
// Register use: AX = write pointer into dst, CX = read pointer into lit,
// DX = len(lit), BX = return value, SI = len(lit) - 1.
//
// dst_len and dst_cap are never read here, so the routine does no bounds checking
// of its own.
// NOTE(review): callers presumably guarantee dst has room for header + literal,
// possibly with slack for the wide stores below — confirm at the call sites.
18960 | // func emitLiteral(dst []byte, lit []byte) int | ||
18961 | // Requires: SSE2 | ||
18962 | TEXT ·emitLiteral(SB), NOSPLIT, $0-56 | ||
18963 | MOVQ lit_len+32(FP), DX | ||
18964 | MOVQ dst_base+0(FP), AX | ||
18965 | MOVQ lit_base+24(FP), CX | ||
18966 | TESTQ DX, DX | ||
18967 | JZ emit_literal_end_standalone_skip | ||
// Pick the header size from SI = len - 1: below 60 it fits in the tag byte itself;
// below 1<<8, 1<<16 and 1<<24 it takes 1, 2 and 3 extra bytes respectively;
// anything larger takes 4 extra bytes (tag 0xfc followed by a 32-bit store).
18968 | MOVL DX, BX | ||
18969 | LEAL -1(DX), SI | ||
18970 | CMPL SI, $0x3c | ||
18971 | JB one_byte_standalone | ||
18972 | CMPL SI, $0x00000100 | ||
18973 | JB two_bytes_standalone | ||
18974 | CMPL SI, $0x00010000 | ||
18975 | JB three_bytes_standalone | ||
18976 | CMPL SI, $0x01000000 | ||
18977 | JB four_bytes_standalone | ||
18978 | MOVB $0xfc, (AX) | ||
18979 | MOVL SI, 1(AX) | ||
18980 | ADDQ $0x05, BX | ||
18981 | ADDQ $0x05, AX | ||
18982 | JMP memmove_long_standalone | ||
18983 | |||
// Tag 0xf8 followed by the low 24 bits of len - 1 (16-bit store, then bits 16..23).
18984 | four_bytes_standalone: | ||
18985 | MOVL SI, DI | ||
18986 | SHRL $0x10, DI | ||
18987 | MOVB $0xf8, (AX) | ||
18988 | MOVW SI, 1(AX) | ||
18989 | MOVB DI, 3(AX) | ||
18990 | ADDQ $0x04, BX | ||
18991 | ADDQ $0x04, AX | ||
18992 | JMP memmove_long_standalone | ||
18993 | |||
// Tag 0xf4 followed by len - 1 as a 16-bit value.
18994 | three_bytes_standalone: | ||
18995 | MOVB $0xf4, (AX) | ||
18996 | MOVW SI, 1(AX) | ||
18997 | ADDQ $0x03, BX | ||
18998 | ADDQ $0x03, AX | ||
18999 | JMP memmove_long_standalone | ||
19000 | |||
// Tag 0xf0 followed by len - 1 as a single byte. Lengths with len - 1 < 64 still
// use the short copy; longer ones use the long copy.
19001 | two_bytes_standalone: | ||
19002 | MOVB $0xf0, (AX) | ||
19003 | MOVB SI, 1(AX) | ||
19004 | ADDQ $0x02, BX | ||
19005 | ADDQ $0x02, AX | ||
19006 | CMPL SI, $0x40 | ||
19007 | JB memmove_standalone | ||
19008 | JMP memmove_long_standalone | ||
19009 | |||
// Single tag byte: (len - 1) << 2.
19010 | one_byte_standalone: | ||
19011 | SHLB $0x02, SI | ||
19012 | MOVB SI, (AX) | ||
19013 | ADDQ $0x01, BX | ||
19014 | ADDQ $0x01, AX | ||
19015 | |||
// Short copy of DX bytes from CX to AX, dispatched on size class. Each case loads
// a chunk from the start and a chunk ending at the last byte, then stores both;
// the two chunks may overlap, which is how one case covers a range of lengths.
19016 | memmove_standalone: | ||
19017 | // genMemMoveShort | ||
19018 | CMPQ DX, $0x03 | ||
19019 | JB emit_lit_memmove_standalone_memmove_move_1or2 | ||
19020 | JE emit_lit_memmove_standalone_memmove_move_3 | ||
19021 | CMPQ DX, $0x08 | ||
19022 | JB emit_lit_memmove_standalone_memmove_move_4through7 | ||
19023 | CMPQ DX, $0x10 | ||
19024 | JBE emit_lit_memmove_standalone_memmove_move_8through16 | ||
19025 | CMPQ DX, $0x20 | ||
19026 | JBE emit_lit_memmove_standalone_memmove_move_17through32 | ||
19027 | JMP emit_lit_memmove_standalone_memmove_move_33through64 | ||
19028 | |||
19029 | emit_lit_memmove_standalone_memmove_move_1or2: | ||
19030 | MOVB (CX), SI | ||
19031 | MOVB -1(CX)(DX*1), CL | ||
19032 | MOVB SI, (AX) | ||
19033 | MOVB CL, -1(AX)(DX*1) | ||
19034 | JMP emit_literal_end_standalone | ||
19035 | |||
19036 | emit_lit_memmove_standalone_memmove_move_3: | ||
19037 | MOVW (CX), SI | ||
19038 | MOVB 2(CX), CL | ||
19039 | MOVW SI, (AX) | ||
19040 | MOVB CL, 2(AX) | ||
19041 | JMP emit_literal_end_standalone | ||
19042 | |||
19043 | emit_lit_memmove_standalone_memmove_move_4through7: | ||
19044 | MOVL (CX), SI | ||
19045 | MOVL -4(CX)(DX*1), CX | ||
19046 | MOVL SI, (AX) | ||
19047 | MOVL CX, -4(AX)(DX*1) | ||
19048 | JMP emit_literal_end_standalone | ||
19049 | |||
19050 | emit_lit_memmove_standalone_memmove_move_8through16: | ||
19051 | MOVQ (CX), SI | ||
19052 | MOVQ -8(CX)(DX*1), CX | ||
19053 | MOVQ SI, (AX) | ||
19054 | MOVQ CX, -8(AX)(DX*1) | ||
19055 | JMP emit_literal_end_standalone | ||
19056 | |||
19057 | emit_lit_memmove_standalone_memmove_move_17through32: | ||
19058 | MOVOU (CX), X0 | ||
19059 | MOVOU -16(CX)(DX*1), X1 | ||
19060 | MOVOU X0, (AX) | ||
19061 | MOVOU X1, -16(AX)(DX*1) | ||
19062 | JMP emit_literal_end_standalone | ||
19063 | |||
19064 | emit_lit_memmove_standalone_memmove_move_33through64: | ||
19065 | MOVOU (CX), X0 | ||
19066 | MOVOU 16(CX), X1 | ||
19067 | MOVOU -32(CX)(DX*1), X2 | ||
19068 | MOVOU -16(CX)(DX*1), X3 | ||
19069 | MOVOU X0, (AX) | ||
19070 | MOVOU X1, 16(AX) | ||
19071 | MOVOU X2, -32(AX)(DX*1) | ||
19072 | MOVOU X3, -16(AX)(DX*1) | ||
19073 | JMP emit_literal_end_standalone | ||
// The second JMP is unreachable: it follows an unconditional jump and has no label.
19074 | JMP emit_literal_end_standalone | ||
19075 | |||
// Long copy. The first and last 32 bytes of lit are loaded into X0..X3 up front
// and stored only after the loops. R8 = 64 - (AX & 31), so AX + R8 - 32 is a
// multiple of 32; the MOVOA stores in both loops address dst from that point.
// DI = DX/32 is the 32-byte block count used by the loop control below.
// NOTE(review): DECQ leaves CF untouched, so the JA/JNA branches here also depend
// on the carry left by the preceding SUBQ/ADDQ — confirm the intended iteration
// counts against the generator before changing anything in this section.
19076 | memmove_long_standalone: | ||
19077 | // genMemMoveLong | ||
19078 | MOVOU (CX), X0 | ||
19079 | MOVOU 16(CX), X1 | ||
19080 | MOVOU -32(CX)(DX*1), X2 | ||
19081 | MOVOU -16(CX)(DX*1), X3 | ||
19082 | MOVQ DX, DI | ||
19083 | SHRQ $0x05, DI | ||
19084 | MOVQ AX, SI | ||
19085 | ANDL $0x0000001f, SI | ||
19086 | MOVQ $0x00000040, R8 | ||
19087 | SUBQ SI, R8 | ||
19088 | DECQ DI | ||
19089 | JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 | ||
19090 | LEAQ -32(CX)(R8*1), SI | ||
19091 | LEAQ -32(AX)(R8*1), R9 | ||
19092 | |||
19093 | emit_lit_memmove_long_standalonelarge_big_loop_back: | ||
19094 | MOVOU (SI), X4 | ||
19095 | MOVOU 16(SI), X5 | ||
19096 | MOVOA X4, (R9) | ||
19097 | MOVOA X5, 16(R9) | ||
19098 | ADDQ $0x20, R9 | ||
19099 | ADDQ $0x20, SI | ||
19100 | ADDQ $0x20, R8 | ||
19101 | DECQ DI | ||
19102 | JNA emit_lit_memmove_long_standalonelarge_big_loop_back | ||
19103 | |||
// Copy 32 bytes per iteration at offset R8 - 32 while DX >= R8 after the
// increment, then write the saved head (X0, X1) and tail (X2, X3) blocks.
19104 | emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: | ||
19105 | MOVOU -32(CX)(R8*1), X4 | ||
19106 | MOVOU -16(CX)(R8*1), X5 | ||
19107 | MOVOA X4, -32(AX)(R8*1) | ||
19108 | MOVOA X5, -16(AX)(R8*1) | ||
19109 | ADDQ $0x20, R8 | ||
19110 | CMPQ DX, R8 | ||
19111 | JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 | ||
19112 | MOVOU X0, (AX) | ||
19113 | MOVOU X1, 16(AX) | ||
19114 | MOVOU X2, -32(AX)(DX*1) | ||
19115 | MOVOU X3, -16(AX)(DX*1) | ||
19116 | JMP emit_literal_end_standalone | ||
// The second JMP is unreachable: it follows an unconditional jump and has no label.
19117 | JMP emit_literal_end_standalone | ||
19118 | |||
// Empty literal: return 0.
19119 | emit_literal_end_standalone_skip: | ||
19120 | XORQ BX, BX | ||
19121 | |||
// BX = header size + len(lit), or 0 when lit was empty.
19122 | emit_literal_end_standalone: | ||
19123 | MOVQ BX, ret+48(FP) | ||
19124 | RET | ||
19125 | |||
19126 | // func emitRepeat(dst []byte, offset int, length int) int | ||
// emitRepeat writes a 2- to 5-byte repeat encoding of `length` at dst and
// stores the number of bytes written in ret. Very large lengths are emitted
// as a run of 5-byte chunks followed by one final encoding.
// Registers: AX = write cursor in dst, BX = bytes written so far,
// CX = offset, DX = length (reduced as it is consumed), SI = scratch.
// Only dst_base is loaded; dst_len/dst_cap are never read, so this routine
// performs no bounds check on dst (presumably the caller sizes dst - verify).
19127 | TEXT ·emitRepeat(SB), NOSPLIT, $0-48 | ||
19128 | XORQ BX, BX | ||
19129 | MOVQ dst_base+0(FP), AX | ||
19130 | MOVQ offset+24(FP), CX | ||
19131 | MOVQ length+32(FP), DX | ||
19132 | |||
19133 | // emitRepeat | ||
// SI keeps the incoming length for the two short-form tests below, while DX
// becomes length-4 and is the value that gets encoded.
19134 | emit_repeat_again_standalone: | ||
19135 | MOVL DX, SI | ||
19136 | LEAL -4(DX), DX | ||
19137 | CMPL SI, $0x08 | ||
19138 | JBE repeat_two_standalone | ||
19139 | CMPL SI, $0x0c | ||
19140 | JAE cant_repeat_two_offset_standalone | ||
19141 | CMPL CX, $0x00000800 | ||
19142 | JB repeat_two_offset_standalone | ||
19143 | |||
// Pick the encoding width from DX (= length-4): < 0x104 -> 3 bytes,
// < 0x10100 -> 4 bytes, < 0x0100ffff -> 5 bytes. Otherwise write the fixed
// 5 bytes 1d 00 fb ff ff, subtract 16842747 (0x0100fffb) from DX and start
// over with the remainder as the new length.
19144 | cant_repeat_two_offset_standalone: | ||
19145 | CMPL DX, $0x00000104 | ||
19146 | JB repeat_three_standalone | ||
19147 | CMPL DX, $0x00010100 | ||
19148 | JB repeat_four_standalone | ||
19149 | CMPL DX, $0x0100ffff | ||
19150 | JB repeat_five_standalone | ||
19151 | LEAL -16842747(DX), DX | ||
19152 | MOVL $0xfffb001d, (AX) | ||
19153 | MOVB $0xff, 4(AX) | ||
19154 | ADDQ $0x05, AX | ||
19155 | ADDQ $0x05, BX | ||
19156 | JMP emit_repeat_again_standalone | ||
19157 | |||
// 5 bytes: word 0x001d, then the low 24 bits of (DX - 65536) as a word plus
// one byte. CX (offset) is reused as scratch for the top byte.
19158 | repeat_five_standalone: | ||
19159 | LEAL -65536(DX), DX | ||
19160 | MOVL DX, CX | ||
19161 | MOVW $0x001d, (AX) | ||
19162 | MOVW DX, 2(AX) | ||
19163 | SARL $0x10, CX | ||
19164 | MOVB CL, 4(AX) | ||
19165 | ADDQ $0x05, BX | ||
19166 | ADDQ $0x05, AX | ||
19167 | JMP gen_emit_repeat_end | ||
19168 | |||
// 4 bytes: word 0x0019, then (DX - 256) as a 16-bit word.
19169 | repeat_four_standalone: | ||
19170 | LEAL -256(DX), DX | ||
19171 | MOVW $0x0019, (AX) | ||
19172 | MOVW DX, 2(AX) | ||
19173 | ADDQ $0x04, BX | ||
19174 | ADDQ $0x04, AX | ||
19175 | JMP gen_emit_repeat_end | ||
19176 | |||
// 3 bytes: word 0x0015, then (DX - 4) as a single byte.
19177 | repeat_three_standalone: | ||
19178 | LEAL -4(DX), DX | ||
19179 | MOVW $0x0015, (AX) | ||
19180 | MOVB DL, 2(AX) | ||
19181 | ADDQ $0x03, BX | ||
19182 | ADDQ $0x03, AX | ||
19183 | JMP gen_emit_repeat_end | ||
19184 | |||
// 2 bytes (length <= 8): the 16-bit word (DX << 2) | 1.
19185 | repeat_two_standalone: | ||
19186 | SHLL $0x02, DX | ||
19187 | ORL $0x01, DX | ||
19188 | MOVW DX, (AX) | ||
19189 | ADDQ $0x02, BX | ||
19190 | ADDQ $0x02, AX | ||
19191 | JMP gen_emit_repeat_end | ||
19192 | |||
// 2 bytes (8 < length < 12 and offset < 2048): first byte is
// 1 + DX*4 OR'ed with (offset >> 8) << 5, second byte is the low offset byte.
19193 | repeat_two_offset_standalone: | ||
19194 | XORQ SI, SI | ||
19195 | LEAL 1(SI)(DX*4), DX | ||
19196 | MOVB CL, 1(AX) | ||
19197 | SARL $0x08, CX | ||
19198 | SHLL $0x05, CX | ||
19199 | ORL CX, DX | ||
19200 | MOVB DL, (AX) | ||
19201 | ADDQ $0x02, BX | ||
19202 | ADDQ $0x02, AX | ||
19203 | |||
// Common exit: return the byte count accumulated in BX.
19204 | gen_emit_repeat_end: | ||
19205 | MOVQ BX, ret+40(FP) | ||
19206 | RET | ||
19207 | |||
19208 | // func emitCopy(dst []byte, offset int, length int) int | ||
// emitCopy writes the encoding of a copy of `length` bytes at `offset` to dst
// and stores the number of bytes written in ret. After a first copy tag has
// been written for a long match, the remaining length is encoded with an
// inlined copy of the emitRepeat sequence.
// Registers: AX = write cursor in dst, BX = bytes written so far,
// CX = offset, DX = remaining length, SI/DI = scratch.
// Only dst_base is loaded; dst_len/dst_cap are never read, so no bounds check
// is made on dst here (presumably the caller sizes dst - verify).
19209 | TEXT ·emitCopy(SB), NOSPLIT, $0-48 | ||
19210 | XORQ BX, BX | ||
19211 | MOVQ dst_base+0(FP), AX | ||
19212 | MOVQ offset+24(FP), CX | ||
19213 | MOVQ length+32(FP), DX | ||
19214 | |||
19215 | // emitCopy | ||
// Offsets below 65536 take the two-byte-offset paths further down. For larger
// offsets with length > 64: write tag 0xff plus the 32-bit offset (5 bytes),
// subtract 64 from the length, and encode what is left either as one more
// 5-byte copy (when < 4 remain) or through the repeat sequence.
19216 | CMPL CX, $0x00010000 | ||
19217 | JB two_byte_offset_standalone | ||
19218 | CMPL DX, $0x40 | ||
19219 | JBE four_bytes_remain_standalone | ||
19220 | MOVB $0xff, (AX) | ||
19221 | MOVL CX, 1(AX) | ||
19222 | LEAL -64(DX), DX | ||
19223 | ADDQ $0x05, BX | ||
19224 | ADDQ $0x05, AX | ||
19225 | CMPL DX, $0x04 | ||
19226 | JB four_bytes_remain_standalone | ||
19227 | |||
19228 | // emitRepeat | ||
// Inlined repeat encoder for the remainder: SI = remaining length,
// DX = remaining length - 4. Every terminal case jumps to gen_emit_copy_end.
19229 | emit_repeat_again_standalone_emit_copy: | ||
19230 | MOVL DX, SI | ||
19231 | LEAL -4(DX), DX | ||
19232 | CMPL SI, $0x08 | ||
19233 | JBE repeat_two_standalone_emit_copy | ||
19234 | CMPL SI, $0x0c | ||
19235 | JAE cant_repeat_two_offset_standalone_emit_copy | ||
19236 | CMPL CX, $0x00000800 | ||
19237 | JB repeat_two_offset_standalone_emit_copy | ||
19238 | |||
// Width selection on DX: < 0x104 -> 3 bytes, < 0x10100 -> 4 bytes,
// < 0x0100ffff -> 5 bytes; otherwise emit the fixed 5 bytes 1d 00 fb ff ff,
// subtract 16842747 and loop.
19239 | cant_repeat_two_offset_standalone_emit_copy: | ||
19240 | CMPL DX, $0x00000104 | ||
19241 | JB repeat_three_standalone_emit_copy | ||
19242 | CMPL DX, $0x00010100 | ||
19243 | JB repeat_four_standalone_emit_copy | ||
19244 | CMPL DX, $0x0100ffff | ||
19245 | JB repeat_five_standalone_emit_copy | ||
19246 | LEAL -16842747(DX), DX | ||
19247 | MOVL $0xfffb001d, (AX) | ||
19248 | MOVB $0xff, 4(AX) | ||
19249 | ADDQ $0x05, AX | ||
19250 | ADDQ $0x05, BX | ||
19251 | JMP emit_repeat_again_standalone_emit_copy | ||
19252 | |||
// 5 bytes: word 0x001d + low 24 bits of (DX - 65536); CX is used as scratch.
19253 | repeat_five_standalone_emit_copy: | ||
19254 | LEAL -65536(DX), DX | ||
19255 | MOVL DX, CX | ||
19256 | MOVW $0x001d, (AX) | ||
19257 | MOVW DX, 2(AX) | ||
19258 | SARL $0x10, CX | ||
19259 | MOVB CL, 4(AX) | ||
19260 | ADDQ $0x05, BX | ||
19261 | ADDQ $0x05, AX | ||
19262 | JMP gen_emit_copy_end | ||
19263 | |||
// 4 bytes: word 0x0019 + 16-bit (DX - 256).
19264 | repeat_four_standalone_emit_copy: | ||
19265 | LEAL -256(DX), DX | ||
19266 | MOVW $0x0019, (AX) | ||
19267 | MOVW DX, 2(AX) | ||
19268 | ADDQ $0x04, BX | ||
19269 | ADDQ $0x04, AX | ||
19270 | JMP gen_emit_copy_end | ||
19271 | |||
// 3 bytes: word 0x0015 + 8-bit (DX - 4).
19272 | repeat_three_standalone_emit_copy: | ||
19273 | LEAL -4(DX), DX | ||
19274 | MOVW $0x0015, (AX) | ||
19275 | MOVB DL, 2(AX) | ||
19276 | ADDQ $0x03, BX | ||
19277 | ADDQ $0x03, AX | ||
19278 | JMP gen_emit_copy_end | ||
19279 | |||
// 2 bytes: the 16-bit word (DX << 2) | 1.
19280 | repeat_two_standalone_emit_copy: | ||
19281 | SHLL $0x02, DX | ||
19282 | ORL $0x01, DX | ||
19283 | MOVW DX, (AX) | ||
19284 | ADDQ $0x02, BX | ||
19285 | ADDQ $0x02, AX | ||
19286 | JMP gen_emit_copy_end | ||
19287 | |||
// 2 bytes: (1 + DX*4) | ((offset >> 8) << 5), then the low offset byte.
19288 | repeat_two_offset_standalone_emit_copy: | ||
19289 | XORQ SI, SI | ||
19290 | LEAL 1(SI)(DX*4), DX | ||
19291 | MOVB CL, 1(AX) | ||
19292 | SARL $0x08, CX | ||
19293 | SHLL $0x05, CX | ||
19294 | ORL CX, DX | ||
19295 | MOVB DL, (AX) | ||
19296 | ADDQ $0x02, BX | ||
19297 | ADDQ $0x02, AX | ||
19298 | JMP gen_emit_copy_end | ||
19299 | |||
// Final 5-byte copy for offsets >= 65536: nothing is written when DX == 0;
// otherwise the tag byte is DX*4 - 1, followed by the 32-bit offset.
19300 | four_bytes_remain_standalone: | ||
19301 | TESTL DX, DX | ||
19302 | JZ gen_emit_copy_end | ||
19303 | XORL SI, SI | ||
19304 | LEAL -1(SI)(DX*4), DX | ||
19305 | MOVB DL, (AX) | ||
19306 | MOVL CX, 1(AX) | ||
19307 | ADDQ $0x05, BX | ||
19308 | ADDQ $0x05, AX | ||
19309 | JMP gen_emit_copy_end | ||
19310 | |||
// offset < 65536. Length <= 64 goes to the short path. For longer matches
// with offset < 2048, write a 2-byte copy whose first byte is 17 (SI = 1 + 16)
// OR'ed with (offset >> 8) << 5 and whose second byte is the low offset byte;
// it accounts for 8 bytes of length (SUBL $0x08). The rest is sent into the
// repeat sequence, entering past its short-form tests with DX = remaining - 4.
19311 | two_byte_offset_standalone: | ||
19312 | CMPL DX, $0x40 | ||
19313 | JBE two_byte_offset_short_standalone | ||
19314 | CMPL CX, $0x00000800 | ||
19315 | JAE long_offset_short_standalone | ||
19316 | MOVL $0x00000001, SI | ||
19317 | LEAL 16(SI), SI | ||
19318 | MOVB CL, 1(AX) | ||
19319 | MOVL CX, DI | ||
19320 | SHRL $0x08, DI | ||
19321 | SHLL $0x05, DI | ||
19322 | ORL DI, SI | ||
19323 | MOVB SI, (AX) | ||
19324 | ADDQ $0x02, BX | ||
19325 | ADDQ $0x02, AX | ||
19326 | SUBL $0x08, DX | ||
19327 | |||
19328 | // emitRepeat | ||
19329 | LEAL -4(DX), DX | ||
19330 | JMP cant_repeat_two_offset_standalone_emit_copy_short_2b | ||
19331 | |||
// Loop head of this repeat copy; within the visible code it is reached only
// from the JMP after the oversized 5-byte chunk below.
19332 | emit_repeat_again_standalone_emit_copy_short_2b: | ||
19333 | MOVL DX, SI | ||
19334 | LEAL -4(DX), DX | ||
19335 | CMPL SI, $0x08 | ||
19336 | JBE repeat_two_standalone_emit_copy_short_2b | ||
19337 | CMPL SI, $0x0c | ||
19338 | JAE cant_repeat_two_offset_standalone_emit_copy_short_2b | ||
19339 | CMPL CX, $0x00000800 | ||
19340 | JB repeat_two_offset_standalone_emit_copy_short_2b | ||
19341 | |||
// Same width selection as the other repeat copies in this function.
19342 | cant_repeat_two_offset_standalone_emit_copy_short_2b: | ||
19343 | CMPL DX, $0x00000104 | ||
19344 | JB repeat_three_standalone_emit_copy_short_2b | ||
19345 | CMPL DX, $0x00010100 | ||
19346 | JB repeat_four_standalone_emit_copy_short_2b | ||
19347 | CMPL DX, $0x0100ffff | ||
19348 | JB repeat_five_standalone_emit_copy_short_2b | ||
19349 | LEAL -16842747(DX), DX | ||
19350 | MOVL $0xfffb001d, (AX) | ||
19351 | MOVB $0xff, 4(AX) | ||
19352 | ADDQ $0x05, AX | ||
19353 | ADDQ $0x05, BX | ||
19354 | JMP emit_repeat_again_standalone_emit_copy_short_2b | ||
19355 | |||
19356 | repeat_five_standalone_emit_copy_short_2b: | ||
19357 | LEAL -65536(DX), DX | ||
19358 | MOVL DX, CX | ||
19359 | MOVW $0x001d, (AX) | ||
19360 | MOVW DX, 2(AX) | ||
19361 | SARL $0x10, CX | ||
19362 | MOVB CL, 4(AX) | ||
19363 | ADDQ $0x05, BX | ||
19364 | ADDQ $0x05, AX | ||
19365 | JMP gen_emit_copy_end | ||
19366 | |||
19367 | repeat_four_standalone_emit_copy_short_2b: | ||
19368 | LEAL -256(DX), DX | ||
19369 | MOVW $0x0019, (AX) | ||
19370 | MOVW DX, 2(AX) | ||
19371 | ADDQ $0x04, BX | ||
19372 | ADDQ $0x04, AX | ||
19373 | JMP gen_emit_copy_end | ||
19374 | |||
19375 | repeat_three_standalone_emit_copy_short_2b: | ||
19376 | LEAL -4(DX), DX | ||
19377 | MOVW $0x0015, (AX) | ||
19378 | MOVB DL, 2(AX) | ||
19379 | ADDQ $0x03, BX | ||
19380 | ADDQ $0x03, AX | ||
19381 | JMP gen_emit_copy_end | ||
19382 | |||
19383 | repeat_two_standalone_emit_copy_short_2b: | ||
19384 | SHLL $0x02, DX | ||
19385 | ORL $0x01, DX | ||
19386 | MOVW DX, (AX) | ||
19387 | ADDQ $0x02, BX | ||
19388 | ADDQ $0x02, AX | ||
19389 | JMP gen_emit_copy_end | ||
19390 | |||
19391 | repeat_two_offset_standalone_emit_copy_short_2b: | ||
19392 | XORQ SI, SI | ||
19393 | LEAL 1(SI)(DX*4), DX | ||
19394 | MOVB CL, 1(AX) | ||
19395 | SARL $0x08, CX | ||
19396 | SHLL $0x05, CX | ||
19397 | ORL CX, DX | ||
19398 | MOVB DL, (AX) | ||
19399 | ADDQ $0x02, BX | ||
19400 | ADDQ $0x02, AX | ||
19401 | JMP gen_emit_copy_end | ||
19402 | |||
// 2048 <= offset < 65536 and length > 64: write tag 0xee plus the 16-bit
// offset (3 bytes), subtract 60 from the length, then fall into the repeat
// sequence for the remainder.
19403 | long_offset_short_standalone: | ||
19404 | MOVB $0xee, (AX) | ||
19405 | MOVW CX, 1(AX) | ||
19406 | LEAL -60(DX), DX | ||
19407 | ADDQ $0x03, AX | ||
19408 | ADDQ $0x03, BX | ||
19409 | |||
19410 | // emitRepeat | ||
19411 | emit_repeat_again_standalone_emit_copy_short: | ||
19412 | MOVL DX, SI | ||
19413 | LEAL -4(DX), DX | ||
19414 | CMPL SI, $0x08 | ||
19415 | JBE repeat_two_standalone_emit_copy_short | ||
19416 | CMPL SI, $0x0c | ||
19417 | JAE cant_repeat_two_offset_standalone_emit_copy_short | ||
19418 | CMPL CX, $0x00000800 | ||
19419 | JB repeat_two_offset_standalone_emit_copy_short | ||
19420 | |||
19421 | cant_repeat_two_offset_standalone_emit_copy_short: | ||
19422 | CMPL DX, $0x00000104 | ||
19423 | JB repeat_three_standalone_emit_copy_short | ||
19424 | CMPL DX, $0x00010100 | ||
19425 | JB repeat_four_standalone_emit_copy_short | ||
19426 | CMPL DX, $0x0100ffff | ||
19427 | JB repeat_five_standalone_emit_copy_short | ||
19428 | LEAL -16842747(DX), DX | ||
19429 | MOVL $0xfffb001d, (AX) | ||
19430 | MOVB $0xff, 4(AX) | ||
19431 | ADDQ $0x05, AX | ||
19432 | ADDQ $0x05, BX | ||
19433 | JMP emit_repeat_again_standalone_emit_copy_short | ||
19434 | |||
19435 | repeat_five_standalone_emit_copy_short: | ||
19436 | LEAL -65536(DX), DX | ||
19437 | MOVL DX, CX | ||
19438 | MOVW $0x001d, (AX) | ||
19439 | MOVW DX, 2(AX) | ||
19440 | SARL $0x10, CX | ||
19441 | MOVB CL, 4(AX) | ||
19442 | ADDQ $0x05, BX | ||
19443 | ADDQ $0x05, AX | ||
19444 | JMP gen_emit_copy_end | ||
19445 | |||
19446 | repeat_four_standalone_emit_copy_short: | ||
19447 | LEAL -256(DX), DX | ||
19448 | MOVW $0x0019, (AX) | ||
19449 | MOVW DX, 2(AX) | ||
19450 | ADDQ $0x04, BX | ||
19451 | ADDQ $0x04, AX | ||
19452 | JMP gen_emit_copy_end | ||
19453 | |||
19454 | repeat_three_standalone_emit_copy_short: | ||
19455 | LEAL -4(DX), DX | ||
19456 | MOVW $0x0015, (AX) | ||
19457 | MOVB DL, 2(AX) | ||
19458 | ADDQ $0x03, BX | ||
19459 | ADDQ $0x03, AX | ||
19460 | JMP gen_emit_copy_end | ||
19461 | |||
19462 | repeat_two_standalone_emit_copy_short: | ||
19463 | SHLL $0x02, DX | ||
19464 | ORL $0x01, DX | ||
19465 | MOVW DX, (AX) | ||
19466 | ADDQ $0x02, BX | ||
19467 | ADDQ $0x02, AX | ||
19468 | JMP gen_emit_copy_end | ||
19469 | |||
19470 | repeat_two_offset_standalone_emit_copy_short: | ||
19471 | XORQ SI, SI | ||
19472 | LEAL 1(SI)(DX*4), DX | ||
19473 | MOVB CL, 1(AX) | ||
19474 | SARL $0x08, CX | ||
19475 | SHLL $0x05, CX | ||
19476 | ORL CX, DX | ||
19477 | MOVB DL, (AX) | ||
19478 | ADDQ $0x02, BX | ||
19479 | ADDQ $0x02, AX | ||
19480 | JMP gen_emit_copy_end | ||
19481 | |||
// offset < 65536 and length <= 64. SI = length*4. When length < 12 and
// offset < 2048 the 2-byte form is used: first byte (length*4 - 15) OR'ed
// with (offset >> 8) << 5, second byte the low offset byte. Otherwise the
// 3-byte form below is used.
19482 | two_byte_offset_short_standalone: | ||
19483 | MOVL DX, SI | ||
19484 | SHLL $0x02, SI | ||
19485 | CMPL DX, $0x0c | ||
19486 | JAE emit_copy_three_standalone | ||
19487 | CMPL CX, $0x00000800 | ||
19488 | JAE emit_copy_three_standalone | ||
19489 | LEAL -15(SI), SI | ||
19490 | MOVB CL, 1(AX) | ||
19491 | SHRL $0x08, CX | ||
19492 | SHLL $0x05, CX | ||
19493 | ORL CX, SI | ||
19494 | MOVB SI, (AX) | ||
19495 | ADDQ $0x02, BX | ||
19496 | ADDQ $0x02, AX | ||
19497 | JMP gen_emit_copy_end | ||
19498 | |||
// 3-byte form: tag byte length*4 - 2, then the 16-bit offset.
19499 | emit_copy_three_standalone: | ||
19500 | LEAL -2(SI), SI | ||
19501 | MOVB SI, (AX) | ||
19502 | MOVW CX, 1(AX) | ||
19503 | ADDQ $0x03, BX | ||
19504 | ADDQ $0x03, AX | ||
19505 | |||
// Common exit: return the byte count accumulated in BX.
19506 | gen_emit_copy_end: | ||
19507 | MOVQ BX, ret+40(FP) | ||
19508 | RET | ||
19509 | |||
19510 | // func emitCopyNoRepeat(dst []byte, offset int, length int) int | ||
// emitCopyNoRepeat writes the encoding of a copy of `length` bytes at
// `offset` to dst using copy tags only: long matches are split into several
// copy tags by looping instead of switching to a repeat encoding. The number
// of bytes written is stored in ret.
// Registers: AX = write cursor in dst, BX = bytes written so far,
// CX = offset, DX = remaining length, SI = scratch.
// Only dst_base is loaded; dst_len/dst_cap are never read, so no bounds check
// is made on dst here (presumably the caller sizes dst - verify).
19511 | TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 | ||
19512 | XORQ BX, BX | ||
19513 | MOVQ dst_base+0(FP), AX | ||
19514 | MOVQ offset+24(FP), CX | ||
19515 | MOVQ length+32(FP), DX | ||
19516 | |||
19517 | // emitCopy | ||
19518 | CMPL CX, $0x00010000 | ||
19519 | JB two_byte_offset_standalone_snappy | ||
19520 | |||
// offset >= 65536: while more than 64 bytes remain, write tag 0xff plus the
// 32-bit offset (5 bytes) and subtract 64. The loop also exits to the
// remainder case as soon as fewer than 4 bytes are left.
19521 | four_bytes_loop_back_standalone_snappy: | ||
19522 | CMPL DX, $0x40 | ||
19523 | JBE four_bytes_remain_standalone_snappy | ||
19524 | MOVB $0xff, (AX) | ||
19525 | MOVL CX, 1(AX) | ||
19526 | LEAL -64(DX), DX | ||
19527 | ADDQ $0x05, BX | ||
19528 | ADDQ $0x05, AX | ||
19529 | CMPL DX, $0x04 | ||
19530 | JB four_bytes_remain_standalone_snappy | ||
19531 | JMP four_bytes_loop_back_standalone_snappy | ||
19532 | |||
// Final 5-byte copy: nothing is written when DX == 0; otherwise the tag byte
// is DX*4 - 1, followed by the 32-bit offset.
19533 | four_bytes_remain_standalone_snappy: | ||
19534 | TESTL DX, DX | ||
19535 | JZ gen_emit_copy_end_snappy | ||
19536 | XORL SI, SI | ||
19537 | LEAL -1(SI)(DX*4), DX | ||
19538 | MOVB DL, (AX) | ||
19539 | MOVL CX, 1(AX) | ||
19540 | ADDQ $0x05, BX | ||
19541 | ADDQ $0x05, AX | ||
19542 | JMP gen_emit_copy_end_snappy | ||
19543 | |||
// offset < 65536: while more than 64 bytes remain, write tag 0xee plus the
// 16-bit offset (3 bytes) and subtract 60 from the length.
19544 | two_byte_offset_standalone_snappy: | ||
19545 | CMPL DX, $0x40 | ||
19546 | JBE two_byte_offset_short_standalone_snappy | ||
19547 | MOVB $0xee, (AX) | ||
19548 | MOVW CX, 1(AX) | ||
19549 | LEAL -60(DX), DX | ||
19550 | ADDQ $0x03, AX | ||
19551 | ADDQ $0x03, BX | ||
19552 | JMP two_byte_offset_standalone_snappy | ||
19553 | |||
// length <= 64. SI = length*4. When length < 12 and offset < 2048 the 2-byte
// form is used: first byte (length*4 - 15) OR'ed with (offset >> 8) << 5,
// second byte the low offset byte. Otherwise the 3-byte form below is used.
19554 | two_byte_offset_short_standalone_snappy: | ||
19555 | MOVL DX, SI | ||
19556 | SHLL $0x02, SI | ||
19557 | CMPL DX, $0x0c | ||
19558 | JAE emit_copy_three_standalone_snappy | ||
19559 | CMPL CX, $0x00000800 | ||
19560 | JAE emit_copy_three_standalone_snappy | ||
19561 | LEAL -15(SI), SI | ||
19562 | MOVB CL, 1(AX) | ||
19563 | SHRL $0x08, CX | ||
19564 | SHLL $0x05, CX | ||
19565 | ORL CX, SI | ||
19566 | MOVB SI, (AX) | ||
19567 | ADDQ $0x02, BX | ||
19568 | ADDQ $0x02, AX | ||
19569 | JMP gen_emit_copy_end_snappy | ||
19570 | |||
// 3-byte form: tag byte length*4 - 2, then the 16-bit offset.
19571 | emit_copy_three_standalone_snappy: | ||
19572 | LEAL -2(SI), SI | ||
19573 | MOVB SI, (AX) | ||
19574 | MOVW CX, 1(AX) | ||
19575 | ADDQ $0x03, BX | ||
19576 | ADDQ $0x03, AX | ||
19577 | |||
// Common exit: return the byte count accumulated in BX.
19578 | gen_emit_copy_end_snappy: | ||
19579 | MOVQ BX, ret+40(FP) | ||
19580 | RET | ||
19581 | |||
19582 | // func matchLen(a []byte, b []byte) int | ||
19583 | // Requires: BMI | ||
// matchLen returns the number of leading bytes that are equal in a and b,
// examining at most len(a) bytes.
// Registers: AX = a base, CX = b base, DX = bytes of a left to compare,
// SI = bytes matched so far (the result), BX/DI = scratch.
// Only a_len is loaded; b_len is never read, so b is presumably guaranteed by
// the caller to be at least as long as a - verify against callers.
19584 | TEXT ·matchLen(SB), NOSPLIT, $0-56 | ||
19585 | MOVQ a_base+0(FP), AX | ||
19586 | MOVQ b_base+24(FP), CX | ||
19587 | MOVQ a_len+8(FP), DX | ||
19588 | |||
19589 | // matchLen | ||
19590 | XORL SI, SI | ||
19591 | |||
// Main loop: compare 16 bytes per iteration as two 8-byte XORs. A non-zero
// XOR result means a mismatch inside that 8-byte word.
19592 | matchlen_loopback_16_standalone: | ||
19593 | CMPL DX, $0x10 | ||
19594 | JB matchlen_match8_standalone | ||
19595 | MOVQ (AX)(SI*1), BX | ||
19596 | MOVQ 8(AX)(SI*1), DI | ||
19597 | XORQ (CX)(SI*1), BX | ||
19598 | JNZ matchlen_bsf_8_standalone | ||
19599 | XORQ 8(CX)(SI*1), DI | ||
19600 | JNZ matchlen_bsf_16standalone | ||
19601 | LEAL -16(DX), DX | ||
19602 | LEAL 16(SI), SI | ||
19603 | JMP matchlen_loopback_16_standalone | ||
19604 | |||
// Mismatch in the second 8-byte word: the index of the lowest set bit of the
// XOR, shifted right by 3, is the number of equal bytes in that word; add it
// plus the 8 bytes of the first word. TZCNTQ is used when GOAMD64_v3 is
// defined, BSFQ otherwise.
19605 | matchlen_bsf_16standalone: | ||
19606 | #ifdef GOAMD64_v3 | ||
19607 | TZCNTQ DI, DI | ||
19608 | |||
19609 | #else | ||
19610 | BSFQ DI, DI | ||
19611 | |||
19612 | #endif | ||
19613 | SARQ $0x03, DI | ||
19614 | LEAL 8(SI)(DI*1), SI | ||
19615 | JMP gen_match_len_end | ||
19616 | |||
// Fewer than 16 bytes left: try one 8-byte compare, then continue with the
// 4/2/1-byte tail checks.
19617 | matchlen_match8_standalone: | ||
19618 | CMPL DX, $0x08 | ||
19619 | JB matchlen_match4_standalone | ||
19620 | MOVQ (AX)(SI*1), BX | ||
19621 | XORQ (CX)(SI*1), BX | ||
19622 | JNZ matchlen_bsf_8_standalone | ||
19623 | LEAL -8(DX), DX | ||
19624 | LEAL 8(SI), SI | ||
19625 | JMP matchlen_match4_standalone | ||
19626 | |||
// Mismatch in an 8-byte word held in BX: add the count of equal low bytes
// (lowest set bit index >> 3) to SI.
19627 | matchlen_bsf_8_standalone: | ||
19628 | #ifdef GOAMD64_v3 | ||
19629 | TZCNTQ BX, BX | ||
19630 | |||
19631 | #else | ||
19632 | BSFQ BX, BX | ||
19633 | |||
19634 | #endif | ||
19635 | SARQ $0x03, BX | ||
19636 | LEAL (SI)(BX*1), SI | ||
19637 | JMP gen_match_len_end | ||
19638 | |||
// 4-byte step: on mismatch (or fewer than 4 bytes left) fall through to the
// 2-byte step without advancing.
19639 | matchlen_match4_standalone: | ||
19640 | CMPL DX, $0x04 | ||
19641 | JB matchlen_match2_standalone | ||
19642 | MOVL (AX)(SI*1), BX | ||
19643 | CMPL (CX)(SI*1), BX | ||
19644 | JNE matchlen_match2_standalone | ||
19645 | LEAL -4(DX), DX | ||
19646 | LEAL 4(SI), SI | ||
19647 | |||
// 2-byte step: exactly 1 byte left goes to the 1-byte check, 0 left is done.
// A 2-byte mismatch still tries the first byte on its own.
19648 | matchlen_match2_standalone: | ||
19649 | CMPL DX, $0x01 | ||
19650 | JE matchlen_match1_standalone | ||
19651 | JB gen_match_len_end | ||
19652 | MOVW (AX)(SI*1), BX | ||
19653 | CMPW (CX)(SI*1), BX | ||
19654 | JNE matchlen_match1_standalone | ||
19655 | LEAL 2(SI), SI | ||
19656 | SUBL $0x02, DX | ||
19657 | JZ gen_match_len_end | ||
19658 | |||
// 1-byte step: count one more byte if it is equal.
19659 | matchlen_match1_standalone: | ||
19660 | MOVB (AX)(SI*1), BL | ||
19661 | CMPB (CX)(SI*1), BL | ||
19662 | JNE gen_match_len_end | ||
19663 | LEAL 1(SI), SI | ||
19664 | |||
// Common exit: return the match length in SI.
19665 | gen_match_len_end: | ||
19666 | MOVQ SI, ret+48(FP) | ||
19667 | RET | ||
19668 | |||
19669 | // func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) | ||
19670 | // Requires: SSE2 | ||
// cvtLZ4BlockAsm walks the token stream in src and re-emits each sequence
// into dst: the literal run as a literal header plus the raw bytes, and the
// match as either a repeat (offset equal to the previous one) or a copy.
// On success it returns the total decoded size in `uncompressed` and the
// number of dst bytes written in `dstUsed`. On malformed input `uncompressed`
// is set to -1, and when dst runs short it is set to -2; in both error cases
// dstUsed is not written by this routine.
// Registers: AX = write cursor in dst, CX = dst limit (dst_base+dst_len-10),
// DX = read cursor in src, BX = end of src, SI = decoded byte count,
// DI = offset of the previous match (0 initially), R8 = scratch / end of the
// literal run, R9 = literal length, later the match offset,
// R10 = match length, R11-R15 and X0-X5 = scratch.
19671 | TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 | ||
19672 | XORQ SI, SI | ||
19673 | MOVQ dst_base+0(FP), AX | ||
19674 | MOVQ dst_len+8(FP), CX | ||
19675 | MOVQ src_base+24(FP), DX | ||
19676 | MOVQ src_len+32(FP), BX | ||
19677 | LEAQ (DX)(BX*1), BX | ||
19678 | LEAQ -10(AX)(CX*1), CX | ||
19679 | XORQ DI, DI | ||
19680 | |||
// One iteration per sequence. Read the token byte: high nibble -> R9
// (literal length), low nibble -> R10 (match length before the +4 bias).
// A token >= 0xf0 means the literal length continues in extra bytes.
19681 | lz4_s2_loop: | ||
19682 | CMPQ DX, BX | ||
19683 | JAE lz4_s2_corrupt | ||
19684 | CMPQ AX, CX | ||
19685 | JAE lz4_s2_dstfull | ||
19686 | MOVBQZX (DX), R8 | ||
19687 | MOVQ R8, R9 | ||
19688 | MOVQ R8, R10 | ||
19689 | SHRQ $0x04, R9 | ||
19690 | ANDQ $0x0f, R10 | ||
19691 | CMPQ R8, $0xf0 | ||
19692 | JB lz4_s2_ll_end | ||
19693 | |||
// Extended literal length: keep adding bytes while each one equals 0xff.
19694 | lz4_s2_ll_loop: | ||
19695 | INCQ DX | ||
19696 | CMPQ DX, BX | ||
19697 | JAE lz4_s2_corrupt | ||
19698 | MOVBQZX (DX), R8 | ||
19699 | ADDQ R8, R9 | ||
19700 | CMPQ R8, $0xff | ||
19701 | JEQ lz4_s2_ll_loop | ||
19702 | |||
// R8 = DX + R9 is checked against the end of src, then both DX and R8 are
// advanced by one so DX points at the first literal and R8 just past the
// last one. R10 gets the +4 match length bias. With literals present, make
// sure they fit below the dst limit, add them to SI, and emit a literal
// header for R11 = R9-1 sized by magnitude: one byte (R11 << 2) when
// R11 < 60, else 0xf0 + 1 byte, 0xf4 + 2 bytes, 0xf8 + 3 bytes or
// 0xfc + 4 bytes.
19703 | lz4_s2_ll_end: | ||
19704 | LEAQ (DX)(R9*1), R8 | ||
19705 | ADDQ $0x04, R10 | ||
19706 | CMPQ R8, BX | ||
19707 | JAE lz4_s2_corrupt | ||
19708 | INCQ DX | ||
19709 | INCQ R8 | ||
19710 | TESTQ R9, R9 | ||
19711 | JZ lz4_s2_lits_done | ||
19712 | LEAQ (AX)(R9*1), R11 | ||
19713 | CMPQ R11, CX | ||
19714 | JAE lz4_s2_dstfull | ||
19715 | ADDQ R9, SI | ||
19716 | LEAL -1(R9), R11 | ||
19717 | CMPL R11, $0x3c | ||
19718 | JB one_byte_lz4_s2 | ||
19719 | CMPL R11, $0x00000100 | ||
19720 | JB two_bytes_lz4_s2 | ||
19721 | CMPL R11, $0x00010000 | ||
19722 | JB three_bytes_lz4_s2 | ||
19723 | CMPL R11, $0x01000000 | ||
19724 | JB four_bytes_lz4_s2 | ||
19725 | MOVB $0xfc, (AX) | ||
19726 | MOVL R11, 1(AX) | ||
19727 | ADDQ $0x05, AX | ||
19728 | JMP memmove_long_lz4_s2 | ||
19729 | |||
19730 | four_bytes_lz4_s2: | ||
19731 | MOVL R11, R12 | ||
19732 | SHRL $0x10, R12 | ||
19733 | MOVB $0xf8, (AX) | ||
19734 | MOVW R11, 1(AX) | ||
19735 | MOVB R12, 3(AX) | ||
19736 | ADDQ $0x04, AX | ||
19737 | JMP memmove_long_lz4_s2 | ||
19738 | |||
19739 | three_bytes_lz4_s2: | ||
19740 | MOVB $0xf4, (AX) | ||
19741 | MOVW R11, 1(AX) | ||
19742 | ADDQ $0x03, AX | ||
19743 | JMP memmove_long_lz4_s2 | ||
19744 | |||
// Two-byte header; runs with R11 < 64 still use the short copy routine.
19745 | two_bytes_lz4_s2: | ||
19746 | MOVB $0xf0, (AX) | ||
19747 | MOVB R11, 1(AX) | ||
19748 | ADDQ $0x02, AX | ||
19749 | CMPL R11, $0x40 | ||
19750 | JB memmove_lz4_s2 | ||
19751 | JMP memmove_long_lz4_s2 | ||
19752 | |||
19753 | one_byte_lz4_s2: | ||
19754 | SHLB $0x02, R11 | ||
19755 | MOVB R11, (AX) | ||
19756 | ADDQ $0x01, AX | ||
19757 | |||
// Short literal copy (R9 <= 64 on every path that reaches here).
// R11 = AX + R9 is the dst cursor after the literals.
19758 | memmove_lz4_s2: | ||
19759 | LEAQ (AX)(R9*1), R11 | ||
19760 | |||
19761 | // genMemMoveShort | ||
19762 | CMPQ R9, $0x08 | ||
19763 | JBE emit_lit_memmove_lz4_s2_memmove_move_8 | ||
19764 | CMPQ R9, $0x10 | ||
19765 | JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 | ||
19766 | CMPQ R9, $0x20 | ||
19767 | JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 | ||
19768 | JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 | ||
19769 | |||
// R9 <= 8: always moves a full 8 bytes; only R9 of them count because AX is
// set to R11 afterwards.
19770 | emit_lit_memmove_lz4_s2_memmove_move_8: | ||
19771 | MOVQ (DX), R12 | ||
19772 | MOVQ R12, (AX) | ||
19773 | JMP memmove_end_copy_lz4_s2 | ||
19774 | |||
// 9..16 bytes: two overlapping 8-byte moves (head and tail). DX is clobbered
// here and reloaded from R8 at lz4_s2_lits_emit_done.
19775 | emit_lit_memmove_lz4_s2_memmove_move_8through16: | ||
19776 | MOVQ (DX), R12 | ||
19777 | MOVQ -8(DX)(R9*1), DX | ||
19778 | MOVQ R12, (AX) | ||
19779 | MOVQ DX, -8(AX)(R9*1) | ||
19780 | JMP memmove_end_copy_lz4_s2 | ||
19781 | |||
// 17..32 bytes: two overlapping 16-byte moves (head and tail).
19782 | emit_lit_memmove_lz4_s2_memmove_move_17through32: | ||
19783 | MOVOU (DX), X0 | ||
19784 | MOVOU -16(DX)(R9*1), X1 | ||
19785 | MOVOU X0, (AX) | ||
19786 | MOVOU X1, -16(AX)(R9*1) | ||
19787 | JMP memmove_end_copy_lz4_s2 | ||
19788 | |||
// 33..64 bytes: first 32 and last 32 bytes as four 16-byte moves.
19789 | emit_lit_memmove_lz4_s2_memmove_move_33through64: | ||
19790 | MOVOU (DX), X0 | ||
19791 | MOVOU 16(DX), X1 | ||
19792 | MOVOU -32(DX)(R9*1), X2 | ||
19793 | MOVOU -16(DX)(R9*1), X3 | ||
19794 | MOVOU X0, (AX) | ||
19795 | MOVOU X1, 16(AX) | ||
19796 | MOVOU X2, -32(AX)(R9*1) | ||
19797 | MOVOU X3, -16(AX)(R9*1) | ||
19798 | |||
19799 | memmove_end_copy_lz4_s2: | ||
19800 | MOVQ R11, AX | ||
19801 | JMP lz4_s2_lits_emit_done | ||
19802 | |||
// Long literal copy. The first and last 32 bytes are loaded into X0-X3 up
// front and stored last. The middle is copied 32 bytes per iteration with
// MOVOA stores starting at running offset R14 = 64 - (AX & 31), which makes
// the store address -32(AX)(R14*1) a multiple of 32.
19803 | memmove_long_lz4_s2: | ||
19804 | LEAQ (AX)(R9*1), R11 | ||
19805 | |||
19806 | // genMemMoveLong | ||
19807 | MOVOU (DX), X0 | ||
19808 | MOVOU 16(DX), X1 | ||
19809 | MOVOU -32(DX)(R9*1), X2 | ||
19810 | MOVOU -16(DX)(R9*1), X3 | ||
19811 | MOVQ R9, R13 | ||
19812 | SHRQ $0x05, R13 | ||
19813 | MOVQ AX, R12 | ||
19814 | ANDL $0x0000001f, R12 | ||
19815 | MOVQ $0x00000040, R14 | ||
19816 | SUBQ R12, R14 | ||
19817 | DECQ R13 | ||
19818 | JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 | ||
19819 | LEAQ -32(DX)(R14*1), R12 | ||
19820 | LEAQ -32(AX)(R14*1), R15 | ||
19821 | |||
19822 | emit_lit_memmove_long_lz4_s2large_big_loop_back: | ||
19823 | MOVOU (R12), X4 | ||
19824 | MOVOU 16(R12), X5 | ||
19825 | MOVOA X4, (R15) | ||
19826 | MOVOA X5, 16(R15) | ||
19827 | ADDQ $0x20, R15 | ||
19828 | ADDQ $0x20, R12 | ||
19829 | ADDQ $0x20, R14 | ||
19830 | DECQ R13 | ||
19831 | JNA emit_lit_memmove_long_lz4_s2large_big_loop_back | ||
19832 | |||
// Forward loop: runs while R9 >= R14, then the saved head and tail blocks
// are written and AX moves to the end of the literals.
19833 | emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: | ||
19834 | MOVOU -32(DX)(R14*1), X4 | ||
19835 | MOVOU -16(DX)(R14*1), X5 | ||
19836 | MOVOA X4, -32(AX)(R14*1) | ||
19837 | MOVOA X5, -16(AX)(R14*1) | ||
19838 | ADDQ $0x20, R14 | ||
19839 | CMPQ R9, R14 | ||
19840 | JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 | ||
19841 | MOVOU X0, (AX) | ||
19842 | MOVOU X1, 16(AX) | ||
19843 | MOVOU X2, -32(AX)(R9*1) | ||
19844 | MOVOU X3, -16(AX)(R9*1) | ||
19845 | MOVQ R11, AX | ||
19846 | |||
// Advance the src cursor past the literal bytes.
19847 | lz4_s2_lits_emit_done: | ||
19848 | MOVQ R8, DX | ||
19849 | |||
// Input ending exactly here is valid only when the token's match nibble was
// zero (R10 == 4 after the bias); otherwise it is reported as corrupt.
19850 | lz4_s2_lits_done: | ||
19851 | CMPQ DX, BX | ||
19852 | JNE lz4_s2_match | ||
19853 | CMPQ R10, $0x04 | ||
19854 | JEQ lz4_s2_done | ||
19855 | JMP lz4_s2_corrupt | ||
19856 | |||
// Read the 16-bit match offset into R9. It must be non-zero and must not
// exceed the number of bytes decoded so far (SI). R10 == 0x13 (nibble 15
// plus the bias of 4) means the match length continues in extra bytes.
19857 | lz4_s2_match: | ||
19858 | LEAQ 2(DX), R8 | ||
19859 | CMPQ R8, BX | ||
19860 | JAE lz4_s2_corrupt | ||
19861 | MOVWQZX (DX), R9 | ||
19862 | MOVQ R8, DX | ||
19863 | TESTQ R9, R9 | ||
19864 | JZ lz4_s2_corrupt | ||
19865 | CMPQ R9, SI | ||
19866 | JA lz4_s2_corrupt | ||
19867 | CMPQ R10, $0x13 | ||
19868 | JNE lz4_s2_ml_done | ||
19869 | |||
// Extended match length: keep adding bytes while each one equals 0xff.
19870 | lz4_s2_ml_loop: | ||
19871 | MOVBQZX (DX), R8 | ||
19872 | INCQ DX | ||
19873 | ADDQ R8, R10 | ||
19874 | CMPQ DX, BX | ||
19875 | JAE lz4_s2_corrupt | ||
19876 | CMPQ R8, $0xff | ||
19877 | JEQ lz4_s2_ml_loop | ||
19878 | |||
// Count the match in SI. An offset equal to the previous one (DI) is emitted
// with the repeat encoder directly below; any other offset goes to
// lz4_s2_docopy.
19879 | lz4_s2_ml_done: | ||
19880 | ADDQ R10, SI | ||
19881 | CMPQ R9, DI | ||
19882 | JNE lz4_s2_docopy | ||
19883 | |||
19884 | // emitRepeat | ||
// Inlined repeat encoder: R8 = length, R10 = length - 4, R9 = offset. Only
// AX is advanced (no byte counter); every terminal case jumps back to
// lz4_s2_loop.
19885 | emit_repeat_again_lz4_s2: | ||
19886 | MOVL R10, R8 | ||
19887 | LEAL -4(R10), R10 | ||
19888 | CMPL R8, $0x08 | ||
19889 | JBE repeat_two_lz4_s2 | ||
19890 | CMPL R8, $0x0c | ||
19891 | JAE cant_repeat_two_offset_lz4_s2 | ||
19892 | CMPL R9, $0x00000800 | ||
19893 | JB repeat_two_offset_lz4_s2 | ||
19894 | |||
// Width selection on R10: < 0x104 -> 3 bytes, < 0x10100 -> 4 bytes,
// < 0x0100ffff -> 5 bytes; otherwise emit the fixed 5 bytes 1d 00 fb ff ff,
// subtract 16842747 and loop.
19895 | cant_repeat_two_offset_lz4_s2: | ||
19896 | CMPL R10, $0x00000104 | ||
19897 | JB repeat_three_lz4_s2 | ||
19898 | CMPL R10, $0x00010100 | ||
19899 | JB repeat_four_lz4_s2 | ||
19900 | CMPL R10, $0x0100ffff | ||
19901 | JB repeat_five_lz4_s2 | ||
19902 | LEAL -16842747(R10), R10 | ||
19903 | MOVL $0xfffb001d, (AX) | ||
19904 | MOVB $0xff, 4(AX) | ||
19905 | ADDQ $0x05, AX | ||
19906 | JMP emit_repeat_again_lz4_s2 | ||
19907 | |||
// 5 bytes: word 0x001d + low 24 bits of (R10 - 65536); R9 is used as scratch.
19908 | repeat_five_lz4_s2: | ||
19909 | LEAL -65536(R10), R10 | ||
19910 | MOVL R10, R9 | ||
19911 | MOVW $0x001d, (AX) | ||
19912 | MOVW R10, 2(AX) | ||
19913 | SARL $0x10, R9 | ||
19914 | MOVB R9, 4(AX) | ||
19915 | ADDQ $0x05, AX | ||
19916 | JMP lz4_s2_loop | ||
19917 | |||
// 4 bytes: word 0x0019 + 16-bit (R10 - 256).
19918 | repeat_four_lz4_s2: | ||
19919 | LEAL -256(R10), R10 | ||
19920 | MOVW $0x0019, (AX) | ||
19921 | MOVW R10, 2(AX) | ||
19922 | ADDQ $0x04, AX | ||
19923 | JMP lz4_s2_loop | ||
19924 | |||
// 3 bytes: word 0x0015 + 8-bit (R10 - 4).
19925 | repeat_three_lz4_s2: | ||
19926 | LEAL -4(R10), R10 | ||
19927 | MOVW $0x0015, (AX) | ||
19928 | MOVB R10, 2(AX) | ||
19929 | ADDQ $0x03, AX | ||
19930 | JMP lz4_s2_loop | ||
19931 | |||
// 2 bytes: the 16-bit word (R10 << 2) | 1.
19932 | repeat_two_lz4_s2: | ||
19933 | SHLL $0x02, R10 | ||
19934 | ORL $0x01, R10 | ||
19935 | MOVW R10, (AX) | ||
19936 | ADDQ $0x02, AX | ||
19937 | JMP lz4_s2_loop | ||
19938 | |||
// 2 bytes: (1 + R10*4) | ((offset >> 8) << 5), then the low offset byte.
19939 | repeat_two_offset_lz4_s2: | ||
19940 | XORQ R8, R8 | ||
19941 | LEAL 1(R8)(R10*4), R10 | ||
19942 | MOVB R9, 1(AX) | ||
19943 | SARL $0x08, R9 | ||
19944 | SHLL $0x05, R9 | ||
19945 | ORL R9, R10 | ||
19946 | MOVB R10, (AX) | ||
19947 | ADDQ $0x02, AX | ||
19948 | JMP lz4_s2_loop | ||
19949 | |||
// New offset: remember it in DI and emit a copy. R9 was loaded as a 16-bit
// value, so only the two-byte-offset encodings appear here. For length > 64
// and offset < 2048, write a 2-byte copy (first byte 17 OR'ed with
// (offset >> 8) << 5, then the low offset byte) that accounts for 8 bytes,
// and send the rest through the repeat sequence with R10 = remaining - 4.
19950 | lz4_s2_docopy: | ||
19951 | MOVQ R9, DI | ||
19952 | |||
19953 | // emitCopy | ||
19954 | CMPL R10, $0x40 | ||
19955 | JBE two_byte_offset_short_lz4_s2 | ||
19956 | CMPL R9, $0x00000800 | ||
19957 | JAE long_offset_short_lz4_s2 | ||
19958 | MOVL $0x00000001, R8 | ||
19959 | LEAL 16(R8), R8 | ||
19960 | MOVB R9, 1(AX) | ||
19961 | MOVL R9, R11 | ||
19962 | SHRL $0x08, R11 | ||
19963 | SHLL $0x05, R11 | ||
19964 | ORL R11, R8 | ||
19965 | MOVB R8, (AX) | ||
19966 | ADDQ $0x02, AX | ||
19967 | SUBL $0x08, R10 | ||
19968 | |||
19969 | // emitRepeat | ||
19970 | LEAL -4(R10), R10 | ||
19971 | JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b | ||
19972 | |||
// Loop head of this repeat copy; within the visible code it is reached only
// from the JMP after the oversized 5-byte chunk below.
19973 | emit_repeat_again_lz4_s2_emit_copy_short_2b: | ||
19974 | MOVL R10, R8 | ||
19975 | LEAL -4(R10), R10 | ||
19976 | CMPL R8, $0x08 | ||
19977 | JBE repeat_two_lz4_s2_emit_copy_short_2b | ||
19978 | CMPL R8, $0x0c | ||
19979 | JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b | ||
19980 | CMPL R9, $0x00000800 | ||
19981 | JB repeat_two_offset_lz4_s2_emit_copy_short_2b | ||
19982 | |||
19983 | cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: | ||
19984 | CMPL R10, $0x00000104 | ||
19985 | JB repeat_three_lz4_s2_emit_copy_short_2b | ||
19986 | CMPL R10, $0x00010100 | ||
19987 | JB repeat_four_lz4_s2_emit_copy_short_2b | ||
19988 | CMPL R10, $0x0100ffff | ||
19989 | JB repeat_five_lz4_s2_emit_copy_short_2b | ||
19990 | LEAL -16842747(R10), R10 | ||
19991 | MOVL $0xfffb001d, (AX) | ||
19992 | MOVB $0xff, 4(AX) | ||
19993 | ADDQ $0x05, AX | ||
19994 | JMP emit_repeat_again_lz4_s2_emit_copy_short_2b | ||
19995 | |||
19996 | repeat_five_lz4_s2_emit_copy_short_2b: | ||
19997 | LEAL -65536(R10), R10 | ||
19998 | MOVL R10, R9 | ||
19999 | MOVW $0x001d, (AX) | ||
20000 | MOVW R10, 2(AX) | ||
20001 | SARL $0x10, R9 | ||
20002 | MOVB R9, 4(AX) | ||
20003 | ADDQ $0x05, AX | ||
20004 | JMP lz4_s2_loop | ||
20005 | |||
20006 | repeat_four_lz4_s2_emit_copy_short_2b: | ||
20007 | LEAL -256(R10), R10 | ||
20008 | MOVW $0x0019, (AX) | ||
20009 | MOVW R10, 2(AX) | ||
20010 | ADDQ $0x04, AX | ||
20011 | JMP lz4_s2_loop | ||
20012 | |||
20013 | repeat_three_lz4_s2_emit_copy_short_2b: | ||
20014 | LEAL -4(R10), R10 | ||
20015 | MOVW $0x0015, (AX) | ||
20016 | MOVB R10, 2(AX) | ||
20017 | ADDQ $0x03, AX | ||
20018 | JMP lz4_s2_loop | ||
20019 | |||
20020 | repeat_two_lz4_s2_emit_copy_short_2b: | ||
20021 | SHLL $0x02, R10 | ||
20022 | ORL $0x01, R10 | ||
20023 | MOVW R10, (AX) | ||
20024 | ADDQ $0x02, AX | ||
20025 | JMP lz4_s2_loop | ||
20026 | |||
20027 | repeat_two_offset_lz4_s2_emit_copy_short_2b: | ||
20028 | XORQ R8, R8 | ||
20029 | LEAL 1(R8)(R10*4), R10 | ||
20030 | MOVB R9, 1(AX) | ||
20031 | SARL $0x08, R9 | ||
20032 | SHLL $0x05, R9 | ||
20033 | ORL R9, R10 | ||
20034 | MOVB R10, (AX) | ||
20035 | ADDQ $0x02, AX | ||
20036 | JMP lz4_s2_loop | ||
20037 | |||
// offset >= 2048 and length > 64: write tag 0xee plus the 16-bit offset
// (3 bytes), subtract 60 from the length, then fall into the repeat sequence
// for the remainder.
20038 | long_offset_short_lz4_s2: | ||
20039 | MOVB $0xee, (AX) | ||
20040 | MOVW R9, 1(AX) | ||
20041 | LEAL -60(R10), R10 | ||
20042 | ADDQ $0x03, AX | ||
20043 | |||
20044 | // emitRepeat | ||
20045 | emit_repeat_again_lz4_s2_emit_copy_short: | ||
20046 | MOVL R10, R8 | ||
20047 | LEAL -4(R10), R10 | ||
20048 | CMPL R8, $0x08 | ||
20049 | JBE repeat_two_lz4_s2_emit_copy_short | ||
20050 | CMPL R8, $0x0c | ||
20051 | JAE cant_repeat_two_offset_lz4_s2_emit_copy_short | ||
20052 | CMPL R9, $0x00000800 | ||
20053 | JB repeat_two_offset_lz4_s2_emit_copy_short | ||
20054 | |||
20055 | cant_repeat_two_offset_lz4_s2_emit_copy_short: | ||
20056 | CMPL R10, $0x00000104 | ||
20057 | JB repeat_three_lz4_s2_emit_copy_short | ||
20058 | CMPL R10, $0x00010100 | ||
20059 | JB repeat_four_lz4_s2_emit_copy_short | ||
20060 | CMPL R10, $0x0100ffff | ||
20061 | JB repeat_five_lz4_s2_emit_copy_short | ||
20062 | LEAL -16842747(R10), R10 | ||
20063 | MOVL $0xfffb001d, (AX) | ||
20064 | MOVB $0xff, 4(AX) | ||
20065 | ADDQ $0x05, AX | ||
20066 | JMP emit_repeat_again_lz4_s2_emit_copy_short | ||
20067 | |||
20068 | repeat_five_lz4_s2_emit_copy_short: | ||
20069 | LEAL -65536(R10), R10 | ||
20070 | MOVL R10, R9 | ||
20071 | MOVW $0x001d, (AX) | ||
20072 | MOVW R10, 2(AX) | ||
20073 | SARL $0x10, R9 | ||
20074 | MOVB R9, 4(AX) | ||
20075 | ADDQ $0x05, AX | ||
20076 | JMP lz4_s2_loop | ||
20077 | |||
20078 | repeat_four_lz4_s2_emit_copy_short: | ||
20079 | LEAL -256(R10), R10 | ||
20080 | MOVW $0x0019, (AX) | ||
20081 | MOVW R10, 2(AX) | ||
20082 | ADDQ $0x04, AX | ||
20083 | JMP lz4_s2_loop | ||
20084 | |||
20085 | repeat_three_lz4_s2_emit_copy_short: | ||
20086 | LEAL -4(R10), R10 | ||
20087 | MOVW $0x0015, (AX) | ||
20088 | MOVB R10, 2(AX) | ||
20089 | ADDQ $0x03, AX | ||
20090 | JMP lz4_s2_loop | ||
20091 | |||
20092 | repeat_two_lz4_s2_emit_copy_short: | ||
20093 | SHLL $0x02, R10 | ||
20094 | ORL $0x01, R10 | ||
20095 | MOVW R10, (AX) | ||
20096 | ADDQ $0x02, AX | ||
20097 | JMP lz4_s2_loop | ||
20098 | |||
20099 | repeat_two_offset_lz4_s2_emit_copy_short: | ||
20100 | XORQ R8, R8 | ||
20101 | LEAL 1(R8)(R10*4), R10 | ||
20102 | MOVB R9, 1(AX) | ||
20103 | SARL $0x08, R9 | ||
20104 | SHLL $0x05, R9 | ||
20105 | ORL R9, R10 | ||
20106 | MOVB R10, (AX) | ||
20107 | ADDQ $0x02, AX | ||
20108 | JMP lz4_s2_loop | ||
20109 | |||
// length <= 64. R8 = length*4. When length < 12 and offset < 2048 the 2-byte
// form is used: first byte (length*4 - 15) OR'ed with (offset >> 8) << 5,
// second byte the low offset byte. Otherwise the 3-byte form below is used.
20110 | two_byte_offset_short_lz4_s2: | ||
20111 | MOVL R10, R8 | ||
20112 | SHLL $0x02, R8 | ||
20113 | CMPL R10, $0x0c | ||
20114 | JAE emit_copy_three_lz4_s2 | ||
20115 | CMPL R9, $0x00000800 | ||
20116 | JAE emit_copy_three_lz4_s2 | ||
20117 | LEAL -15(R8), R8 | ||
20118 | MOVB R9, 1(AX) | ||
20119 | SHRL $0x08, R9 | ||
20120 | SHLL $0x05, R9 | ||
20121 | ORL R9, R8 | ||
20122 | MOVB R8, (AX) | ||
20123 | ADDQ $0x02, AX | ||
20124 | JMP lz4_s2_loop | ||
20125 | |||
// 3-byte form: tag byte length*4 - 2, then the 16-bit offset.
20126 | emit_copy_three_lz4_s2: | ||
20127 | LEAL -2(R8), R8 | ||
20128 | MOVB R8, (AX) | ||
20129 | MOVW R9, 1(AX) | ||
20130 | ADDQ $0x03, AX | ||
20131 | JMP lz4_s2_loop | ||
20132 | |||
// Success: dstUsed = AX - dst_base, uncompressed = SI.
20133 | lz4_s2_done: | ||
20134 | MOVQ dst_base+0(FP), CX | ||
20135 | SUBQ CX, AX | ||
20136 | MOVQ SI, uncompressed+48(FP) | ||
20137 | MOVQ AX, dstUsed+56(FP) | ||
20138 | RET | ||
20139 | |||
// Malformed input: uncompressed = -1 (dstUsed is left unwritten).
20140 | lz4_s2_corrupt: | ||
20141 | XORQ AX, AX | ||
20142 | LEAQ -1(AX), SI | ||
20143 | MOVQ SI, uncompressed+48(FP) | ||
20144 | RET | ||
20145 | |||
// dst too small: uncompressed = -2 (dstUsed is left unwritten).
20146 | lz4_s2_dstfull: | ||
20147 | XORQ AX, AX | ||
20148 | LEAQ -2(AX), SI | ||
20149 | MOVQ SI, uncompressed+48(FP) | ||
20150 | RET | ||
20151 | |||
20152 | // func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) | ||
20153 | // Requires: SSE2 | ||
// Reads src one sequence at a time (a token byte, optional length extension bytes,
// literal bytes, then a 2-byte offset and optional match-length extension bytes) and
// writes a re-encoded literal/copy/repeat stream to dst.
// NOTE(review): judging by the name, src is an LZ4s block and the output is S2 - confirm
// against the Go caller.
//
// Results:
//   success:  uncompressed = sum of all literal and match lengths, dstUsed = bytes written.
//   corrupt:  uncompressed = -1 (dstUsed is not stored).
//   dst full: uncompressed = -2 (dstUsed is not stored).
//
// Registers held for the whole function:
//   AX = dst write pointer        CX = dst_base + dst_len - 10 (write limit)
//   DX = src read pointer         BX = src_base + src_len (end of input)
//   SI = uncompressed byte count  DI = offset of the previous copy (0 initially)
20154 | TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 | ||
20155 | XORQ SI, SI | ||
20156 | MOVQ dst_base+0(FP), AX | ||
20157 | MOVQ dst_len+8(FP), CX | ||
20158 | MOVQ src_base+24(FP), DX | ||
20159 | MOVQ src_len+32(FP), BX | ||
20160 | LEAQ (DX)(BX*1), BX | ||
20161 | LEAQ -10(AX)(CX*1), CX | ||
20162 | XORQ DI, DI | ||
20163 | |||
// Head of the per-sequence loop: input exhausted here is an error, as is dst at its limit.
20164 | lz4s_s2_loop: | ||
20165 | CMPQ DX, BX | ||
20166 | JAE lz4s_s2_corrupt | ||
20167 | CMPQ AX, CX | ||
20168 | JAE lz4s_s2_dstfull | ||
// Token byte: R9 = high nibble (literal length), R10 = low nibble (match length).
// A token >= 0xf0 (high nibble 15) means literal-length extension bytes follow.
20169 | MOVBQZX (DX), R8 | ||
20170 | MOVQ R8, R9 | ||
20171 | MOVQ R8, R10 | ||
20172 | SHRQ $0x04, R9 | ||
20173 | ANDQ $0x0f, R10 | ||
20174 | CMPQ R8, $0xf0 | ||
20175 | JB lz4s_s2_ll_end | ||
20176 | |||
// Add each following byte to the literal length; continue while the byte read is 0xff.
20177 | lz4s_s2_ll_loop: | ||
20178 | INCQ DX | ||
20179 | CMPQ DX, BX | ||
20180 | JAE lz4s_s2_corrupt | ||
20181 | MOVBQZX (DX), R8 | ||
20182 | ADDQ R8, R9 | ||
20183 | CMPQ R8, $0xff | ||
20184 | JEQ lz4s_s2_ll_loop | ||
20185 | |||
// DX still points at the last length byte. R8 = DX + literal length must be inside src.
// After the two INCQs, DX = first literal byte and R8 = first byte after the literals.
// R10 += 3 converts the nibble to a match length; a nibble of 0 (R10 == 3) means "no match".
20186 | lz4s_s2_ll_end: | ||
20187 | LEAQ (DX)(R9*1), R8 | ||
20188 | ADDQ $0x03, R10 | ||
20189 | CMPQ R8, BX | ||
20190 | JAE lz4s_s2_corrupt | ||
20191 | INCQ DX | ||
20192 | INCQ R8 | ||
20193 | TESTQ R9, R9 | ||
20194 | JZ lz4s_s2_lits_done | ||
// The literals must end below the write limit. Then emit the literal header with
// R11 = length-1: <60 -> one byte (R11<<2); <0x100 -> 0xf0 + 1 byte; <0x10000 -> 0xf4 + 2 bytes;
// <0x1000000 -> 0xf8 + 3 bytes; otherwise 0xfc + 4 bytes.
20195 | LEAQ (AX)(R9*1), R11 | ||
20196 | CMPQ R11, CX | ||
20197 | JAE lz4s_s2_dstfull | ||
20198 | ADDQ R9, SI | ||
20199 | LEAL -1(R9), R11 | ||
20200 | CMPL R11, $0x3c | ||
20201 | JB one_byte_lz4s_s2 | ||
20202 | CMPL R11, $0x00000100 | ||
20203 | JB two_bytes_lz4s_s2 | ||
20204 | CMPL R11, $0x00010000 | ||
20205 | JB three_bytes_lz4s_s2 | ||
20206 | CMPL R11, $0x01000000 | ||
20207 | JB four_bytes_lz4s_s2 | ||
20208 | MOVB $0xfc, (AX) | ||
20209 | MOVL R11, 1(AX) | ||
20210 | ADDQ $0x05, AX | ||
20211 | JMP memmove_long_lz4s_s2 | ||
20212 | |||
20213 | four_bytes_lz4s_s2: | ||
20214 | MOVL R11, R12 | ||
20215 | SHRL $0x10, R12 | ||
20216 | MOVB $0xf8, (AX) | ||
20217 | MOVW R11, 1(AX) | ||
20218 | MOVB R12, 3(AX) | ||
20219 | ADDQ $0x04, AX | ||
20220 | JMP memmove_long_lz4s_s2 | ||
20221 | |||
20222 | three_bytes_lz4s_s2: | ||
20223 | MOVB $0xf4, (AX) | ||
20224 | MOVW R11, 1(AX) | ||
20225 | ADDQ $0x03, AX | ||
20226 | JMP memmove_long_lz4s_s2 | ||
20227 | |||
// Two-byte header. Lengths of at most 64 (R11 < 0x40) still use the short copy below.
20228 | two_bytes_lz4s_s2: | ||
20229 | MOVB $0xf0, (AX) | ||
20230 | MOVB R11, 1(AX) | ||
20231 | ADDQ $0x02, AX | ||
20232 | CMPL R11, $0x40 | ||
20233 | JB memmove_lz4s_s2 | ||
20234 | JMP memmove_long_lz4s_s2 | ||
20235 | |||
20236 | one_byte_lz4s_s2: | ||
20237 | SHLB $0x02, R11 | ||
20238 | MOVB R11, (AX) | ||
20239 | ADDQ $0x01, AX | ||
20240 | |||
// Short literal copy (R9 <= 64 bytes from DX to AX). R11 = dst pointer after the literals.
// Each size class uses overlapping fixed-width loads/stores instead of a byte loop.
20241 | memmove_lz4s_s2: | ||
20242 | LEAQ (AX)(R9*1), R11 | ||
20243 | |||
20244 | // genMemMoveShort | ||
20245 | CMPQ R9, $0x08 | ||
20246 | JBE emit_lit_memmove_lz4s_s2_memmove_move_8 | ||
20247 | CMPQ R9, $0x10 | ||
20248 | JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16 | ||
20249 | CMPQ R9, $0x20 | ||
20250 | JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32 | ||
20251 | JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64 | ||
20252 | |||
// Lengths 1..8: always loads and stores a full 8 bytes, whatever R9 is.
// NOTE(review): this can touch up to 7 bytes past the literals in both src and dst;
// presumably the 10-byte dst margin and the caller's buffers allow for it - confirm.
20253 | emit_lit_memmove_lz4s_s2_memmove_move_8: | ||
20254 | MOVQ (DX), R12 | ||
20255 | MOVQ R12, (AX) | ||
20256 | JMP memmove_end_copy_lz4s_s2 | ||
20257 | |||
// DX is used as scratch here; it is reloaded from R8 at lz4s_s2_lits_emit_done.
20258 | emit_lit_memmove_lz4s_s2_memmove_move_8through16: | ||
20259 | MOVQ (DX), R12 | ||
20260 | MOVQ -8(DX)(R9*1), DX | ||
20261 | MOVQ R12, (AX) | ||
20262 | MOVQ DX, -8(AX)(R9*1) | ||
20263 | JMP memmove_end_copy_lz4s_s2 | ||
20264 | |||
20265 | emit_lit_memmove_lz4s_s2_memmove_move_17through32: | ||
20266 | MOVOU (DX), X0 | ||
20267 | MOVOU -16(DX)(R9*1), X1 | ||
20268 | MOVOU X0, (AX) | ||
20269 | MOVOU X1, -16(AX)(R9*1) | ||
20270 | JMP memmove_end_copy_lz4s_s2 | ||
20271 | |||
20272 | emit_lit_memmove_lz4s_s2_memmove_move_33through64: | ||
20273 | MOVOU (DX), X0 | ||
20274 | MOVOU 16(DX), X1 | ||
20275 | MOVOU -32(DX)(R9*1), X2 | ||
20276 | MOVOU -16(DX)(R9*1), X3 | ||
20277 | MOVOU X0, (AX) | ||
20278 | MOVOU X1, 16(AX) | ||
20279 | MOVOU X2, -32(AX)(R9*1) | ||
20280 | MOVOU X3, -16(AX)(R9*1) | ||
20281 | |||
20282 | memmove_end_copy_lz4s_s2: | ||
20283 | MOVQ R11, AX | ||
20284 | JMP lz4s_s2_lits_emit_done | ||
20285 | |||
// Long literal copy. The first and last 32 bytes are loaded up front (X0-X3) and stored
// last. The middle is copied 32 bytes per step; R14 = 64 - (AX & 31), so -32(AX)(R14*1)
// is 32-byte aligned, which is what the MOVOA stores rely on.
20286 | memmove_long_lz4s_s2: | ||
20287 | LEAQ (AX)(R9*1), R11 | ||
20288 | |||
20289 | // genMemMoveLong | ||
20290 | MOVOU (DX), X0 | ||
20291 | MOVOU 16(DX), X1 | ||
20292 | MOVOU -32(DX)(R9*1), X2 | ||
20293 | MOVOU -16(DX)(R9*1), X3 | ||
20294 | MOVQ R9, R13 | ||
20295 | SHRQ $0x05, R13 | ||
20296 | MOVQ AX, R12 | ||
20297 | ANDL $0x0000001f, R12 | ||
20298 | MOVQ $0x00000040, R14 | ||
20299 | SUBQ R12, R14 | ||
20300 | DECQ R13 | ||
20301 | JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 | ||
20302 | LEAQ -32(DX)(R14*1), R12 | ||
20303 | LEAQ -32(AX)(R14*1), R15 | ||
20304 | |||
20305 | emit_lit_memmove_long_lz4s_s2large_big_loop_back: | ||
20306 | MOVOU (R12), X4 | ||
20307 | MOVOU 16(R12), X5 | ||
20308 | MOVOA X4, (R15) | ||
20309 | MOVOA X5, 16(R15) | ||
20310 | ADDQ $0x20, R15 | ||
20311 | ADDQ $0x20, R12 | ||
20312 | ADDQ $0x20, R14 | ||
20313 | DECQ R13 | ||
20314 | JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back | ||
20315 | |||
20316 | emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32: | ||
20317 | MOVOU -32(DX)(R14*1), X4 | ||
20318 | MOVOU -16(DX)(R14*1), X5 | ||
20319 | MOVOA X4, -32(AX)(R14*1) | ||
20320 | MOVOA X5, -16(AX)(R14*1) | ||
20321 | ADDQ $0x20, R14 | ||
20322 | CMPQ R9, R14 | ||
20323 | JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 | ||
20324 | MOVOU X0, (AX) | ||
20325 | MOVOU X1, 16(AX) | ||
20326 | MOVOU X2, -32(AX)(R9*1) | ||
20327 | MOVOU X3, -16(AX)(R9*1) | ||
20328 | MOVQ R11, AX | ||
20329 | |||
// Literals written: advance the src pointer past them.
20330 | lz4s_s2_lits_emit_done: | ||
20331 | MOVQ R8, DX | ||
20332 | |||
// If the literals ended exactly at the end of src, the block is complete only when the
// match nibble was 0 (R10 == 3); any other value is reported as corrupt.
20333 | lz4s_s2_lits_done: | ||
20334 | CMPQ DX, BX | ||
20335 | JNE lz4s_s2_match | ||
20336 | CMPQ R10, $0x03 | ||
20337 | JEQ lz4s_s2_done | ||
20338 | JMP lz4s_s2_corrupt | ||
20339 | |||
// R10 == 3: this sequence has no match, continue with the next token.
// Otherwise read the 2-byte offset into R9 (at least one more byte must follow it) and
// reject an offset of zero or one larger than the bytes produced so far (SI).
// R10 == 0x12 (nibble 15) means match-length extension bytes follow.
20340 | lz4s_s2_match: | ||
20341 | CMPQ R10, $0x03 | ||
20342 | JEQ lz4s_s2_loop | ||
20343 | LEAQ 2(DX), R8 | ||
20344 | CMPQ R8, BX | ||
20345 | JAE lz4s_s2_corrupt | ||
20346 | MOVWQZX (DX), R9 | ||
20347 | MOVQ R8, DX | ||
20348 | TESTQ R9, R9 | ||
20349 | JZ lz4s_s2_corrupt | ||
20350 | CMPQ R9, SI | ||
20351 | JA lz4s_s2_corrupt | ||
20352 | CMPQ R10, $0x12 | ||
20353 | JNE lz4s_s2_ml_done | ||
20354 | |||
// Add each extension byte to the match length; continue while the byte read is 0xff.
// Reaching the end of src inside or right after the extension is reported as corrupt.
20355 | lz4s_s2_ml_loop: | ||
20356 | MOVBQZX (DX), R8 | ||
20357 | INCQ DX | ||
20358 | ADDQ R8, R10 | ||
20359 | CMPQ DX, BX | ||
20360 | JAE lz4s_s2_corrupt | ||
20361 | CMPQ R8, $0xff | ||
20362 | JEQ lz4s_s2_ml_loop | ||
20363 | |||
// Account for the match. If its offset equals the previous copy's offset (DI) fall into
// the repeat encoder, otherwise emit a copy with an explicit offset. DI starts at 0 and
// a zero offset was rejected above, so the first match always takes the copy path.
20364 | lz4s_s2_ml_done: | ||
20365 | ADDQ R10, SI | ||
20366 | CMPQ R9, DI | ||
20367 | JNE lz4s_s2_docopy | ||
20368 | |||
20369 | // emitRepeat | ||
// R8 = length, R10 = length-4. Length <= 8: 2-byte form. Length < 12 with offset < 2048:
// 2-byte form carrying the offset. Otherwise a 3-, 4- or 5-byte form chosen by R10; if R10
// is too large for the 5-byte form, emit the bytes 1d 00 fb ff ff, reduce R10 by 16842747
// and go round again.
20370 | emit_repeat_again_lz4_s2: | ||
20371 | MOVL R10, R8 | ||
20372 | LEAL -4(R10), R10 | ||
20373 | CMPL R8, $0x08 | ||
20374 | JBE repeat_two_lz4_s2 | ||
20375 | CMPL R8, $0x0c | ||
20376 | JAE cant_repeat_two_offset_lz4_s2 | ||
20377 | CMPL R9, $0x00000800 | ||
20378 | JB repeat_two_offset_lz4_s2 | ||
20379 | |||
20380 | cant_repeat_two_offset_lz4_s2: | ||
20381 | CMPL R10, $0x00000104 | ||
20382 | JB repeat_three_lz4_s2 | ||
20383 | CMPL R10, $0x00010100 | ||
20384 | JB repeat_four_lz4_s2 | ||
20385 | CMPL R10, $0x0100ffff | ||
20386 | JB repeat_five_lz4_s2 | ||
20387 | LEAL -16842747(R10), R10 | ||
20388 | MOVL $0xfffb001d, (AX) | ||
20389 | MOVB $0xff, 4(AX) | ||
20390 | ADDQ $0x05, AX | ||
20391 | JMP emit_repeat_again_lz4_s2 | ||
20392 | |||
20393 | repeat_five_lz4_s2: | ||
20394 | LEAL -65536(R10), R10 | ||
20395 | MOVL R10, R9 | ||
20396 | MOVW $0x001d, (AX) | ||
20397 | MOVW R10, 2(AX) | ||
20398 | SARL $0x10, R9 | ||
20399 | MOVB R9, 4(AX) | ||
20400 | ADDQ $0x05, AX | ||
20401 | JMP lz4s_s2_loop | ||
20402 | |||
20403 | repeat_four_lz4_s2: | ||
20404 | LEAL -256(R10), R10 | ||
20405 | MOVW $0x0019, (AX) | ||
20406 | MOVW R10, 2(AX) | ||
20407 | ADDQ $0x04, AX | ||
20408 | JMP lz4s_s2_loop | ||
20409 | |||
20410 | repeat_three_lz4_s2: | ||
20411 | LEAL -4(R10), R10 | ||
20412 | MOVW $0x0015, (AX) | ||
20413 | MOVB R10, 2(AX) | ||
20414 | ADDQ $0x03, AX | ||
20415 | JMP lz4s_s2_loop | ||
20416 | |||
20417 | repeat_two_lz4_s2: | ||
20418 | SHLL $0x02, R10 | ||
20419 | ORL $0x01, R10 | ||
20420 | MOVW R10, (AX) | ||
20421 | ADDQ $0x02, AX | ||
20422 | JMP lz4s_s2_loop | ||
20423 | |||
20424 | repeat_two_offset_lz4_s2: | ||
20425 | XORQ R8, R8 | ||
20426 | LEAL 1(R8)(R10*4), R10 | ||
20427 | MOVB R9, 1(AX) | ||
20428 | SARL $0x08, R9 | ||
20429 | SHLL $0x05, R9 | ||
20430 | ORL R9, R10 | ||
20431 | MOVB R10, (AX) | ||
20432 | ADDQ $0x02, AX | ||
20433 | JMP lz4s_s2_loop | ||
20434 | |||
// New offset: remember it in DI for later repeat detection, then encode the copy.
// Length <= 64 goes to the short forms at two_byte_offset_short_lz4_s2.
// Longer with offset < 2048: write a 2-byte copy of length 8 (tag 0x11 | (offset>>8)<<5,
// then the low offset byte) and hand the remaining length to the repeat encoder,
// entering it past the two 2-byte forms.
20435 | lz4s_s2_docopy: | ||
20436 | MOVQ R9, DI | ||
20437 | |||
20438 | // emitCopy | ||
20439 | CMPL R10, $0x40 | ||
20440 | JBE two_byte_offset_short_lz4_s2 | ||
20441 | CMPL R9, $0x00000800 | ||
20442 | JAE long_offset_short_lz4_s2 | ||
20443 | MOVL $0x00000001, R8 | ||
20444 | LEAL 16(R8), R8 | ||
20445 | MOVB R9, 1(AX) | ||
20446 | MOVL R9, R11 | ||
20447 | SHRL $0x08, R11 | ||
20448 | SHLL $0x05, R11 | ||
20449 | ORL R11, R8 | ||
20450 | MOVB R8, (AX) | ||
20451 | ADDQ $0x02, AX | ||
20452 | SUBL $0x08, R10 | ||
20453 | |||
20454 | // emitRepeat | ||
20455 | LEAL -4(R10), R10 | ||
20456 | JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b | ||
20457 | |||
20458 | emit_repeat_again_lz4_s2_emit_copy_short_2b: | ||
20459 | MOVL R10, R8 | ||
20460 | LEAL -4(R10), R10 | ||
20461 | CMPL R8, $0x08 | ||
20462 | JBE repeat_two_lz4_s2_emit_copy_short_2b | ||
20463 | CMPL R8, $0x0c | ||
20464 | JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b | ||
20465 | CMPL R9, $0x00000800 | ||
20466 | JB repeat_two_offset_lz4_s2_emit_copy_short_2b | ||
20467 | |||
20468 | cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: | ||
20469 | CMPL R10, $0x00000104 | ||
20470 | JB repeat_three_lz4_s2_emit_copy_short_2b | ||
20471 | CMPL R10, $0x00010100 | ||
20472 | JB repeat_four_lz4_s2_emit_copy_short_2b | ||
20473 | CMPL R10, $0x0100ffff | ||
20474 | JB repeat_five_lz4_s2_emit_copy_short_2b | ||
20475 | LEAL -16842747(R10), R10 | ||
20476 | MOVL $0xfffb001d, (AX) | ||
20477 | MOVB $0xff, 4(AX) | ||
20478 | ADDQ $0x05, AX | ||
20479 | JMP emit_repeat_again_lz4_s2_emit_copy_short_2b | ||
20480 | |||
20481 | repeat_five_lz4_s2_emit_copy_short_2b: | ||
20482 | LEAL -65536(R10), R10 | ||
20483 | MOVL R10, R9 | ||
20484 | MOVW $0x001d, (AX) | ||
20485 | MOVW R10, 2(AX) | ||
20486 | SARL $0x10, R9 | ||
20487 | MOVB R9, 4(AX) | ||
20488 | ADDQ $0x05, AX | ||
20489 | JMP lz4s_s2_loop | ||
20490 | |||
20491 | repeat_four_lz4_s2_emit_copy_short_2b: | ||
20492 | LEAL -256(R10), R10 | ||
20493 | MOVW $0x0019, (AX) | ||
20494 | MOVW R10, 2(AX) | ||
20495 | ADDQ $0x04, AX | ||
20496 | JMP lz4s_s2_loop | ||
20497 | |||
20498 | repeat_three_lz4_s2_emit_copy_short_2b: | ||
20499 | LEAL -4(R10), R10 | ||
20500 | MOVW $0x0015, (AX) | ||
20501 | MOVB R10, 2(AX) | ||
20502 | ADDQ $0x03, AX | ||
20503 | JMP lz4s_s2_loop | ||
20504 | |||
20505 | repeat_two_lz4_s2_emit_copy_short_2b: | ||
20506 | SHLL $0x02, R10 | ||
20507 | ORL $0x01, R10 | ||
20508 | MOVW R10, (AX) | ||
20509 | ADDQ $0x02, AX | ||
20510 | JMP lz4s_s2_loop | ||
20511 | |||
20512 | repeat_two_offset_lz4_s2_emit_copy_short_2b: | ||
20513 | XORQ R8, R8 | ||
20514 | LEAL 1(R8)(R10*4), R10 | ||
20515 | MOVB R9, 1(AX) | ||
20516 | SARL $0x08, R9 | ||
20517 | SHLL $0x05, R9 | ||
20518 | ORL R9, R10 | ||
20519 | MOVB R10, (AX) | ||
20520 | ADDQ $0x02, AX | ||
20521 | JMP lz4s_s2_loop | ||
20522 | |||
// Length > 64 with offset >= 2048: write tag 0xee plus the 2-byte offset (a 3-byte copy
// that accounts for 60 bytes), then encode the remaining length as a repeat.
20523 | long_offset_short_lz4_s2: | ||
20524 | MOVB $0xee, (AX) | ||
20525 | MOVW R9, 1(AX) | ||
20526 | LEAL -60(R10), R10 | ||
20527 | ADDQ $0x03, AX | ||
20528 | |||
20529 | // emitRepeat | ||
20530 | emit_repeat_again_lz4_s2_emit_copy_short: | ||
20531 | MOVL R10, R8 | ||
20532 | LEAL -4(R10), R10 | ||
20533 | CMPL R8, $0x08 | ||
20534 | JBE repeat_two_lz4_s2_emit_copy_short | ||
20535 | CMPL R8, $0x0c | ||
20536 | JAE cant_repeat_two_offset_lz4_s2_emit_copy_short | ||
20537 | CMPL R9, $0x00000800 | ||
20538 | JB repeat_two_offset_lz4_s2_emit_copy_short | ||
20539 | |||
20540 | cant_repeat_two_offset_lz4_s2_emit_copy_short: | ||
20541 | CMPL R10, $0x00000104 | ||
20542 | JB repeat_three_lz4_s2_emit_copy_short | ||
20543 | CMPL R10, $0x00010100 | ||
20544 | JB repeat_four_lz4_s2_emit_copy_short | ||
20545 | CMPL R10, $0x0100ffff | ||
20546 | JB repeat_five_lz4_s2_emit_copy_short | ||
20547 | LEAL -16842747(R10), R10 | ||
20548 | MOVL $0xfffb001d, (AX) | ||
20549 | MOVB $0xff, 4(AX) | ||
20550 | ADDQ $0x05, AX | ||
20551 | JMP emit_repeat_again_lz4_s2_emit_copy_short | ||
20552 | |||
20553 | repeat_five_lz4_s2_emit_copy_short: | ||
20554 | LEAL -65536(R10), R10 | ||
20555 | MOVL R10, R9 | ||
20556 | MOVW $0x001d, (AX) | ||
20557 | MOVW R10, 2(AX) | ||
20558 | SARL $0x10, R9 | ||
20559 | MOVB R9, 4(AX) | ||
20560 | ADDQ $0x05, AX | ||
20561 | JMP lz4s_s2_loop | ||
20562 | |||
20563 | repeat_four_lz4_s2_emit_copy_short: | ||
20564 | LEAL -256(R10), R10 | ||
20565 | MOVW $0x0019, (AX) | ||
20566 | MOVW R10, 2(AX) | ||
20567 | ADDQ $0x04, AX | ||
20568 | JMP lz4s_s2_loop | ||
20569 | |||
20570 | repeat_three_lz4_s2_emit_copy_short: | ||
20571 | LEAL -4(R10), R10 | ||
20572 | MOVW $0x0015, (AX) | ||
20573 | MOVB R10, 2(AX) | ||
20574 | ADDQ $0x03, AX | ||
20575 | JMP lz4s_s2_loop | ||
20576 | |||
20577 | repeat_two_lz4_s2_emit_copy_short: | ||
20578 | SHLL $0x02, R10 | ||
20579 | ORL $0x01, R10 | ||
20580 | MOVW R10, (AX) | ||
20581 | ADDQ $0x02, AX | ||
20582 | JMP lz4s_s2_loop | ||
20583 | |||
20584 | repeat_two_offset_lz4_s2_emit_copy_short: | ||
20585 | XORQ R8, R8 | ||
20586 | LEAL 1(R8)(R10*4), R10 | ||
20587 | MOVB R9, 1(AX) | ||
20588 | SARL $0x08, R9 | ||
20589 | SHLL $0x05, R9 | ||
20590 | ORL R9, R10 | ||
20591 | MOVB R10, (AX) | ||
20592 | ADDQ $0x02, AX | ||
20593 | JMP lz4s_s2_loop | ||
20594 | |||
// Copy of length <= 64. R8 = length<<2. Length < 12 with offset < 2048 uses the 2-byte
// form: tag = (length<<2) - 15 | (offset>>8)<<5, followed by the low offset byte.
// Everything else uses the 3-byte form below.
20595 | two_byte_offset_short_lz4_s2: | ||
20596 | MOVL R10, R8 | ||
20597 | SHLL $0x02, R8 | ||
20598 | CMPL R10, $0x0c | ||
20599 | JAE emit_copy_three_lz4_s2 | ||
20600 | CMPL R9, $0x00000800 | ||
20601 | JAE emit_copy_three_lz4_s2 | ||
20602 | LEAL -15(R8), R8 | ||
20603 | MOVB R9, 1(AX) | ||
20604 | SHRL $0x08, R9 | ||
20605 | SHLL $0x05, R9 | ||
20606 | ORL R9, R8 | ||
20607 | MOVB R8, (AX) | ||
20608 | ADDQ $0x02, AX | ||
20609 | JMP lz4s_s2_loop | ||
20610 | |||
// 3-byte form: tag = (length<<2) - 2, followed by the 16-bit offset.
20611 | emit_copy_three_lz4_s2: | ||
20612 | LEAL -2(R8), R8 | ||
20613 | MOVB R8, (AX) | ||
20614 | MOVW R9, 1(AX) | ||
20615 | ADDQ $0x03, AX | ||
20616 | JMP lz4s_s2_loop | ||
20617 | |||
// Success: dstUsed = AX - dst_base, uncompressed = SI.
20618 | lz4s_s2_done: | ||
20619 | MOVQ dst_base+0(FP), CX | ||
20620 | SUBQ CX, AX | ||
20621 | MOVQ SI, uncompressed+48(FP) | ||
20622 | MOVQ AX, dstUsed+56(FP) | ||
20623 | RET | ||
20624 | |||
// Malformed input: uncompressed = -1; the dstUsed result slot is left unwritten.
20625 | lz4s_s2_corrupt: | ||
20626 | XORQ AX, AX | ||
20627 | LEAQ -1(AX), SI | ||
20628 | MOVQ SI, uncompressed+48(FP) | ||
20629 | RET | ||
20630 | |||
// Output limit reached: uncompressed = -2; the dstUsed result slot is left unwritten.
20631 | lz4s_s2_dstfull: | ||
20632 | XORQ AX, AX | ||
20633 | LEAQ -2(AX), SI | ||
20634 | MOVQ SI, uncompressed+48(FP) | ||
20635 | RET | ||
20636 | |||
20637 | // func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) | ||
20638 | // Requires: SSE2 | ||
// Reads src one sequence at a time (a token byte, optional length extension bytes,
// literal bytes, then a 2-byte offset and optional match-length extension bytes) and
// writes a re-encoded literal/copy stream to dst. No repeat codes are emitted here.
// NOTE(review): judging by the name, src is an LZ4 block and the output is Snappy - confirm
// against the Go caller.
//
// Results:
//   success:  uncompressed = sum of all literal and match lengths, dstUsed = bytes written.
//   corrupt:  uncompressed = -1 (dstUsed is not stored).
//   dst full: uncompressed = -2 (dstUsed is not stored).
//
// Registers held for the whole function:
//   AX = dst write pointer        CX = dst_base + dst_len - 10 (write limit)
//   DX = src read pointer         BX = src_base + src_len (end of input)
//   SI = uncompressed byte count
20639 | TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 | ||
20640 | XORQ SI, SI | ||
20641 | MOVQ dst_base+0(FP), AX | ||
20642 | MOVQ dst_len+8(FP), CX | ||
20643 | MOVQ src_base+24(FP), DX | ||
20644 | MOVQ src_len+32(FP), BX | ||
20645 | LEAQ (DX)(BX*1), BX | ||
20646 | LEAQ -10(AX)(CX*1), CX | ||
20647 | |||
// Head of the per-sequence loop: input exhausted here is an error, as is dst at its limit.
20648 | lz4_snappy_loop: | ||
20649 | CMPQ DX, BX | ||
20650 | JAE lz4_snappy_corrupt | ||
20651 | CMPQ AX, CX | ||
20652 | JAE lz4_snappy_dstfull | ||
// Token byte: R8 = high nibble (literal length), R9 = low nibble (match length).
// A token >= 0xf0 (high nibble 15) means literal-length extension bytes follow.
20653 | MOVBQZX (DX), DI | ||
20654 | MOVQ DI, R8 | ||
20655 | MOVQ DI, R9 | ||
20656 | SHRQ $0x04, R8 | ||
20657 | ANDQ $0x0f, R9 | ||
20658 | CMPQ DI, $0xf0 | ||
20659 | JB lz4_snappy_ll_end | ||
20660 | |||
// Add each following byte to the literal length; continue while the byte read is 0xff.
20661 | lz4_snappy_ll_loop: | ||
20662 | INCQ DX | ||
20663 | CMPQ DX, BX | ||
20664 | JAE lz4_snappy_corrupt | ||
20665 | MOVBQZX (DX), DI | ||
20666 | ADDQ DI, R8 | ||
20667 | CMPQ DI, $0xff | ||
20668 | JEQ lz4_snappy_ll_loop | ||
20669 | |||
// DX still points at the last length byte. DI = DX + literal length must be inside src.
// After the two INCQs, DX = first literal byte and DI = first byte after the literals.
// R9 += 4 converts the nibble to a match length.
20670 | lz4_snappy_ll_end: | ||
20671 | LEAQ (DX)(R8*1), DI | ||
20672 | ADDQ $0x04, R9 | ||
20673 | CMPQ DI, BX | ||
20674 | JAE lz4_snappy_corrupt | ||
20675 | INCQ DX | ||
20676 | INCQ DI | ||
20677 | TESTQ R8, R8 | ||
20678 | JZ lz4_snappy_lits_done | ||
// The literals must end below the write limit. Then emit the literal header with
// R10 = length-1: <60 -> one byte (R10<<2); <0x100 -> 0xf0 + 1 byte; <0x10000 -> 0xf4 + 2 bytes;
// <0x1000000 -> 0xf8 + 3 bytes; otherwise 0xfc + 4 bytes.
20679 | LEAQ (AX)(R8*1), R10 | ||
20680 | CMPQ R10, CX | ||
20681 | JAE lz4_snappy_dstfull | ||
20682 | ADDQ R8, SI | ||
20683 | LEAL -1(R8), R10 | ||
20684 | CMPL R10, $0x3c | ||
20685 | JB one_byte_lz4_snappy | ||
20686 | CMPL R10, $0x00000100 | ||
20687 | JB two_bytes_lz4_snappy | ||
20688 | CMPL R10, $0x00010000 | ||
20689 | JB three_bytes_lz4_snappy | ||
20690 | CMPL R10, $0x01000000 | ||
20691 | JB four_bytes_lz4_snappy | ||
20692 | MOVB $0xfc, (AX) | ||
20693 | MOVL R10, 1(AX) | ||
20694 | ADDQ $0x05, AX | ||
20695 | JMP memmove_long_lz4_snappy | ||
20696 | |||
20697 | four_bytes_lz4_snappy: | ||
20698 | MOVL R10, R11 | ||
20699 | SHRL $0x10, R11 | ||
20700 | MOVB $0xf8, (AX) | ||
20701 | MOVW R10, 1(AX) | ||
20702 | MOVB R11, 3(AX) | ||
20703 | ADDQ $0x04, AX | ||
20704 | JMP memmove_long_lz4_snappy | ||
20705 | |||
20706 | three_bytes_lz4_snappy: | ||
20707 | MOVB $0xf4, (AX) | ||
20708 | MOVW R10, 1(AX) | ||
20709 | ADDQ $0x03, AX | ||
20710 | JMP memmove_long_lz4_snappy | ||
20711 | |||
// Two-byte header. Lengths of at most 64 (R10 < 0x40) still use the short copy below.
20712 | two_bytes_lz4_snappy: | ||
20713 | MOVB $0xf0, (AX) | ||
20714 | MOVB R10, 1(AX) | ||
20715 | ADDQ $0x02, AX | ||
20716 | CMPL R10, $0x40 | ||
20717 | JB memmove_lz4_snappy | ||
20718 | JMP memmove_long_lz4_snappy | ||
20719 | |||
20720 | one_byte_lz4_snappy: | ||
20721 | SHLB $0x02, R10 | ||
20722 | MOVB R10, (AX) | ||
20723 | ADDQ $0x01, AX | ||
20724 | |||
// Short literal copy (R8 <= 64 bytes from DX to AX). R10 = dst pointer after the literals.
// Each size class uses overlapping fixed-width loads/stores instead of a byte loop.
20725 | memmove_lz4_snappy: | ||
20726 | LEAQ (AX)(R8*1), R10 | ||
20727 | |||
20728 | // genMemMoveShort | ||
20729 | CMPQ R8, $0x08 | ||
20730 | JBE emit_lit_memmove_lz4_snappy_memmove_move_8 | ||
20731 | CMPQ R8, $0x10 | ||
20732 | JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16 | ||
20733 | CMPQ R8, $0x20 | ||
20734 | JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32 | ||
20735 | JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64 | ||
20736 | |||
// Lengths 1..8: always loads and stores a full 8 bytes, whatever R8 is.
// NOTE(review): this can touch up to 7 bytes past the literals in both src and dst;
// presumably the 10-byte dst margin and the caller's buffers allow for it - confirm.
20737 | emit_lit_memmove_lz4_snappy_memmove_move_8: | ||
20738 | MOVQ (DX), R11 | ||
20739 | MOVQ R11, (AX) | ||
20740 | JMP memmove_end_copy_lz4_snappy | ||
20741 | |||
// DX is used as scratch here; it is reloaded from DI at lz4_snappy_lits_emit_done.
20742 | emit_lit_memmove_lz4_snappy_memmove_move_8through16: | ||
20743 | MOVQ (DX), R11 | ||
20744 | MOVQ -8(DX)(R8*1), DX | ||
20745 | MOVQ R11, (AX) | ||
20746 | MOVQ DX, -8(AX)(R8*1) | ||
20747 | JMP memmove_end_copy_lz4_snappy | ||
20748 | |||
20749 | emit_lit_memmove_lz4_snappy_memmove_move_17through32: | ||
20750 | MOVOU (DX), X0 | ||
20751 | MOVOU -16(DX)(R8*1), X1 | ||
20752 | MOVOU X0, (AX) | ||
20753 | MOVOU X1, -16(AX)(R8*1) | ||
20754 | JMP memmove_end_copy_lz4_snappy | ||
20755 | |||
20756 | emit_lit_memmove_lz4_snappy_memmove_move_33through64: | ||
20757 | MOVOU (DX), X0 | ||
20758 | MOVOU 16(DX), X1 | ||
20759 | MOVOU -32(DX)(R8*1), X2 | ||
20760 | MOVOU -16(DX)(R8*1), X3 | ||
20761 | MOVOU X0, (AX) | ||
20762 | MOVOU X1, 16(AX) | ||
20763 | MOVOU X2, -32(AX)(R8*1) | ||
20764 | MOVOU X3, -16(AX)(R8*1) | ||
20765 | |||
20766 | memmove_end_copy_lz4_snappy: | ||
20767 | MOVQ R10, AX | ||
20768 | JMP lz4_snappy_lits_emit_done | ||
20769 | |||
// Long literal copy. The first and last 32 bytes are loaded up front (X0-X3) and stored
// last. The middle is copied 32 bytes per step; R13 = 64 - (AX & 31), so -32(AX)(R13*1)
// is 32-byte aligned, which is what the MOVOA stores rely on.
20770 | memmove_long_lz4_snappy: | ||
20771 | LEAQ (AX)(R8*1), R10 | ||
20772 | |||
20773 | // genMemMoveLong | ||
20774 | MOVOU (DX), X0 | ||
20775 | MOVOU 16(DX), X1 | ||
20776 | MOVOU -32(DX)(R8*1), X2 | ||
20777 | MOVOU -16(DX)(R8*1), X3 | ||
20778 | MOVQ R8, R12 | ||
20779 | SHRQ $0x05, R12 | ||
20780 | MOVQ AX, R11 | ||
20781 | ANDL $0x0000001f, R11 | ||
20782 | MOVQ $0x00000040, R13 | ||
20783 | SUBQ R11, R13 | ||
20784 | DECQ R12 | ||
20785 | JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 | ||
20786 | LEAQ -32(DX)(R13*1), R11 | ||
20787 | LEAQ -32(AX)(R13*1), R14 | ||
20788 | |||
20789 | emit_lit_memmove_long_lz4_snappylarge_big_loop_back: | ||
20790 | MOVOU (R11), X4 | ||
20791 | MOVOU 16(R11), X5 | ||
20792 | MOVOA X4, (R14) | ||
20793 | MOVOA X5, 16(R14) | ||
20794 | ADDQ $0x20, R14 | ||
20795 | ADDQ $0x20, R11 | ||
20796 | ADDQ $0x20, R13 | ||
20797 | DECQ R12 | ||
20798 | JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back | ||
20799 | |||
20800 | emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32: | ||
20801 | MOVOU -32(DX)(R13*1), X4 | ||
20802 | MOVOU -16(DX)(R13*1), X5 | ||
20803 | MOVOA X4, -32(AX)(R13*1) | ||
20804 | MOVOA X5, -16(AX)(R13*1) | ||
20805 | ADDQ $0x20, R13 | ||
20806 | CMPQ R8, R13 | ||
20807 | JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 | ||
20808 | MOVOU X0, (AX) | ||
20809 | MOVOU X1, 16(AX) | ||
20810 | MOVOU X2, -32(AX)(R8*1) | ||
20811 | MOVOU X3, -16(AX)(R8*1) | ||
20812 | MOVQ R10, AX | ||
20813 | |||
// Literals written: advance the src pointer past them.
20814 | lz4_snappy_lits_emit_done: | ||
20815 | MOVQ DI, DX | ||
20816 | |||
// If the literals ended exactly at the end of src, the block is complete only when the
// match nibble was 0 (R9 == 4); any other value is reported as corrupt.
20817 | lz4_snappy_lits_done: | ||
20818 | CMPQ DX, BX | ||
20819 | JNE lz4_snappy_match | ||
20820 | CMPQ R9, $0x04 | ||
20821 | JEQ lz4_snappy_done | ||
20822 | JMP lz4_snappy_corrupt | ||
20823 | |||
// Read the 2-byte offset into R8 (at least one more byte must follow it) and reject an
// offset of zero or one larger than the bytes produced so far (SI).
// R9 == 0x13 (nibble 15) means match-length extension bytes follow.
20824 | lz4_snappy_match: | ||
20825 | LEAQ 2(DX), DI | ||
20826 | CMPQ DI, BX | ||
20827 | JAE lz4_snappy_corrupt | ||
20828 | MOVWQZX (DX), R8 | ||
20829 | MOVQ DI, DX | ||
20830 | TESTQ R8, R8 | ||
20831 | JZ lz4_snappy_corrupt | ||
20832 | CMPQ R8, SI | ||
20833 | JA lz4_snappy_corrupt | ||
20834 | CMPQ R9, $0x13 | ||
20835 | JNE lz4_snappy_ml_done | ||
20836 | |||
// Add each extension byte to the match length; continue while the byte read is 0xff.
// Reaching the end of src inside or right after the extension is reported as corrupt.
20837 | lz4_snappy_ml_loop: | ||
20838 | MOVBQZX (DX), DI | ||
20839 | INCQ DX | ||
20840 | ADDQ DI, R9 | ||
20841 | CMPQ DX, BX | ||
20842 | JAE lz4_snappy_corrupt | ||
20843 | CMPQ DI, $0xff | ||
20844 | JEQ lz4_snappy_ml_loop | ||
20845 | |||
20846 | lz4_snappy_ml_done: | ||
20847 | ADDQ R9, SI | ||
20848 | |||
20849 | // emitCopy | ||
// While the remaining length exceeds 64, write tag 0xee plus the 2-byte offset (a 3-byte
// copy that accounts for 60 bytes). If dst reaches its limit mid-match, jump to the loop
// head, whose bounds checks then return an error code (the rest of the match is dropped).
20850 | two_byte_offset_lz4_s2: | ||
20851 | CMPL R9, $0x40 | ||
20852 | JBE two_byte_offset_short_lz4_s2 | ||
20853 | MOVB $0xee, (AX) | ||
20854 | MOVW R8, 1(AX) | ||
20855 | LEAL -60(R9), R9 | ||
20856 | ADDQ $0x03, AX | ||
20857 | CMPQ AX, CX | ||
20858 | JAE lz4_snappy_loop | ||
20859 | JMP two_byte_offset_lz4_s2 | ||
20860 | |||
// Copy of length <= 64. DI = length<<2. Length < 12 with offset < 2048 uses the 2-byte
// form: tag = (length<<2) - 15 | (offset>>8)<<5, followed by the low offset byte.
// Everything else uses the 3-byte form below.
20861 | two_byte_offset_short_lz4_s2: | ||
20862 | MOVL R9, DI | ||
20863 | SHLL $0x02, DI | ||
20864 | CMPL R9, $0x0c | ||
20865 | JAE emit_copy_three_lz4_s2 | ||
20866 | CMPL R8, $0x00000800 | ||
20867 | JAE emit_copy_three_lz4_s2 | ||
20868 | LEAL -15(DI), DI | ||
20869 | MOVB R8, 1(AX) | ||
20870 | SHRL $0x08, R8 | ||
20871 | SHLL $0x05, R8 | ||
20872 | ORL R8, DI | ||
20873 | MOVB DI, (AX) | ||
20874 | ADDQ $0x02, AX | ||
20875 | JMP lz4_snappy_loop | ||
20876 | |||
// 3-byte form: tag = (length<<2) - 2, followed by the 16-bit offset.
20877 | emit_copy_three_lz4_s2: | ||
20878 | LEAL -2(DI), DI | ||
20879 | MOVB DI, (AX) | ||
20880 | MOVW R8, 1(AX) | ||
20881 | ADDQ $0x03, AX | ||
20882 | JMP lz4_snappy_loop | ||
20883 | |||
// Success: dstUsed = AX - dst_base, uncompressed = SI.
20884 | lz4_snappy_done: | ||
20885 | MOVQ dst_base+0(FP), CX | ||
20886 | SUBQ CX, AX | ||
20887 | MOVQ SI, uncompressed+48(FP) | ||
20888 | MOVQ AX, dstUsed+56(FP) | ||
20889 | RET | ||
20890 | |||
// Malformed input: uncompressed = -1; the dstUsed result slot is left unwritten.
20891 | lz4_snappy_corrupt: | ||
20892 | XORQ AX, AX | ||
20893 | LEAQ -1(AX), SI | ||
20894 | MOVQ SI, uncompressed+48(FP) | ||
20895 | RET | ||
20896 | |||
// Output limit reached: uncompressed = -2; the dstUsed result slot is left unwritten.
20897 | lz4_snappy_dstfull: | ||
20898 | XORQ AX, AX | ||
20899 | LEAQ -2(AX), SI | ||
20900 | MOVQ SI, uncompressed+48(FP) | ||
20901 | RET | ||
20902 | |||
20903 | // func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) | ||
20904 | // Requires: SSE2 | ||
// Reads src one sequence at a time (a token byte, optional length extension bytes,
// literal bytes, then - unless the match nibble is 0 - a 2-byte offset and optional
// match-length extension bytes) and writes a re-encoded literal/copy stream to dst.
// No repeat codes are emitted here.
// NOTE(review): judging by the name, src is an LZ4s block and the output is Snappy - confirm
// against the Go caller.
//
// Results:
//   success:  uncompressed = sum of all literal and match lengths, dstUsed = bytes written.
//   corrupt:  uncompressed = -1 (dstUsed is not stored).
//   dst full: uncompressed = -2 (dstUsed is not stored).
//
// Registers held for the whole function:
//   AX = dst write pointer        CX = dst_base + dst_len - 10 (write limit)
//   DX = src read pointer         BX = src_base + src_len (end of input)
//   SI = uncompressed byte count
20905 | TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64 | ||
20906 | XORQ SI, SI | ||
20907 | MOVQ dst_base+0(FP), AX | ||
20908 | MOVQ dst_len+8(FP), CX | ||
20909 | MOVQ src_base+24(FP), DX | ||
20910 | MOVQ src_len+32(FP), BX | ||
20911 | LEAQ (DX)(BX*1), BX | ||
20912 | LEAQ -10(AX)(CX*1), CX | ||
20913 | |||
// Head of the per-sequence loop: input exhausted here is an error, as is dst at its limit.
20914 | lz4s_snappy_loop: | ||
20915 | CMPQ DX, BX | ||
20916 | JAE lz4s_snappy_corrupt | ||
20917 | CMPQ AX, CX | ||
20918 | JAE lz4s_snappy_dstfull | ||
// Token byte: R8 = high nibble (literal length), R9 = low nibble (match length).
// A token >= 0xf0 (high nibble 15) means literal-length extension bytes follow.
20919 | MOVBQZX (DX), DI | ||
20920 | MOVQ DI, R8 | ||
20921 | MOVQ DI, R9 | ||
20922 | SHRQ $0x04, R8 | ||
20923 | ANDQ $0x0f, R9 | ||
20924 | CMPQ DI, $0xf0 | ||
20925 | JB lz4s_snappy_ll_end | ||
20926 | |||
// Add each following byte to the literal length; continue while the byte read is 0xff.
20927 | lz4s_snappy_ll_loop: | ||
20928 | INCQ DX | ||
20929 | CMPQ DX, BX | ||
20930 | JAE lz4s_snappy_corrupt | ||
20931 | MOVBQZX (DX), DI | ||
20932 | ADDQ DI, R8 | ||
20933 | CMPQ DI, $0xff | ||
20934 | JEQ lz4s_snappy_ll_loop | ||
20935 | |||
// DX still points at the last length byte. DI = DX + literal length must be inside src.
// After the two INCQs, DX = first literal byte and DI = first byte after the literals.
// R9 += 3 converts the nibble to a match length; a nibble of 0 (R9 == 3) means "no match".
20936 | lz4s_snappy_ll_end: | ||
20937 | LEAQ (DX)(R8*1), DI | ||
20938 | ADDQ $0x03, R9 | ||
20939 | CMPQ DI, BX | ||
20940 | JAE lz4s_snappy_corrupt | ||
20941 | INCQ DX | ||
20942 | INCQ DI | ||
20943 | TESTQ R8, R8 | ||
20944 | JZ lz4s_snappy_lits_done | ||
// The literals must end below the write limit. Then emit the literal header with
// R10 = length-1: <60 -> one byte (R10<<2); <0x100 -> 0xf0 + 1 byte; <0x10000 -> 0xf4 + 2 bytes;
// <0x1000000 -> 0xf8 + 3 bytes; otherwise 0xfc + 4 bytes.
20945 | LEAQ (AX)(R8*1), R10 | ||
20946 | CMPQ R10, CX | ||
20947 | JAE lz4s_snappy_dstfull | ||
20948 | ADDQ R8, SI | ||
20949 | LEAL -1(R8), R10 | ||
20950 | CMPL R10, $0x3c | ||
20951 | JB one_byte_lz4s_snappy | ||
20952 | CMPL R10, $0x00000100 | ||
20953 | JB two_bytes_lz4s_snappy | ||
20954 | CMPL R10, $0x00010000 | ||
20955 | JB three_bytes_lz4s_snappy | ||
20956 | CMPL R10, $0x01000000 | ||
20957 | JB four_bytes_lz4s_snappy | ||
20958 | MOVB $0xfc, (AX) | ||
20959 | MOVL R10, 1(AX) | ||
20960 | ADDQ $0x05, AX | ||
20961 | JMP memmove_long_lz4s_snappy | ||
20962 | |||
20963 | four_bytes_lz4s_snappy: | ||
20964 | MOVL R10, R11 | ||
20965 | SHRL $0x10, R11 | ||
20966 | MOVB $0xf8, (AX) | ||
20967 | MOVW R10, 1(AX) | ||
20968 | MOVB R11, 3(AX) | ||
20969 | ADDQ $0x04, AX | ||
20970 | JMP memmove_long_lz4s_snappy | ||
20971 | |||
20972 | three_bytes_lz4s_snappy: | ||
20973 | MOVB $0xf4, (AX) | ||
20974 | MOVW R10, 1(AX) | ||
20975 | ADDQ $0x03, AX | ||
20976 | JMP memmove_long_lz4s_snappy | ||
20977 | |||
// Two-byte header. Lengths of at most 64 (R10 < 0x40) still use the short copy below.
20978 | two_bytes_lz4s_snappy: | ||
20979 | MOVB $0xf0, (AX) | ||
20980 | MOVB R10, 1(AX) | ||
20981 | ADDQ $0x02, AX | ||
20982 | CMPL R10, $0x40 | ||
20983 | JB memmove_lz4s_snappy | ||
20984 | JMP memmove_long_lz4s_snappy | ||
20985 | |||
20986 | one_byte_lz4s_snappy: | ||
20987 | SHLB $0x02, R10 | ||
20988 | MOVB R10, (AX) | ||
20989 | ADDQ $0x01, AX | ||
20990 | |||
// Short literal copy (R8 <= 64 bytes from DX to AX). R10 = dst pointer after the literals.
// Each size class uses overlapping fixed-width loads/stores instead of a byte loop.
20991 | memmove_lz4s_snappy: | ||
20992 | LEAQ (AX)(R8*1), R10 | ||
20993 | |||
20994 | // genMemMoveShort | ||
20995 | CMPQ R8, $0x08 | ||
20996 | JBE emit_lit_memmove_lz4s_snappy_memmove_move_8 | ||
20997 | CMPQ R8, $0x10 | ||
20998 | JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16 | ||
20999 | CMPQ R8, $0x20 | ||
21000 | JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32 | ||
21001 | JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64 | ||
21002 | |||
// Lengths 1..8: always loads and stores a full 8 bytes, whatever R8 is.
// NOTE(review): this can touch up to 7 bytes past the literals in both src and dst;
// presumably the 10-byte dst margin and the caller's buffers allow for it - confirm.
21003 | emit_lit_memmove_lz4s_snappy_memmove_move_8: | ||
21004 | MOVQ (DX), R11 | ||
21005 | MOVQ R11, (AX) | ||
21006 | JMP memmove_end_copy_lz4s_snappy | ||
21007 | |||
// DX is used as scratch here; it is reloaded from DI at lz4s_snappy_lits_emit_done.
21008 | emit_lit_memmove_lz4s_snappy_memmove_move_8through16: | ||
21009 | MOVQ (DX), R11 | ||
21010 | MOVQ -8(DX)(R8*1), DX | ||
21011 | MOVQ R11, (AX) | ||
21012 | MOVQ DX, -8(AX)(R8*1) | ||
21013 | JMP memmove_end_copy_lz4s_snappy | ||
21014 | |||
21015 | emit_lit_memmove_lz4s_snappy_memmove_move_17through32: | ||
21016 | MOVOU (DX), X0 | ||
21017 | MOVOU -16(DX)(R8*1), X1 | ||
21018 | MOVOU X0, (AX) | ||
21019 | MOVOU X1, -16(AX)(R8*1) | ||
21020 | JMP memmove_end_copy_lz4s_snappy | ||
21021 | |||
21022 | emit_lit_memmove_lz4s_snappy_memmove_move_33through64: | ||
21023 | MOVOU (DX), X0 | ||
21024 | MOVOU 16(DX), X1 | ||
21025 | MOVOU -32(DX)(R8*1), X2 | ||
21026 | MOVOU -16(DX)(R8*1), X3 | ||
21027 | MOVOU X0, (AX) | ||
21028 | MOVOU X1, 16(AX) | ||
21029 | MOVOU X2, -32(AX)(R8*1) | ||
21030 | MOVOU X3, -16(AX)(R8*1) | ||
21031 | |||
21032 | memmove_end_copy_lz4s_snappy: | ||
21033 | MOVQ R10, AX | ||
21034 | JMP lz4s_snappy_lits_emit_done | ||
21035 | |||
// Long literal copy. The first and last 32 bytes are loaded up front (X0-X3) and stored
// last. The middle is copied 32 bytes per step; R13 = 64 - (AX & 31), so -32(AX)(R13*1)
// is 32-byte aligned, which is what the MOVOA stores rely on.
21036 | memmove_long_lz4s_snappy: | ||
21037 | LEAQ (AX)(R8*1), R10 | ||
21038 | |||
21039 | // genMemMoveLong | ||
21040 | MOVOU (DX), X0 | ||
21041 | MOVOU 16(DX), X1 | ||
21042 | MOVOU -32(DX)(R8*1), X2 | ||
21043 | MOVOU -16(DX)(R8*1), X3 | ||
21044 | MOVQ R8, R12 | ||
21045 | SHRQ $0x05, R12 | ||
21046 | MOVQ AX, R11 | ||
21047 | ANDL $0x0000001f, R11 | ||
21048 | MOVQ $0x00000040, R13 | ||
21049 | SUBQ R11, R13 | ||
21050 | DECQ R12 | ||
21051 | JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 | ||
21052 | LEAQ -32(DX)(R13*1), R11 | ||
21053 | LEAQ -32(AX)(R13*1), R14 | ||
21054 | |||
21055 | emit_lit_memmove_long_lz4s_snappylarge_big_loop_back: | ||
21056 | MOVOU (R11), X4 | ||
21057 | MOVOU 16(R11), X5 | ||
21058 | MOVOA X4, (R14) | ||
21059 | MOVOA X5, 16(R14) | ||
21060 | ADDQ $0x20, R14 | ||
21061 | ADDQ $0x20, R11 | ||
21062 | ADDQ $0x20, R13 | ||
21063 | DECQ R12 | ||
21064 | JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back | ||
21065 | |||
21066 | emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32: | ||
21067 | MOVOU -32(DX)(R13*1), X4 | ||
21068 | MOVOU -16(DX)(R13*1), X5 | ||
21069 | MOVOA X4, -32(AX)(R13*1) | ||
21070 | MOVOA X5, -16(AX)(R13*1) | ||
21071 | ADDQ $0x20, R13 | ||
21072 | CMPQ R8, R13 | ||
21073 | JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 | ||
21074 | MOVOU X0, (AX) | ||
21075 | MOVOU X1, 16(AX) | ||
21076 | MOVOU X2, -32(AX)(R8*1) | ||
21077 | MOVOU X3, -16(AX)(R8*1) | ||
21078 | MOVQ R10, AX | ||
21079 | |||
// Literals written: advance the src pointer past them.
21080 | lz4s_snappy_lits_emit_done: | ||
21081 | MOVQ DI, DX | ||
21082 | |||
// If the literals ended exactly at the end of src, the block is complete only when the
// match nibble was 0 (R9 == 3); any other value is reported as corrupt.
21083 | lz4s_snappy_lits_done: | ||
21084 | CMPQ DX, BX | ||
21085 | JNE lz4s_snappy_match | ||
21086 | CMPQ R9, $0x03 | ||
21087 | JEQ lz4s_snappy_done | ||
21088 | JMP lz4s_snappy_corrupt | ||
21089 | |||
// R9 == 3: this sequence has no match, continue with the next token.
// Otherwise read the 2-byte offset into R8 (at least one more byte must follow it) and
// reject an offset of zero or one larger than the bytes produced so far (SI).
// R9 == 0x12 (nibble 15) means match-length extension bytes follow.
21090 | lz4s_snappy_match: | ||
21091 | CMPQ R9, $0x03 | ||
21092 | JEQ lz4s_snappy_loop | ||
21093 | LEAQ 2(DX), DI | ||
21094 | CMPQ DI, BX | ||
21095 | JAE lz4s_snappy_corrupt | ||
21096 | MOVWQZX (DX), R8 | ||
21097 | MOVQ DI, DX | ||
21098 | TESTQ R8, R8 | ||
21099 | JZ lz4s_snappy_corrupt | ||
21100 | CMPQ R8, SI | ||
21101 | JA lz4s_snappy_corrupt | ||
21102 | CMPQ R9, $0x12 | ||
21103 | JNE lz4s_snappy_ml_done | ||
21104 | |||
// Add each extension byte to the match length; continue while the byte read is 0xff.
// Reaching the end of src inside or right after the extension is reported as corrupt.
21105 | lz4s_snappy_ml_loop: | ||
21106 | MOVBQZX (DX), DI | ||
21107 | INCQ DX | ||
21108 | ADDQ DI, R9 | ||
21109 | CMPQ DX, BX | ||
21110 | JAE lz4s_snappy_corrupt | ||
21111 | CMPQ DI, $0xff | ||
21112 | JEQ lz4s_snappy_ml_loop | ||
21113 | |||
21114 | lz4s_snappy_ml_done: | ||
21115 | ADDQ R9, SI | ||
21116 | |||
21117 | // emitCopy | ||
// While the remaining length exceeds 64, write tag 0xee plus the 2-byte offset (a 3-byte
// copy that accounts for 60 bytes). If dst reaches its limit mid-match, jump to the loop
// head, whose bounds checks then return an error code (the rest of the match is dropped).
21118 | two_byte_offset_lz4_s2: | ||
21119 | CMPL R9, $0x40 | ||
21120 | JBE two_byte_offset_short_lz4_s2 | ||
21121 | MOVB $0xee, (AX) | ||
21122 | MOVW R8, 1(AX) | ||
21123 | LEAL -60(R9), R9 | ||
21124 | ADDQ $0x03, AX | ||
21125 | CMPQ AX, CX | ||
21126 | JAE lz4s_snappy_loop | ||
21127 | JMP two_byte_offset_lz4_s2 | ||
21128 | |||
// Copy of length <= 64. DI = length<<2. Length < 12 with offset < 2048 uses the 2-byte
// form: tag = (length<<2) - 15 | (offset>>8)<<5, followed by the low offset byte.
// Everything else uses the 3-byte form below.
21129 | two_byte_offset_short_lz4_s2: | ||
21130 | MOVL R9, DI | ||
21131 | SHLL $0x02, DI | ||
21132 | CMPL R9, $0x0c | ||
21133 | JAE emit_copy_three_lz4_s2 | ||
21134 | CMPL R8, $0x00000800 | ||
21135 | JAE emit_copy_three_lz4_s2 | ||
21136 | LEAL -15(DI), DI | ||
21137 | MOVB R8, 1(AX) | ||
21138 | SHRL $0x08, R8 | ||
21139 | SHLL $0x05, R8 | ||
21140 | ORL R8, DI | ||
21141 | MOVB DI, (AX) | ||
21142 | ADDQ $0x02, AX | ||
21143 | JMP lz4s_snappy_loop | ||
21144 | |||
// 3-byte form: tag = (length<<2) - 2, followed by the 16-bit offset.
21145 | emit_copy_three_lz4_s2: | ||
21146 | LEAL -2(DI), DI | ||
21147 | MOVB DI, (AX) | ||
21148 | MOVW R8, 1(AX) | ||
21149 | ADDQ $0x03, AX | ||
21150 | JMP lz4s_snappy_loop | ||
21151 | |||
// Success: dstUsed = AX - dst_base, uncompressed = SI.
21152 | lz4s_snappy_done: | ||
21153 | MOVQ dst_base+0(FP), CX | ||
21154 | SUBQ CX, AX | ||
21155 | MOVQ SI, uncompressed+48(FP) | ||
21156 | MOVQ AX, dstUsed+56(FP) | ||
21157 | RET | ||
21158 | |||
// Malformed input: uncompressed = -1; the dstUsed result slot is left unwritten.
21159 | lz4s_snappy_corrupt: | ||
21160 | XORQ AX, AX | ||
21161 | LEAQ -1(AX), SI | ||
21162 | MOVQ SI, uncompressed+48(FP) | ||
21163 | RET | ||
21164 | |||
// Output limit reached: uncompressed = -2; the dstUsed result slot is left unwritten.
21165 | lz4s_snappy_dstfull: | ||
21166 | XORQ AX, AX | ||
21167 | LEAQ -2(AX), SI | ||
21168 | MOVQ SI, uncompressed+48(FP) | ||
21169 | RET | ||