From 8db41da676ac8368ef7c2549d56239a5ff5eedde Mon Sep 17 00:00:00 2001 From: Rutger Broekhoff Date: Tue, 2 Jan 2024 18:56:31 +0100 Subject: Delete vendor directory --- .../klauspost/compress/s2/encodeblock_amd64.s | 21169 ------------------- 1 file changed, 21169 deletions(-) delete mode 100644 vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s (limited to 'vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s') diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s deleted file mode 100644 index 5f110d1..0000000 --- a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s +++ /dev/null @@ -1,21169 +0,0 @@ -// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT. - -//go:build !appengine && !noasm && gc && !noasm - -#include "textflag.h" - -// func _dummy_() -TEXT ·_dummy_(SB), $0 -#ifdef GOAMD64_v4 -#ifndef GOAMD64_v3 -#define GOAMD64_v3 -#endif -#endif - RET - -// func encodeBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBlockAsm - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ 
$0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x10, R10 - IMULQ R8, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BX - SUBL 16(SP), BX - JZ repeat_extend_back_end_encodeBlockAsm - -repeat_extend_back_loop_encodeBlockAsm: - CMPL SI, DI - JBE repeat_extend_back_end_encodeBlockAsm - MOVB -1(DX)(BX*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeBlockAsm - LEAL -1(SI), SI - DECL BX - JNZ repeat_extend_back_loop_encodeBlockAsm - -repeat_extend_back_end_encodeBlockAsm: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm - CMPL BX, $0x00010000 - JB three_bytes_repeat_emit_encodeBlockAsm - CMPL BX, $0x01000000 - JB four_bytes_repeat_emit_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -four_bytes_repeat_emit_encodeBlockAsm: - MOVL BX, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -three_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm - -two_bytes_repeat_emit_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeBlockAsm - JMP memmove_long_repeat_emit_encodeBlockAsm - -one_byte_repeat_emit_encodeBlockAsm: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ 
$0x01, AX - -memmove_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm - -emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm - -memmove_long_repeat_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ 
R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeBlockAsm: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm - XORQ 8(BX)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm - -matchlen_bsf_16repeat_extend_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_match8_repeat_extend_encodeBlockAsm: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm - -matchlen_bsf_8_repeat_extend_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm - -matchlen_match4_repeat_extend_encodeBlockAsm: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm - MOVL (R9)(R11*1), R10 - CMPL (BX)(R11*1), R10 - JNE 
matchlen_match2_repeat_extend_encodeBlockAsm - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm - JB repeat_extend_forward_end_encodeBlockAsm - MOVW (R9)(R11*1), R10 - CMPW (BX)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm - -matchlen_match1_repeat_extend_encodeBlockAsm: - MOVB (R9)(R11*1), R10 - CMPB (BX)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm: - ADDL R11, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_repeat_encodeBlockAsm: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm - CMPL SI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm - -cant_repeat_two_offset_match_repeat_encodeBlockAsm: - CMPL BX, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm - CMPL BX, $0x00010100 - JB repeat_four_match_repeat_encodeBlockAsm - CMPL BX, $0x0100ffff - JB repeat_five_match_repeat_encodeBlockAsm - LEAL -16842747(BX), BX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_repeat_encodeBlockAsm - -repeat_five_match_repeat_encodeBlockAsm: - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_match_repeat_encodeBlockAsm: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_match_repeat_encodeBlockAsm: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - 
-repeat_two_match_repeat_encodeBlockAsm: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_match_repeat_encodeBlockAsm: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_as_copy_encodeBlockAsm: - // emitCopy - CMPL SI, $0x00010000 - JB two_byte_offset_repeat_as_copy_encodeBlockAsm - CMPL BX, $0x40 - JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(BX), BX - ADDQ $0x05, AX - CMPL BX, $0x04 - JB four_bytes_remain_repeat_as_copy_encodeBlockAsm - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BX, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BX, $0x0100ffff - JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy - LEAL -16842747(BX), BX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - 
MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -four_bytes_remain_repeat_as_copy_encodeBlockAsm: - TESTL BX, BX - JZ repeat_end_emit_encodeBlockAsm - XORL DI, DI - LEAL -1(DI)(BX*4), BX - MOVB BL, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -two_byte_offset_repeat_as_copy_encodeBlockAsm: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm - CMPL SI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(AX) - MOVL SI, R8 - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, DI - MOVB DI, (AX) - ADDQ $0x02, AX - SUBL $0x08, BX - - // emitRepeat - LEAL -4(BX), BX - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL BX, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - CMPL BX, $0x0100ffff - JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - LEAL -16842747(BX), BX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP 
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -long_offset_short_repeat_as_copy_encodeBlockAsm: - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BX, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BX, $0x0100ffff - JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short - LEAL -16842747(BX), BX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP 
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short - -repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm - -emit_copy_three_repeat_as_copy_encodeBlockAsm: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm - -no_repeat_found_encodeBlockAsm: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBlockAsm - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm - 
-candidate3_match_encodeBlockAsm: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm - -candidate2_match_encodeBlockAsm: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeBlockAsm: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBlockAsm - -match_extend_back_loop_encodeBlockAsm: - CMPL CX, SI - JBE match_extend_back_end_encodeBlockAsm - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBlockAsm - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBlockAsm - JMP match_extend_back_loop_encodeBlockAsm - -match_extend_back_end_encodeBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeBlockAsm - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm - CMPL DI, $0x00010000 - JB three_bytes_match_emit_encodeBlockAsm - CMPL DI, $0x01000000 - JB four_bytes_match_emit_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBlockAsm - -four_bytes_match_emit_encodeBlockAsm: - MOVL DI, R9 - SHRL $0x10, R9 - MOVB $0xf8, (AX) - MOVW DI, 1(AX) - MOVB R9, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBlockAsm - -three_bytes_match_emit_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm - -two_bytes_match_emit_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeBlockAsm - JMP memmove_long_match_emit_encodeBlockAsm - -one_byte_match_emit_encodeBlockAsm: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - 
-memmove_match_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm - -emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBlockAsm: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm - -memmove_long_match_emit_encodeBlockAsm: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA 
emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeBlockAsm: -match_nolit_loop_encodeBlockAsm: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeBlockAsm: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm - -matchlen_bsf_16match_nolit_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeBlockAsm - -matchlen_match8_match_nolit_encodeBlockAsm: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeBlockAsm - -matchlen_bsf_8_match_nolit_encodeBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm - -matchlen_match4_match_nolit_encodeBlockAsm: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE 
matchlen_match2_match_nolit_encodeBlockAsm - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeBlockAsm: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm - JB match_nolit_end_encodeBlockAsm - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeBlockAsm - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeBlockAsm - -matchlen_match1_match_nolit_encodeBlockAsm: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm - LEAL 1(R9), R9 - -match_nolit_end_encodeBlockAsm: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy - CMPL BX, $0x00010000 - JB two_byte_offset_match_nolit_encodeBlockAsm - CMPL R9, $0x40 - JBE four_bytes_remain_match_nolit_encodeBlockAsm - MOVB $0xff, (AX) - MOVL BX, 1(AX) - LEAL -64(R9), R9 - ADDQ $0x05, AX - CMPL R9, $0x04 - JB four_bytes_remain_match_nolit_encodeBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm_emit_copy - CMPL R9, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm_emit_copy - CMPL R9, $0x0100ffff - JB repeat_five_match_nolit_encodeBlockAsm_emit_copy - LEAL -16842747(R9), R9 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBlockAsm_emit_copy: - LEAL -65536(R9), R9 - MOVL R9, BX - MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BX - MOVB BL, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy: - 
LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -four_bytes_remain_match_nolit_encodeBlockAsm: - TESTL R9, R9 - JZ match_nolit_emitcopy_end_encodeBlockAsm - XORL SI, SI - LEAL -1(SI)(R9*4), R9 - MOVB R9, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -two_byte_offset_match_nolit_encodeBlockAsm: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm - CMPL BX, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB BL, 1(AX) - MOVL BX, DI - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, AX - SUBL $0x08, R9 - - // emitRepeat - LEAL -4(R9), R9 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b - -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b: - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL R9, $0x00010100 - JB 
repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b - CMPL R9, $0x0100ffff - JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b - LEAL -16842747(R9), R9 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b - -repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b: - LEAL -65536(R9), R9 - MOVL R9, BX - MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BX - MOVB BL, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b: - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -long_offset_short_match_nolit_encodeBlockAsm: - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R9, $0x00010100 - JB 
repeat_four_match_nolit_encodeBlockAsm_emit_copy_short - CMPL R9, $0x0100ffff - JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short - LEAL -16842747(R9), R9 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -65536(R9), R9 - MOVL R9, BX - MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BX - MOVB BL, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -two_byte_offset_short_match_nolit_encodeBlockAsm: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm - -emit_copy_three_match_nolit_encodeBlockAsm: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBlockAsm - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm - MOVQ $0x00000000, 
ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BX - IMULQ R8, BX - SHRQ $0x32, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeBlockAsm - INCL CX - JMP search_loop_encodeBlockAsm - -emit_remainder_encodeBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBlockAsm - CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -four_bytes_emit_remainder_encodeBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -three_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm - -two_bytes_emit_remainder_encodeBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm - JMP memmove_long_emit_remainder_encodeBlockAsm - -one_byte_emit_remainder_encodeBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB 
emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm - -emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm - -memmove_long_emit_remainder_encodeBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), 
X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm4MB(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm4MB(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm4MB: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm4MB: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBlockAsm4MB - MOVQ (DX)(CX*1), SI 
- MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x10, R10 - IMULQ R8, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm4MB - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BX - SUBL 16(SP), BX - JZ repeat_extend_back_end_encodeBlockAsm4MB - -repeat_extend_back_loop_encodeBlockAsm4MB: - CMPL SI, DI - JBE repeat_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(BX*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeBlockAsm4MB - LEAL -1(SI), SI - DECL BX - JNZ repeat_extend_back_loop_encodeBlockAsm4MB - -repeat_extend_back_end_encodeBlockAsm4MB: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm4MB - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm4MB - CMPL BX, $0x00010000 - JB three_bytes_repeat_emit_encodeBlockAsm4MB - MOVL BX, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -three_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -two_bytes_repeat_emit_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeBlockAsm4MB - JMP memmove_long_repeat_emit_encodeBlockAsm4MB - -one_byte_repeat_emit_encodeBlockAsm4MB: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm4MB: - 
LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB - -emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm4MB: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB - -memmove_long_repeat_emit_encodeBlockAsm4MB: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - 
JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeBlockAsm4MB: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm4MB - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB - XORQ 8(BX)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB - -matchlen_bsf_16repeat_extend_encodeBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_match8_repeat_extend_encodeBlockAsm4MB: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm4MB - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB - -matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_match4_repeat_extend_encodeBlockAsm4MB: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm4MB - 
MOVL (R9)(R11*1), R10 - CMPL (BX)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm4MB: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm4MB - JB repeat_extend_forward_end_encodeBlockAsm4MB - MOVW (R9)(R11*1), R10 - CMPW (BX)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm4MB - -matchlen_match1_repeat_extend_encodeBlockAsm4MB: - MOVB (R9)(R11*1), R10 - CMPB (BX)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm4MB - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm4MB: - ADDL R11, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm4MB - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB - CMPL SI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm4MB - -cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: - CMPL BX, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm4MB - CMPL BX, $0x00010100 - JB repeat_four_match_repeat_encodeBlockAsm4MB - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_match_repeat_encodeBlockAsm4MB: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_match_repeat_encodeBlockAsm4MB: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_match_repeat_encodeBlockAsm4MB: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_match_repeat_encodeBlockAsm4MB: - XORQ DI, 
DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_as_copy_encodeBlockAsm4MB: - // emitCopy - CMPL SI, $0x00010000 - JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB - CMPL BX, $0x40 - JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(BX), BX - ADDQ $0x05, AX - CMPL BX, $0x04 - JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB - - // emitRepeat - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy - CMPL BX, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - 
-four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: - TESTL BX, BX - JZ repeat_end_emit_encodeBlockAsm4MB - XORL DI, DI - LEAL -1(DI)(BX*4), BX - MOVB BL, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB - CMPL SI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - SUBL $0x08, BX - - // emitRepeat - LEAL -4(BX), BX - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - CMPL BX, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB 
- -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -long_offset_short_repeat_as_copy_encodeBlockAsm4MB: - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - - // emitRepeat - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - CMPL BX, $0x00010100 - JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(BX), BX - MOVL BX, SI - MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, SI - MOVB SI, 4(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c 
- JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm4MB - -emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm4MB: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm4MB - -no_repeat_found_encodeBlockAsm4MB: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBlockAsm4MB - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm4MB - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm4MB - -candidate3_match_encodeBlockAsm4MB: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm4MB - -candidate2_match_encodeBlockAsm4MB: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeBlockAsm4MB: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBlockAsm4MB - -match_extend_back_loop_encodeBlockAsm4MB: - CMPL CX, SI - JBE match_extend_back_end_encodeBlockAsm4MB - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBlockAsm4MB - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBlockAsm4MB - JMP match_extend_back_loop_encodeBlockAsm4MB - -match_extend_back_end_encodeBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm4MB: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeBlockAsm4MB - 
CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm4MB - CMPL DI, $0x00010000 - JB three_bytes_match_emit_encodeBlockAsm4MB - MOVL DI, R9 - SHRL $0x10, R9 - MOVB $0xf8, (AX) - MOVW DI, 1(AX) - MOVB R9, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -three_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm4MB - -two_bytes_match_emit_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeBlockAsm4MB - JMP memmove_long_match_emit_encodeBlockAsm4MB - -one_byte_match_emit_encodeBlockAsm4MB: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - 
-memmove_end_copy_match_emit_encodeBlockAsm4MB: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm4MB - -memmove_long_match_emit_encodeBlockAsm4MB: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeBlockAsm4MB: -match_nolit_loop_encodeBlockAsm4MB: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeBlockAsm4MB: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm4MB - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB - -matchlen_bsf_16match_nolit_encodeBlockAsm4MB: -#ifdef 
GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeBlockAsm4MB - -matchlen_match8_match_nolit_encodeBlockAsm4MB: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm4MB - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeBlockAsm4MB - -matchlen_bsf_8_match_nolit_encodeBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm4MB - -matchlen_match4_match_nolit_encodeBlockAsm4MB: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm4MB - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeBlockAsm4MB - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeBlockAsm4MB: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm4MB - JB match_nolit_end_encodeBlockAsm4MB - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeBlockAsm4MB - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeBlockAsm4MB - -matchlen_match1_match_nolit_encodeBlockAsm4MB: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm4MB - LEAL 1(R9), R9 - -match_nolit_end_encodeBlockAsm4MB: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy - CMPL BX, $0x00010000 - JB two_byte_offset_match_nolit_encodeBlockAsm4MB - CMPL R9, $0x40 - JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB - MOVB $0xff, (AX) - MOVL BX, 1(AX) - LEAL -64(R9), R9 - ADDQ $0x05, AX - CMPL R9, $0x04 - JB four_bytes_remain_match_nolit_encodeBlockAsm4MB - - // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL BX, 
$0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy - CMPL R9, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy - LEAL -65536(R9), R9 - MOVL R9, BX - MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BX - MOVB BL, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBlockAsm4MB: - TESTL R9, R9 - JZ match_nolit_emitcopy_end_encodeBlockAsm4MB - XORL SI, SI - LEAL -1(SI)(R9*4), R9 - MOVB R9, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -two_byte_offset_match_nolit_encodeBlockAsm4MB: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB - CMPL BX, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm4MB - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - SUBL $0x08, R9 - - // emitRepeat - LEAL -4(R9), R9 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - MOVL R9, SI - LEAL 
-4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - CMPL R9, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b - LEAL -65536(R9), R9 - MOVL R9, BX - MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BX - MOVB BL, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -long_offset_short_match_nolit_encodeBlockAsm4MB: - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - - // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short - 
-cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short - CMPL R9, $0x00010100 - JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short - LEAL -65536(R9), R9 - MOVL R9, BX - MOVW $0x001d, (AX) - MOVW R9, 2(AX) - SARL $0x10, BX - MOVB BL, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBlockAsm4MB: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm4MB - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm4MB - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - -emit_copy_three_match_nolit_encodeBlockAsm4MB: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm4MB: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBlockAsm4MB - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm4MB: - MOVQ 
$0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BX - IMULQ R8, BX - SHRQ $0x32, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeBlockAsm4MB - INCL CX - JMP search_loop_encodeBlockAsm4MB - -emit_remainder_encodeBlockAsm4MB: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm4MB: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm4MB - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -three_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -two_bytes_emit_remainder_encodeBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm4MB - JMP memmove_long_emit_remainder_encodeBlockAsm4MB - -one_byte_emit_remainder_encodeBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 - 
CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm4MB: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB - -memmove_long_emit_remainder_encodeBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, 
R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm4MB: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm12B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm12B(SB), $16408-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm12B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBlockAsm12B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ 
$0x18, R10 - IMULQ R8, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm12B - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BX - SUBL 16(SP), BX - JZ repeat_extend_back_end_encodeBlockAsm12B - -repeat_extend_back_loop_encodeBlockAsm12B: - CMPL SI, DI - JBE repeat_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(BX*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeBlockAsm12B - LEAL -1(SI), SI - DECL BX - JNZ repeat_extend_back_loop_encodeBlockAsm12B - -repeat_extend_back_end_encodeBlockAsm12B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm12B - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm12B - JB three_bytes_repeat_emit_encodeBlockAsm12B - -three_bytes_repeat_emit_encodeBlockAsm12B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -two_bytes_repeat_emit_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeBlockAsm12B - JMP memmove_long_repeat_emit_encodeBlockAsm12B - -one_byte_repeat_emit_encodeBlockAsm12B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP 
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm12B: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm12B - -memmove_long_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE 
emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeBlockAsm12B: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm12B: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm12B - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B - XORQ 8(BX)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B - -matchlen_bsf_16repeat_extend_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_match8_repeat_extend_encodeBlockAsm12B: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm12B - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm12B - -matchlen_bsf_8_repeat_extend_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_match4_repeat_extend_encodeBlockAsm12B: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm12B - MOVL (R9)(R11*1), R10 - CMPL (BX)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm12B - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm12B: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm12B - JB 
repeat_extend_forward_end_encodeBlockAsm12B - MOVW (R9)(R11*1), R10 - CMPW (BX)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm12B - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm12B - -matchlen_match1_repeat_extend_encodeBlockAsm12B: - MOVB (R9)(R11*1), R10 - CMPB (BX)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm12B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm12B: - ADDL R11, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm12B - - // emitRepeat - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm12B - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL SI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm12B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: - CMPL BX, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm12B - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_match_repeat_encodeBlockAsm12B: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_match_repeat_encodeBlockAsm12B: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_match_repeat_encodeBlockAsm12B: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_as_copy_encodeBlockAsm12B: - // emitCopy - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B - CMPL SI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - SUBL $0x08, BX - - // emitRepeat - 
LEAL -4(BX), BX - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -long_offset_short_repeat_as_copy_encodeBlockAsm12B: - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - - // emitRepeat - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP 
repeat_end_emit_encodeBlockAsm12B - -repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeBlockAsm12B: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm12B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm12B - -no_repeat_found_encodeBlockAsm12B: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBlockAsm12B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm12B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm12B - -candidate3_match_encodeBlockAsm12B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm12B - -candidate2_match_encodeBlockAsm12B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeBlockAsm12B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBlockAsm12B - -match_extend_back_loop_encodeBlockAsm12B: - CMPL CX, SI - JBE 
match_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBlockAsm12B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBlockAsm12B - JMP match_extend_back_loop_encodeBlockAsm12B - -match_extend_back_end_encodeBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm12B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeBlockAsm12B - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm12B - JB three_bytes_match_emit_encodeBlockAsm12B - -three_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm12B - -two_bytes_match_emit_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeBlockAsm12B - JMP memmove_long_match_emit_encodeBlockAsm12B - -one_byte_match_emit_encodeBlockAsm12B: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), 
SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm12B - -emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBlockAsm12B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm12B - -memmove_long_match_emit_encodeBlockAsm12B: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeBlockAsm12B: -match_nolit_loop_encodeBlockAsm12B: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - 
SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeBlockAsm12B: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm12B - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B - -matchlen_bsf_16match_nolit_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeBlockAsm12B - -matchlen_match8_match_nolit_encodeBlockAsm12B: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm12B - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm12B - -matchlen_match4_match_nolit_encodeBlockAsm12B: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm12B - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeBlockAsm12B - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeBlockAsm12B: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm12B - JB match_nolit_end_encodeBlockAsm12B - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeBlockAsm12B - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeBlockAsm12B - -matchlen_match1_match_nolit_encodeBlockAsm12B: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm12B - LEAL 1(R9), R9 - -match_nolit_end_encodeBlockAsm12B: - ADDL R9, CX - MOVL 16(SP), BX - ADDL 
$0x04, R9 - MOVL CX, 12(SP) - - // emitCopy - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B - CMPL BX, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm12B - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - SUBL $0x08, R9 - - // emitRepeat - LEAL -4(R9), R9 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -long_offset_short_match_nolit_encodeBlockAsm12B: - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - - // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL SI, $0x0c - JAE 
cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBlockAsm12B: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm12B - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm12B - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm12B - -emit_copy_three_match_nolit_encodeBlockAsm12B: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm12B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBlockAsm12B - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x18, DI - IMULQ R8, DI - SHRQ $0x34, DI - SHLQ $0x18, BX - IMULQ R8, BX - SHRQ 
$0x34, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeBlockAsm12B - INCL CX - JMP search_loop_encodeBlockAsm12B - -emit_remainder_encodeBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm12B - JB three_bytes_emit_remainder_encodeBlockAsm12B - -three_bytes_emit_remainder_encodeBlockAsm12B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm12B - -two_bytes_emit_remainder_encodeBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm12B - JMP memmove_long_emit_remainder_encodeBlockAsm12B - -one_byte_emit_remainder_encodeBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 - 
-emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm12B - -memmove_long_emit_remainder_encodeBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, 
(R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm10B(SB), $4120-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000020, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm10B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBlockAsm10B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 
1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm10B - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BX - SUBL 16(SP), BX - JZ repeat_extend_back_end_encodeBlockAsm10B - -repeat_extend_back_loop_encodeBlockAsm10B: - CMPL SI, DI - JBE repeat_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(BX*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeBlockAsm10B - LEAL -1(SI), SI - DECL BX - JNZ repeat_extend_back_loop_encodeBlockAsm10B - -repeat_extend_back_end_encodeBlockAsm10B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm10B - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm10B - JB three_bytes_repeat_emit_encodeBlockAsm10B - -three_bytes_repeat_emit_encodeBlockAsm10B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -two_bytes_repeat_emit_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeBlockAsm10B - JMP memmove_long_repeat_emit_encodeBlockAsm10B - -one_byte_repeat_emit_encodeBlockAsm10B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - 
-emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm10B: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm10B - -memmove_long_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeBlockAsm10B: - ADDL $0x05, 
CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm10B: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm10B - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B - XORQ 8(BX)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B - -matchlen_bsf_16repeat_extend_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_match8_repeat_extend_encodeBlockAsm10B: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm10B - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm10B - -matchlen_bsf_8_repeat_extend_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_match4_repeat_extend_encodeBlockAsm10B: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm10B - MOVL (R9)(R11*1), R10 - CMPL (BX)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm10B - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm10B: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm10B - JB repeat_extend_forward_end_encodeBlockAsm10B - MOVW (R9)(R11*1), R10 - CMPW (BX)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm10B - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm10B - -matchlen_match1_repeat_extend_encodeBlockAsm10B: - 
MOVB (R9)(R11*1), R10 - CMPB (BX)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm10B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm10B: - ADDL R11, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm10B - - // emitRepeat - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_match_repeat_encodeBlockAsm10B - CMPL DI, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL SI, $0x00000800 - JB repeat_two_offset_match_repeat_encodeBlockAsm10B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: - CMPL BX, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm10B - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_match_repeat_encodeBlockAsm10B: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_match_repeat_encodeBlockAsm10B: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_match_repeat_encodeBlockAsm10B: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_as_copy_encodeBlockAsm10B: - // emitCopy - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B - CMPL SI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - SUBL $0x08, BX - - // emitRepeat - LEAL -4(BX), BX - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - CMPL DI, $0x0c - JAE 
cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -long_offset_short_repeat_as_copy_encodeBlockAsm10B: - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - - // emitRepeat - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL SI, $0x00000800 - JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm10B - 
-repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeBlockAsm10B: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm10B: - MOVL CX, 12(SP) - JMP search_loop_encodeBlockAsm10B - -no_repeat_found_encodeBlockAsm10B: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBlockAsm10B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm10B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm10B - -candidate3_match_encodeBlockAsm10B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm10B - -candidate2_match_encodeBlockAsm10B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeBlockAsm10B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBlockAsm10B - -match_extend_back_loop_encodeBlockAsm10B: - CMPL CX, SI - JBE match_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBlockAsm10B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBlockAsm10B - JMP 
match_extend_back_loop_encodeBlockAsm10B - -match_extend_back_end_encodeBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm10B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeBlockAsm10B - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm10B - JB three_bytes_match_emit_encodeBlockAsm10B - -three_bytes_match_emit_encodeBlockAsm10B: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm10B - -two_bytes_match_emit_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeBlockAsm10B - JMP memmove_long_match_emit_encodeBlockAsm10B - -one_byte_match_emit_encodeBlockAsm10B: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, 
(AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm10B - -emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBlockAsm10B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm10B - -memmove_long_match_emit_encodeBlockAsm10B: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeBlockAsm10B: -match_nolit_loop_encodeBlockAsm10B: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeBlockAsm10B: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm10B - MOVQ (DI)(R9*1), 
R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B - -matchlen_bsf_16match_nolit_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeBlockAsm10B - -matchlen_match8_match_nolit_encodeBlockAsm10B: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm10B - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm10B - -matchlen_match4_match_nolit_encodeBlockAsm10B: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeBlockAsm10B - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeBlockAsm10B - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeBlockAsm10B: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm10B - JB match_nolit_end_encodeBlockAsm10B - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeBlockAsm10B - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeBlockAsm10B - -matchlen_match1_match_nolit_encodeBlockAsm10B: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm10B - LEAL 1(R9), R9 - -match_nolit_end_encodeBlockAsm10B: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B - CMPL BX, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm10B - MOVL $0x00000001, SI - LEAL 
16(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - SUBL $0x08, R9 - - // emitRepeat - LEAL -4(R9), R9 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -long_offset_short_match_nolit_encodeBlockAsm10B: - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - - // emitRepeat - MOVL R9, SI - LEAL -4(R9), R9 - CMPL SI, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL BX, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - CMPL R9, $0x00000104 - JB 
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBlockAsm10B: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm10B - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBlockAsm10B - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm10B - -emit_copy_three_match_nolit_encodeBlockAsm10B: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm10B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBlockAsm10B - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x36, DI - SHLQ $0x20, BX - IMULQ R8, BX - SHRQ $0x36, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeBlockAsm10B - INCL CX - JMP search_loop_encodeBlockAsm10B - -emit_remainder_encodeBlockAsm10B: - MOVQ 
src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm10B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm10B - JB three_bytes_emit_remainder_encodeBlockAsm10B - -three_bytes_emit_remainder_encodeBlockAsm10B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm10B - -two_bytes_emit_remainder_encodeBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm10B - JMP memmove_long_emit_remainder_encodeBlockAsm10B - -one_byte_emit_remainder_encodeBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW 
SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm10B - -memmove_long_emit_remainder_encodeBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, 
-32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBlockAsm8B(SB), $1048-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBlockAsm8B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBlockAsm8B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x38, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeBlockAsm8B - LEAL 1(CX), SI - MOVL 12(SP), DI - MOVL SI, BX - SUBL 16(SP), BX - JZ repeat_extend_back_end_encodeBlockAsm8B - -repeat_extend_back_loop_encodeBlockAsm8B: - CMPL SI, DI - JBE repeat_extend_back_end_encodeBlockAsm8B - MOVB 
-1(DX)(BX*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeBlockAsm8B - LEAL -1(SI), SI - DECL BX - JNZ repeat_extend_back_loop_encodeBlockAsm8B - -repeat_extend_back_end_encodeBlockAsm8B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeBlockAsm8B - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeBlockAsm8B - JB three_bytes_repeat_emit_encodeBlockAsm8B - -three_bytes_repeat_emit_encodeBlockAsm8B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeBlockAsm8B - -two_bytes_repeat_emit_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeBlockAsm8B - JMP memmove_long_repeat_emit_encodeBlockAsm8B - -one_byte_repeat_emit_encodeBlockAsm8B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_repeat_emit_encodeBlockAsm8B: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeBlockAsm8B - -memmove_long_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R11 - SHRQ $0x05, R11 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R12 - SUBQ R10, R12 - DECQ R11 - JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R12*1), R10 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R10 - ADDQ $0x20, R12 - DECQ R11 - JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R12*1), X4 - MOVOU -16(R9)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ R8, R12 - JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeBlockAsm8B: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R8 - SUBL CX, R8 - LEAQ (DX)(CX*1), R9 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_repeat_extend_encodeBlockAsm8B: - CMPL R8, $0x10 - JB matchlen_match8_repeat_extend_encodeBlockAsm8B - MOVQ (R9)(R11*1), R10 - MOVQ 8(R9)(R11*1), R12 - XORQ (BX)(R11*1), R10 - JNZ 
matchlen_bsf_8_repeat_extend_encodeBlockAsm8B - XORQ 8(BX)(R11*1), R12 - JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B - LEAL -16(R8), R8 - LEAL 16(R11), R11 - JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B - -matchlen_bsf_16repeat_extend_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_match8_repeat_extend_encodeBlockAsm8B: - CMPL R8, $0x08 - JB matchlen_match4_repeat_extend_encodeBlockAsm8B - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B - LEAL -8(R8), R8 - LEAL 8(R11), R11 - JMP matchlen_match4_repeat_extend_encodeBlockAsm8B - -matchlen_bsf_8_repeat_extend_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_match4_repeat_extend_encodeBlockAsm8B: - CMPL R8, $0x04 - JB matchlen_match2_repeat_extend_encodeBlockAsm8B - MOVL (R9)(R11*1), R10 - CMPL (BX)(R11*1), R10 - JNE matchlen_match2_repeat_extend_encodeBlockAsm8B - LEAL -4(R8), R8 - LEAL 4(R11), R11 - -matchlen_match2_repeat_extend_encodeBlockAsm8B: - CMPL R8, $0x01 - JE matchlen_match1_repeat_extend_encodeBlockAsm8B - JB repeat_extend_forward_end_encodeBlockAsm8B - MOVW (R9)(R11*1), R10 - CMPW (BX)(R11*1), R10 - JNE matchlen_match1_repeat_extend_encodeBlockAsm8B - LEAL 2(R11), R11 - SUBL $0x02, R8 - JZ repeat_extend_forward_end_encodeBlockAsm8B - -matchlen_match1_repeat_extend_encodeBlockAsm8B: - MOVB (R9)(R11*1), R10 - CMPB (BX)(R11*1), R10 - JNE repeat_extend_forward_end_encodeBlockAsm8B - LEAL 1(R11), R11 - -repeat_extend_forward_end_encodeBlockAsm8B: - ADDL R11, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - TESTL DI, DI - JZ repeat_as_copy_encodeBlockAsm8B - - // emitRepeat - MOVL BX, SI - LEAL -4(BX), BX - CMPL SI, $0x08 - JBE 
repeat_two_match_repeat_encodeBlockAsm8B - CMPL SI, $0x0c - JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B - -cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: - CMPL BX, $0x00000104 - JB repeat_three_match_repeat_encodeBlockAsm8B - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_match_repeat_encodeBlockAsm8B: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_match_repeat_encodeBlockAsm8B: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_as_copy_encodeBlockAsm8B: - // emitCopy - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B - CMPL SI, $0x00000800 - JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B - MOVL $0x00000001, DI - LEAL 16(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - SUBL $0x08, BX - - // emitRepeat - LEAL -4(BX), BX - JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - MOVL BX, SI - LEAL -4(BX), BX - CMPL SI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B - 
-repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -long_offset_short_repeat_as_copy_encodeBlockAsm8B: - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - - // emitRepeat - MOVL BX, SI - LEAL -4(BX), BX - CMPL SI, $0x08 - JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - CMPL BX, $0x00000104 - JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - LEAL -256(BX), BX - MOVW $0x0019, (AX) - MOVW BX, 2(AX) - ADDQ $0x04, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -4(BX), BX - MOVW $0x0015, (AX) - MOVB BL, 2(AX) - ADDQ $0x03, AX - JMP repeat_end_emit_encodeBlockAsm8B - -repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - XORQ DI, DI - LEAL 1(DI)(BX*4), BX - MOVB SI, 1(AX) - SARL $0x08, SI - SHLL $0x05, SI - ORL SI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeBlockAsm8B: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeBlockAsm8B: - MOVL CX, 12(SP) - JMP 
search_loop_encodeBlockAsm8B - -no_repeat_found_encodeBlockAsm8B: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBlockAsm8B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeBlockAsm8B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBlockAsm8B - -candidate3_match_encodeBlockAsm8B: - ADDL $0x02, CX - JMP candidate_match_encodeBlockAsm8B - -candidate2_match_encodeBlockAsm8B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeBlockAsm8B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBlockAsm8B - -match_extend_back_loop_encodeBlockAsm8B: - CMPL CX, SI - JBE match_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBlockAsm8B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBlockAsm8B - JMP match_extend_back_loop_encodeBlockAsm8B - -match_extend_back_end_encodeBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBlockAsm8B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeBlockAsm8B - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeBlockAsm8B - JB three_bytes_match_emit_encodeBlockAsm8B - -three_bytes_match_emit_encodeBlockAsm8B: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBlockAsm8B - -two_bytes_match_emit_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeBlockAsm8B - JMP memmove_long_match_emit_encodeBlockAsm8B - -one_byte_match_emit_encodeBlockAsm8B: 
- SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBlockAsm8B - -emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBlockAsm8B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeBlockAsm8B - -memmove_long_match_emit_encodeBlockAsm8B: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, 
R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeBlockAsm8B: -match_nolit_loop_encodeBlockAsm8B: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeBlockAsm8B: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeBlockAsm8B - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B - -matchlen_bsf_16match_nolit_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeBlockAsm8B - -matchlen_match8_match_nolit_encodeBlockAsm8B: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeBlockAsm8B - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeBlockAsm8B - -matchlen_match4_match_nolit_encodeBlockAsm8B: - CMPL SI, $0x04 - JB 
matchlen_match2_match_nolit_encodeBlockAsm8B - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeBlockAsm8B - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeBlockAsm8B: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeBlockAsm8B - JB match_nolit_end_encodeBlockAsm8B - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeBlockAsm8B - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeBlockAsm8B - -matchlen_match1_match_nolit_encodeBlockAsm8B: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeBlockAsm8B - LEAL 1(R9), R9 - -match_nolit_end_encodeBlockAsm8B: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B - CMPL BX, $0x00000800 - JAE long_offset_short_match_nolit_encodeBlockAsm8B - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - SUBL $0x08, R9 - - // emitRepeat - LEAL -4(R9), R9 - JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - MOVL R9, BX - LEAL -4(R9), R9 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP 
match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -long_offset_short_match_nolit_encodeBlockAsm8B: - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - - // emitRepeat - MOVL R9, BX - LEAL -4(R9), R9 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - CMPL R9, $0x00000104 - JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short - LEAL -256(R9), R9 - MOVW $0x0019, (AX) - MOVW R9, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -4(R9), R9 - MOVW $0x0015, (AX) - MOVB R9, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, R9 - ORL $0x01, R9 - MOVW R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - XORQ SI, SI - LEAL 1(SI)(R9*4), R9 - MOVB BL, 1(AX) - SARL $0x08, BX - SHLL $0x05, BX - ORL BX, R9 - MOVB R9, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBlockAsm8B: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeBlockAsm8B - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBlockAsm8B - -emit_copy_three_match_nolit_encodeBlockAsm8B: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeBlockAsm8B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBlockAsm8B - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB 
match_nolit_dst_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBlockAsm8B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x38, DI - SHLQ $0x20, BX - IMULQ R8, BX - SHRQ $0x38, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeBlockAsm8B - INCL CX - JMP search_loop_encodeBlockAsm8B - -emit_remainder_encodeBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBlockAsm8B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBlockAsm8B - JB three_bytes_emit_remainder_encodeBlockAsm8B - -three_bytes_emit_remainder_encodeBlockAsm8B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -two_bytes_emit_remainder_encodeBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBlockAsm8B - JMP memmove_long_emit_remainder_encodeBlockAsm8B - -one_byte_emit_remainder_encodeBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE 
emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBlockAsm8B - -memmove_long_emit_remainder_encodeBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA 
emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm(SB), $589848-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00001200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x07, BX - CMPL BX, $0x63 - JBE check_maxskip_ok_encodeBetterBlockAsm - LEAL 100(CX), BX - JMP check_maxskip_cont_encodeBetterBlockAsm - -check_maxskip_ok_encodeBetterBlockAsm: - LEAL 1(CX)(BX*1), BX - -check_maxskip_cont_encodeBetterBlockAsm: - CMPL BX, 8(SP) - JAE 
emit_remainder_encodeBetterBlockAsm - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x2f, R9 - SHLQ $0x20, R10 - IMULQ BX, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BX - MOVL 524312(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 524312(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeBetterBlockAsm - CMPQ R10, SI - JNE no_short_found_encodeBetterBlockAsm - MOVL DI, BX - JMP candidate_match_encodeBetterBlockAsm - -no_short_found_encodeBetterBlockAsm: - CMPL R9, SI - JEQ candidate_match_encodeBetterBlockAsm - CMPL R10, SI - JEQ candidateS_match_encodeBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm - -candidateS_match_encodeBetterBlockAsm: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x2f, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBetterBlockAsm - DECL CX - MOVL DI, BX - -candidate_match_encodeBetterBlockAsm: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBetterBlockAsm - -match_extend_back_loop_encodeBetterBlockAsm: - CMPL CX, SI - JBE match_extend_back_end_encodeBetterBlockAsm - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBetterBlockAsm - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBetterBlockAsm - JMP match_extend_back_loop_encodeBetterBlockAsm - -match_extend_back_end_encodeBetterBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - 
-matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm - -matchlen_match8_match_nolit_encodeBetterBlockAsm: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm - -matchlen_match4_match_nolit_encodeBetterBlockAsm: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeBetterBlockAsm: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm - JB match_nolit_end_encodeBetterBlockAsm - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeBetterBlockAsm - -matchlen_match1_match_nolit_encodeBetterBlockAsm: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeBetterBlockAsm - LEAL 1(R11), R11 - -match_nolit_end_encodeBetterBlockAsm: - MOVL CX, DI 
- SUBL BX, DI - - // Check if repeat - CMPL 16(SP), DI - JEQ match_is_repeat_encodeBetterBlockAsm - CMPL R11, $0x01 - JA match_length_ok_encodeBetterBlockAsm - CMPL DI, $0x0000ffff - JBE match_length_ok_encodeBetterBlockAsm - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm - -match_length_ok_encodeBetterBlockAsm: - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm - CMPL BX, $0x00010000 - JB three_bytes_match_emit_encodeBetterBlockAsm - CMPL BX, $0x01000000 - JB four_bytes_match_emit_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -four_bytes_match_emit_encodeBetterBlockAsm: - MOVL BX, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -three_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm - -two_bytes_match_emit_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm - JMP memmove_long_match_emit_encodeBetterBlockAsm - -one_byte_match_emit_encodeBetterBlockAsm: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE 
emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm - -memmove_long_match_emit_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy - CMPL DI, $0x00010000 - JB two_byte_offset_match_nolit_encodeBetterBlockAsm - CMPL R11, $0x40 - JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(R11), R11 - ADDQ $0x05, AX - CMPL R11, $0x04 - JB four_bytes_remain_match_nolit_encodeBetterBlockAsm - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy - CMPL R11, $0x0100ffff - JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy - LEAL -16842747(R11), R11 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - 
-repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -four_bytes_remain_match_nolit_encodeBetterBlockAsm: - TESTL R11, R11 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm - XORL BX, BX - LEAL -1(BX)(R11*4), R11 - MOVB R11, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -two_byte_offset_match_nolit_encodeBetterBlockAsm: - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm - CMPL DI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm - MOVL $0x00000001, BX - LEAL 16(BX), BX - MOVB DI, 1(AX) - MOVL DI, R8 - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, BX - MOVB BL, (AX) - ADDQ $0x02, AX - SUBL $0x08, R11 - - // emitRepeat - LEAL -4(R11), R11 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - 
-cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - CMPL R11, $0x0100ffff - JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - LEAL -16842747(R11), R11 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -long_offset_short_match_nolit_encodeBetterBlockAsm: - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL BX, $0x0c - JAE 
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short - CMPL R11, $0x0100ffff - JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short - LEAL -16842747(R11), R11 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short - -repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm - CMPL DI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm 
- LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -emit_copy_three_match_nolit_encodeBetterBlockAsm: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -match_is_repeat_encodeBetterBlockAsm: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm - CMPL BX, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL BX, $0x00010000 - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm - CMPL BX, $0x01000000 - JB four_bytes_match_emit_repeat_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -four_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVL BX, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -three_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -two_bytes_match_emit_repeat_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm - -one_byte_match_emit_repeat_encodeBetterBlockAsm: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ R8, 
$0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: - MOVQ BX, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm - -memmove_long_match_emit_repeat_encodeBetterBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - 
-emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitRepeat -emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm - CMPL R11, $0x0100ffff - JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm - LEAL -16842747(R11), R11 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm - -repeat_five_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -256(R11), R11 - MOVW $0x0019, 
(AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x08, R9 - IMULQ BX, R9 - SHRQ $0x2f, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x32, R10 - SHLQ $0x08, R11 - IMULQ BX, R11 - SHRQ $0x2f, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x32, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 524312(SP)(R10*4) - MOVL R13, 524312(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeBetterBlockAsm: - CMPQ DI, R8 - JAE search_loop_encodeBetterBlockAsm - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x08, R9 - IMULQ BX, R9 - SHRQ $0x2f, R9 - SHLQ $0x08, R10 - IMULQ BX, R10 - SHRQ $0x2f, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeBetterBlockAsm - -emit_remainder_encodeBetterBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JB 
emit_remainder_ok_encodeBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBetterBlockAsm - CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeBetterBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -four_bytes_emit_remainder_encodeBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -three_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -two_bytes_emit_remainder_encodeBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm - JMP memmove_long_emit_remainder_encodeBetterBlockAsm - -one_byte_emit_remainder_encodeBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 - JMP 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm - -memmove_long_emit_remainder_encodeBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - 
-emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00001200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm4MB: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm4MB - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm4MB: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x07, BX - CMPL BX, $0x63 - JBE check_maxskip_ok_encodeBetterBlockAsm4MB - LEAL 100(CX), BX - JMP check_maxskip_cont_encodeBetterBlockAsm4MB - -check_maxskip_ok_encodeBetterBlockAsm4MB: - LEAL 1(CX)(BX*1), BX - -check_maxskip_cont_encodeBetterBlockAsm4MB: - CMPL BX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm4MB - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - 
MOVQ $0x00cf1bbcdcbfa563, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x2f, R9 - SHLQ $0x20, R10 - IMULQ BX, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BX - MOVL 524312(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 524312(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeBetterBlockAsm4MB - CMPQ R10, SI - JNE no_short_found_encodeBetterBlockAsm4MB - MOVL DI, BX - JMP candidate_match_encodeBetterBlockAsm4MB - -no_short_found_encodeBetterBlockAsm4MB: - CMPL R9, SI - JEQ candidate_match_encodeBetterBlockAsm4MB - CMPL R10, SI - JEQ candidateS_match_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm4MB - -candidateS_match_encodeBetterBlockAsm4MB: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x2f, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBetterBlockAsm4MB - DECL CX - MOVL DI, BX - -candidate_match_encodeBetterBlockAsm4MB: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBetterBlockAsm4MB - -match_extend_back_loop_encodeBetterBlockAsm4MB: - CMPL CX, SI - JBE match_extend_back_end_encodeBetterBlockAsm4MB - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBetterBlockAsm4MB - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBetterBlockAsm4MB - JMP match_extend_back_loop_encodeBetterBlockAsm4MB - -match_extend_back_end_encodeBetterBlockAsm4MB: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 4(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm4MB: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB: - CMPL DI, 
$0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_match8_match_nolit_encodeBetterBlockAsm4MB: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB - JB match_nolit_end_encodeBetterBlockAsm4MB - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeBetterBlockAsm4MB - -matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeBetterBlockAsm4MB - LEAL 1(R11), R11 - -match_nolit_end_encodeBetterBlockAsm4MB: - MOVL CX, 
DI - SUBL BX, DI - - // Check if repeat - CMPL 16(SP), DI - JEQ match_is_repeat_encodeBetterBlockAsm4MB - CMPL R11, $0x01 - JA match_length_ok_encodeBetterBlockAsm4MB - CMPL DI, $0x0000ffff - JBE match_length_ok_encodeBetterBlockAsm4MB - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeBetterBlockAsm4MB - -match_length_ok_encodeBetterBlockAsm4MB: - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm4MB - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm4MB - CMPL BX, $0x00010000 - JB three_bytes_match_emit_encodeBetterBlockAsm4MB - MOVL BX, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -three_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -two_bytes_match_emit_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_encodeBetterBlockAsm4MB - -one_byte_match_emit_encodeBetterBlockAsm4MB: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 - 
-emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB - -memmove_long_match_emit_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back - 
-emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm4MB: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy - CMPL DI, $0x00010000 - JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB - CMPL R11, $0x40 - JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(R11), R11 - ADDQ $0x05, AX - CMPL R11, $0x04 - JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - 
-repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: - TESTL R11, R11 - JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - XORL BX, BX - LEAL -1(BX)(R11*4), R11 - MOVB R11, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB - CMPL DI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB - MOVL $0x00000001, BX - LEAL 16(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - SUBL $0x08, R11 - - // emitRepeat - LEAL -4(R11), R11 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -long_offset_short_match_nolit_encodeBetterBlockAsm4MB: - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - CMPL DI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -match_is_repeat_encodeBetterBlockAsm4MB: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL BX, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - CMPL BX, $0x00010000 - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB - MOVL BX, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - 
MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB - -one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB - 
-emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: - MOVQ BX, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB - -memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL DI, $0x00000800 - JB 
repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB - CMPL R11, $0x00010100 - JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB - LEAL -65536(R11), R11 - MOVL R11, DI - MOVW $0x001d, (AX) - MOVW R11, 2(AX) - SARL $0x10, DI - MOVB DI, 4(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm4MB - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm4MB: - MOVQ $0x00cf1bbcdcbfa563, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x08, R9 - IMULQ BX, R9 - SHRQ $0x2f, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x32, R10 - SHLQ $0x08, R11 - IMULQ BX, R11 - SHRQ $0x2f, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x32, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 
524312(SP)(R10*4) - MOVL R13, 524312(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeBetterBlockAsm4MB: - CMPQ DI, R8 - JAE search_loop_encodeBetterBlockAsm4MB - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x08, R9 - IMULQ BX, R9 - SHRQ $0x2f, R9 - SHLQ $0x08, R10 - IMULQ BX, R10 - SHRQ $0x2f, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeBetterBlockAsm4MB - -emit_remainder_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 4(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm4MB - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm4MB: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -three_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -two_bytes_emit_remainder_encodeBetterBlockAsm4MB: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm4MB - JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB - -one_byte_emit_remainder_encodeBetterBlockAsm4MB: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ DX, AX - JMP 
emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB - -memmove_long_emit_remainder_encodeBetterBlockAsm4MB: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm12B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000280, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - 
MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm12B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 1(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm12B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ BX, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R9*4), BX - MOVL 65560(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 65560(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeBetterBlockAsm12B - CMPQ R10, SI - JNE no_short_found_encodeBetterBlockAsm12B - MOVL DI, BX - JMP candidate_match_encodeBetterBlockAsm12B - -no_short_found_encodeBetterBlockAsm12B: - CMPL R9, SI - JEQ candidate_match_encodeBetterBlockAsm12B - CMPL R10, SI - JEQ candidateS_match_encodeBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm12B - -candidateS_match_encodeBetterBlockAsm12B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBetterBlockAsm12B - DECL CX - MOVL DI, BX - -candidate_match_encodeBetterBlockAsm12B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBetterBlockAsm12B - -match_extend_back_loop_encodeBetterBlockAsm12B: - CMPL CX, SI - JBE match_extend_back_end_encodeBetterBlockAsm12B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBetterBlockAsm12B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBetterBlockAsm12B - JMP match_extend_back_loop_encodeBetterBlockAsm12B - -match_extend_back_end_encodeBetterBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm12B - MOVQ $0x00000000, 
ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm12B: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm12B - -matchlen_match8_match_nolit_encodeBetterBlockAsm12B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm12B - -matchlen_match4_match_nolit_encodeBetterBlockAsm12B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeBetterBlockAsm12B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B - JB match_nolit_end_encodeBetterBlockAsm12B - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B - 
LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeBetterBlockAsm12B - -matchlen_match1_match_nolit_encodeBetterBlockAsm12B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeBetterBlockAsm12B - LEAL 1(R11), R11 - -match_nolit_end_encodeBetterBlockAsm12B: - MOVL CX, DI - SUBL BX, DI - - // Check if repeat - CMPL 16(SP), DI - JEQ match_is_repeat_encodeBetterBlockAsm12B - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm12B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm12B - JB three_bytes_match_emit_encodeBetterBlockAsm12B - -three_bytes_match_emit_encodeBetterBlockAsm12B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -two_bytes_match_emit_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_encodeBetterBlockAsm12B - -one_byte_match_emit_encodeBetterBlockAsm12B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - 
-emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm12B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B - -memmove_long_match_emit_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ 
$0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm12B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B - CMPL DI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B - MOVL $0x00000001, BX - LEAL 16(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - SUBL $0x08, R11 - - // emitRepeat - LEAL -4(R11), R11 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 
- MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -long_offset_short_match_nolit_encodeBetterBlockAsm12B: - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - CMPL DI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeBetterBlockAsm12B: - 
LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -match_is_repeat_encodeBetterBlockAsm12B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B - -three_bytes_match_emit_repeat_encodeBetterBlockAsm12B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm12B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B - -one_byte_match_emit_repeat_encodeBetterBlockAsm12B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, 
(AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm12B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm12B - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ 
DI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R11 - IMULQ BX, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x34, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 65560(SP)(R10*4) - MOVL R13, 65560(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeBetterBlockAsm12B: - CMPQ DI, R8 - JAE search_loop_encodeBetterBlockAsm12B - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x32, R9 - SHLQ $0x10, R10 - IMULQ BX, R10 - SHRQ $0x32, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeBetterBlockAsm12B - -emit_remainder_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm12B - JB three_bytes_emit_remainder_encodeBetterBlockAsm12B - -three_bytes_emit_remainder_encodeBetterBlockAsm12B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -two_bytes_emit_remainder_encodeBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B - -one_byte_emit_remainder_encodeBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, 
$0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B: - MOVQ DX, AX - 
JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B - -memmove_long_emit_remainder_encodeBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm10B(SB), $20504-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x000000a0, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX 
- MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm10B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 1(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm10B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ BX, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R9*4), BX - MOVL 16408(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 16408(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPQ R10, SI - JNE no_short_found_encodeBetterBlockAsm10B - MOVL DI, BX - JMP candidate_match_encodeBetterBlockAsm10B - -no_short_found_encodeBetterBlockAsm10B: - CMPL R9, SI - JEQ candidate_match_encodeBetterBlockAsm10B - CMPL R10, SI - JEQ candidateS_match_encodeBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm10B - -candidateS_match_encodeBetterBlockAsm10B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBetterBlockAsm10B - DECL CX - MOVL DI, BX - -candidate_match_encodeBetterBlockAsm10B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBetterBlockAsm10B - -match_extend_back_loop_encodeBetterBlockAsm10B: - CMPL CX, SI - JBE match_extend_back_end_encodeBetterBlockAsm10B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBetterBlockAsm10B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBetterBlockAsm10B - JMP match_extend_back_loop_encodeBetterBlockAsm10B - -match_extend_back_end_encodeBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm10B - MOVQ 
$0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm10B: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm10B - -matchlen_match8_match_nolit_encodeBetterBlockAsm10B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm10B - -matchlen_match4_match_nolit_encodeBetterBlockAsm10B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeBetterBlockAsm10B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B - JB match_nolit_end_encodeBetterBlockAsm10B - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE 
matchlen_match1_match_nolit_encodeBetterBlockAsm10B - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeBetterBlockAsm10B - -matchlen_match1_match_nolit_encodeBetterBlockAsm10B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeBetterBlockAsm10B - LEAL 1(R11), R11 - -match_nolit_end_encodeBetterBlockAsm10B: - MOVL CX, DI - SUBL BX, DI - - // Check if repeat - CMPL 16(SP), DI - JEQ match_is_repeat_encodeBetterBlockAsm10B - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm10B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm10B - JB three_bytes_match_emit_encodeBetterBlockAsm10B - -three_bytes_match_emit_encodeBetterBlockAsm10B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -two_bytes_match_emit_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_encodeBetterBlockAsm10B - -one_byte_match_emit_encodeBetterBlockAsm10B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm10B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B - -memmove_long_match_emit_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - 
MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm10B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B - CMPL DI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B - MOVL $0x00000001, BX - LEAL 16(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - SUBL $0x08, R11 - - // emitRepeat - LEAL -4(R11), R11 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB 
DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -long_offset_short_match_nolit_encodeBetterBlockAsm10B: - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - CMPL DI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B 
- -emit_copy_three_match_nolit_encodeBetterBlockAsm10B: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -match_is_repeat_encodeBetterBlockAsm10B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B - -three_bytes_match_emit_repeat_encodeBetterBlockAsm10B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm10B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B - -one_byte_match_emit_repeat_encodeBetterBlockAsm10B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4: - MOVL (R9), R10 - MOVL R10, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - 
-emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU 
-32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - CMPL DI, $0x00000800 - JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - -repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B: - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm10B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm10B - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 
1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x36, R10 - SHLQ $0x10, R11 - IMULQ BX, R11 - SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x36, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 16408(SP)(R10*4) - MOVL R13, 16408(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeBetterBlockAsm10B: - CMPQ DI, R8 - JAE search_loop_encodeBetterBlockAsm10B - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x34, R9 - SHLQ $0x10, R10 - IMULQ BX, R10 - SHRQ $0x34, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeBetterBlockAsm10B - -emit_remainder_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm10B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm10B - JB three_bytes_emit_remainder_encodeBetterBlockAsm10B - -three_bytes_emit_remainder_encodeBetterBlockAsm10B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -two_bytes_emit_remainder_encodeBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B - -one_byte_emit_remainder_encodeBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) 
- ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU 
X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B - -memmove_long_emit_remainder_encodeBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeBetterBlockAsm8B(SB), $5144-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000028, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeBetterBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeBetterBlockAsm8B - MOVL $0x00000000, 
12(SP) - MOVQ src_len+32(FP), CX - LEAQ -6(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeBetterBlockAsm8B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 1(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm8B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ BX, R10 - SHRQ $0x38, R10 - MOVL 24(SP)(R9*4), BX - MOVL 4120(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 4120(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeBetterBlockAsm8B - CMPQ R10, SI - JNE no_short_found_encodeBetterBlockAsm8B - MOVL DI, BX - JMP candidate_match_encodeBetterBlockAsm8B - -no_short_found_encodeBetterBlockAsm8B: - CMPL R9, SI - JEQ candidate_match_encodeBetterBlockAsm8B - CMPL R10, SI - JEQ candidateS_match_encodeBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeBetterBlockAsm8B - -candidateS_match_encodeBetterBlockAsm8B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeBetterBlockAsm8B - DECL CX - MOVL DI, BX - -candidate_match_encodeBetterBlockAsm8B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeBetterBlockAsm8B - -match_extend_back_loop_encodeBetterBlockAsm8B: - CMPL CX, SI - JBE match_extend_back_end_encodeBetterBlockAsm8B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeBetterBlockAsm8B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeBetterBlockAsm8B - JMP match_extend_back_loop_encodeBetterBlockAsm8B - -match_extend_back_end_encodeBetterBlockAsm8B: - MOVL CX, SI - SUBL 
12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeBetterBlockAsm8B: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B - -matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm8B - -matchlen_match8_match_nolit_encodeBetterBlockAsm8B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeBetterBlockAsm8B - -matchlen_match4_match_nolit_encodeBetterBlockAsm8B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeBetterBlockAsm8B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B - JB match_nolit_end_encodeBetterBlockAsm8B - MOVW 
(R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeBetterBlockAsm8B - -matchlen_match1_match_nolit_encodeBetterBlockAsm8B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeBetterBlockAsm8B - LEAL 1(R11), R11 - -match_nolit_end_encodeBetterBlockAsm8B: - MOVL CX, DI - SUBL BX, DI - - // Check if repeat - CMPL 16(SP), DI - JEQ match_is_repeat_encodeBetterBlockAsm8B - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeBetterBlockAsm8B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeBetterBlockAsm8B - JB three_bytes_match_emit_encodeBetterBlockAsm8B - -three_bytes_match_emit_encodeBetterBlockAsm8B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -two_bytes_match_emit_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_encodeBetterBlockAsm8B - -one_byte_match_emit_encodeBetterBlockAsm8B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x04 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ R8, $0x08 - JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R9), R10 - 
MOVL R10, (AX) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R9), R10 - MOVL -4(R9)(R8*1), R9 - MOVL R10, (AX) - MOVL R9, -4(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeBetterBlockAsm8B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B - -memmove_long_match_emit_encodeBetterBlockAsm8B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 
- MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeBetterBlockAsm8B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B - CMPL DI, $0x00000800 - JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B - MOVL $0x00000001, BX - LEAL 16(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - SUBL $0x08, R11 - - // emitRepeat - LEAL -4(R11), R11 - JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - 
-long_offset_short_match_nolit_encodeBetterBlockAsm8B: - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - -cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeBetterBlockAsm8B: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -match_is_repeat_encodeBetterBlockAsm8B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, 
$0x3c - JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B - JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B - -three_bytes_match_emit_repeat_encodeBetterBlockAsm8B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_repeat_encodeBetterBlockAsm8B - JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B - -one_byte_match_emit_repeat_encodeBetterBlockAsm8B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(DI*1), BX - - // genMemMoveShort - CMPQ DI, $0x04 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 - CMPQ DI, $0x08 - JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ DI, $0x10 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: - MOVL (R8), R9 - MOVL R9, (AX) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (R8), R9 - MOVL -4(R8)(DI*1), R8 - MOVL R9, (AX) - MOVL R8, -4(AX)(DI*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - 
-emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B - -emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - -memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B - -memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: - LEAQ (AX)(DI*1), BX - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R12 - SUBQ R9, R12 - DECQ R10 - JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R12*1), R9 - LEAQ -32(AX)(R12*1), R13 - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R13) - MOVOA X5, 16(R13) - ADDQ $0x20, R13 - ADDQ $0x20, R9 - ADDQ $0x20, R12 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R12*1), X4 - MOVOU -16(R8)(R12*1), X5 - MOVOA X4, -32(AX)(R12*1) - MOVOA X5, -16(AX)(R12*1) - ADDQ $0x20, R12 - CMPQ DI, R12 - JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BX, AX - -emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitRepeat - MOVL R11, BX - LEAL -4(R11), 
R11 - CMPL BX, $0x08 - JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B - CMPL BX, $0x0c - JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B - -cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: - CMPL R11, $0x00000104 - JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B - LEAL -256(R11), R11 - MOVW $0x0019, (AX) - MOVW R11, 2(AX) - ADDQ $0x04, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: - LEAL -4(R11), R11 - MOVW $0x0015, (AX) - MOVB R11, 2(AX) - ADDQ $0x03, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - -repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: - SHLL $0x02, R11 - ORL $0x01, R11 - MOVW R11, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - XORQ BX, BX - LEAL 1(BX)(R11*4), R11 - MOVB DI, 1(AX) - SARL $0x08, DI - SHLL $0x05, DI - ORL DI, R11 - MOVB R11, (AX) - ADDQ $0x02, AX - -match_nolit_emitcopy_end_encodeBetterBlockAsm8B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeBetterBlockAsm8B - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x38, R10 - SHLQ $0x10, R11 - IMULQ BX, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x38, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 4120(SP)(R10*4) - MOVL R13, 4120(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeBetterBlockAsm8B: - CMPQ DI, R8 - JAE search_loop_encodeBetterBlockAsm8B - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ 
$0x36, R9 - SHLQ $0x10, R10 - IMULQ BX, R10 - SHRQ $0x36, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeBetterBlockAsm8B - -emit_remainder_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeBetterBlockAsm8B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeBetterBlockAsm8B - JB three_bytes_emit_remainder_encodeBetterBlockAsm8B - -three_bytes_emit_remainder_encodeBetterBlockAsm8B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -two_bytes_emit_remainder_encodeBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B - -one_byte_emit_remainder_encodeBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 - JMP 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B - -memmove_long_emit_remainder_encodeBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), 
SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x10, R10 - IMULQ R8, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BX - MOVL 
24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm - LEAL 1(CX), SI - MOVL 12(SP), BX - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm - -repeat_extend_back_loop_encodeSnappyBlockAsm: - CMPL SI, BX - JBE repeat_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(DI*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm - LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm - -repeat_extend_back_end_encodeSnappyBlockAsm: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL BX, $0x00010000 - JB three_bytes_repeat_emit_encodeSnappyBlockAsm - CMPL BX, $0x01000000 - JB four_bytes_repeat_emit_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -four_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVL BX, R9 - SHRL $0x10, R9 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R9, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -three_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -two_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm - -one_byte_repeat_emit_encodeSnappyBlockAsm: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, 
AX - -memmove_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(DI*1), BX - - // genMemMoveShort - CMPQ DI, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (R8), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm - -memmove_long_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(DI*1), BX - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, 
(R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm: - CMPL DI, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm - XORQ 8(BX)(R10*1), R11 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm: - CMPL DI, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP 
repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm: - CMPL DI, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm - MOVL (R8)(R10*1), R9 - CMPL (BX)(R10*1), R9 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm: - CMPL DI, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm - JB repeat_extend_forward_end_encodeSnappyBlockAsm - MOVW (R8)(R10*1), R9 - CMPW (BX)(R10*1), R9 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm: - MOVB (R8)(R10*1), R9 - CMPB (BX)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm - LEAL 1(R10), R10 - -repeat_extend_forward_end_encodeSnappyBlockAsm: - ADDL R10, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - - // emitCopy - CMPL SI, $0x00010000 - JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BX, $0x40 - JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xff, (AX) - MOVL SI, 1(AX) - LEAL -64(BX), BX - ADDQ $0x05, AX - CMPL BX, $0x04 - JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm - -four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: - TESTL BX, BX - JZ repeat_end_emit_encodeSnappyBlockAsm - XORL DI, DI - LEAL -1(DI)(BX*4), BX - MOVB BL, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, AX - JMP repeat_end_emit_encodeSnappyBlockAsm - -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: - MOVL 
BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm - -no_repeat_found_encodeSnappyBlockAsm: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBlockAsm - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm - -candidate3_match_encodeSnappyBlockAsm: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm - -candidate2_match_encodeSnappyBlockAsm: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeSnappyBlockAsm: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBlockAsm - -match_extend_back_loop_encodeSnappyBlockAsm: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBlockAsm - JMP match_extend_back_loop_encodeSnappyBlockAsm - -match_extend_back_end_encodeSnappyBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) 
- LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x00010000 - JB three_bytes_match_emit_encodeSnappyBlockAsm - CMPL DI, $0x01000000 - JB four_bytes_match_emit_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -four_bytes_match_emit_encodeSnappyBlockAsm: - MOVL DI, R9 - SHRL $0x10, R9 - MOVB $0xf8, (AX) - MOVW DI, 1(AX) - MOVB R9, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -three_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -two_bytes_match_emit_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm - JMP memmove_long_match_emit_encodeSnappyBlockAsm - -one_byte_match_emit_encodeSnappyBlockAsm: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (SI), 
X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm - -memmove_long_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm: -match_nolit_loop_encodeSnappyBlockAsm: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm: - CMPL 
SI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm - -matchlen_match8_match_nolit_encodeSnappyBlockAsm: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm - -matchlen_match4_match_nolit_encodeSnappyBlockAsm: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm - JB match_nolit_end_encodeSnappyBlockAsm - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeSnappyBlockAsm - -matchlen_match1_match_nolit_encodeSnappyBlockAsm: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm - LEAL 1(R9), R9 - -match_nolit_end_encodeSnappyBlockAsm: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy - CMPL BX, $0x00010000 - JB 
two_byte_offset_match_nolit_encodeSnappyBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x40 - JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm - MOVB $0xff, (AX) - MOVL BX, 1(AX) - LEAL -64(R9), R9 - ADDQ $0x05, AX - CMPL R9, $0x04 - JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBlockAsm: - TESTL R9, R9 - JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm - XORL SI, SI - LEAL -1(SI)(R9*4), R9 - MOVB R9, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBlockAsm: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBlockAsm: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BX - IMULQ R8, BX - SHRQ $0x32, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), 
SI - JEQ match_nolit_loop_encodeSnappyBlockAsm - INCL CX - JMP search_loop_encodeSnappyBlockAsm - -emit_remainder_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeSnappyBlockAsm - CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeSnappyBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -four_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -three_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -two_bytes_emit_remainder_encodeSnappyBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm - -one_byte_emit_remainder_encodeSnappyBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm - -memmove_long_emit_remainder_encodeSnappyBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 
- MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm64K: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm64K - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm64K: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE 
emit_remainder_encodeSnappyBlockAsm64K - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - SHLQ $0x10, R10 - IMULQ R8, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm64K - LEAL 1(CX), SI - MOVL 12(SP), BX - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm64K - -repeat_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL SI, BX - JBE repeat_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(DX)(DI*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K - -repeat_extend_back_end_encodeSnappyBlockAsm64K: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm64K - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K - JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K - -three_bytes_repeat_emit_encodeSnappyBlockAsm64K: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -two_bytes_repeat_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm64K - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K - -one_byte_repeat_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ 
(AX)(DI*1), BX - - // genMemMoveShort - CMPQ DI, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (R8), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K - -memmove_long_repeat_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(DI*1), BX - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - 
MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K: - CMPL DI, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K - XORQ 8(BX)(R10*1), R11 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K: - CMPL DI, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - 
JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: - CMPL DI, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K - MOVL (R8)(R10*1), R9 - CMPL (BX)(R10*1), R9 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K: - CMPL DI, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K - JB repeat_extend_forward_end_encodeSnappyBlockAsm64K - MOVW (R8)(R10*1), R9 - CMPW (BX)(R10*1), R9 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K: - MOVB (R8)(R10*1), R9 - CMPB (BX)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K - LEAL 1(R10), R10 - -repeat_extend_forward_end_encodeSnappyBlockAsm64K: - ADDL R10, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm64K - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm64K: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm64K - -no_repeat_found_encodeSnappyBlockAsm64K: - 
CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBlockAsm64K - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm64K - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm64K - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm64K - -candidate3_match_encodeSnappyBlockAsm64K: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm64K - -candidate2_match_encodeSnappyBlockAsm64K: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeSnappyBlockAsm64K: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBlockAsm64K - -match_extend_back_loop_encodeSnappyBlockAsm64K: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBlockAsm64K - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm64K - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBlockAsm64K - JMP match_extend_back_loop_encodeSnappyBlockAsm64K - -match_extend_back_end_encodeSnappyBlockAsm64K: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm64K: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm64K - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm64K - JB three_bytes_match_emit_encodeSnappyBlockAsm64K - -three_bytes_match_emit_encodeSnappyBlockAsm64K: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -two_bytes_match_emit_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB 
memmove_match_emit_encodeSnappyBlockAsm64K - JMP memmove_long_match_emit_encodeSnappyBlockAsm64K - -one_byte_match_emit_encodeSnappyBlockAsm64K: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm64K: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K - -memmove_long_match_emit_encodeSnappyBlockAsm64K: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm64K: -match_nolit_loop_encodeSnappyBlockAsm64K: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_match8_match_nolit_encodeSnappyBlockAsm64K: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ 
matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm64K: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K - JB match_nolit_end_encodeSnappyBlockAsm64K - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeSnappyBlockAsm64K - -matchlen_match1_match_nolit_encodeSnappyBlockAsm64K: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm64K - LEAL 1(R9), R9 - -match_nolit_end_encodeSnappyBlockAsm64K: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm64K: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K - 
-emit_copy_three_match_nolit_encodeSnappyBlockAsm64K: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm64K: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm64K - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm64K: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x32, DI - SHLQ $0x10, BX - IMULQ R8, BX - SHRQ $0x32, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm64K - INCL CX - JMP search_loop_encodeSnappyBlockAsm64K - -emit_remainder_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm64K - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm64K: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm64K - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K - JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K - -three_bytes_emit_remainder_encodeSnappyBlockAsm64K: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -two_bytes_emit_remainder_encodeSnappyBlockAsm64K: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm64K - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K - -one_byte_emit_remainder_encodeSnappyBlockAsm64K: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - 
-memmove_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU 
X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K - -memmove_long_emit_remainder_encodeSnappyBlockAsm64K: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000080, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm12B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm12B - MOVL $0x00000000, 12(SP) - MOVQ 
src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm12B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm12B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ $0x18, R10 - IMULQ R8, R10 - SHRQ $0x34, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x18, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm12B - LEAL 1(CX), SI - MOVL 12(SP), BX - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm12B - -repeat_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL SI, BX - JBE repeat_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(DI*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B - -repeat_extend_back_end_encodeSnappyBlockAsm12B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm12B - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B - JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B - -three_bytes_repeat_emit_encodeSnappyBlockAsm12B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - 
-two_bytes_repeat_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm12B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B - -one_byte_repeat_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(DI*1), BX - - // genMemMoveShort - CMPQ DI, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (R8), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - -memmove_long_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(DI*1), BX - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ 
$0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B: - CMPL DI, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B - XORQ 8(BX)(R10*1), R11 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B: - CMPL DI, $0x08 - JB 
matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: - CMPL DI, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B - MOVL (R8)(R10*1), R9 - CMPL (BX)(R10*1), R9 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: - CMPL DI, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B - JB repeat_extend_forward_end_encodeSnappyBlockAsm12B - MOVW (R8)(R10*1), R9 - CMPW (BX)(R10*1), R9 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: - MOVB (R8)(R10*1), R9 - CMPB (BX)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B - LEAL 1(R10), R10 - -repeat_extend_forward_end_encodeSnappyBlockAsm12B: - ADDL R10, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - LEAL -15(DI), 
DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm12B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm12B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm12B - -no_repeat_found_encodeSnappyBlockAsm12B: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBlockAsm12B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm12B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm12B - -candidate3_match_encodeSnappyBlockAsm12B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm12B - -candidate2_match_encodeSnappyBlockAsm12B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeSnappyBlockAsm12B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBlockAsm12B - -match_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBlockAsm12B - -match_extend_back_end_encodeSnappyBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm12B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm12B - CMPL 
DI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm12B - JB three_bytes_match_emit_encodeSnappyBlockAsm12B - -three_bytes_match_emit_encodeSnappyBlockAsm12B: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -two_bytes_match_emit_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBlockAsm12B - -one_byte_match_emit_encodeSnappyBlockAsm12B: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: - MOVQ DI, AX - JMP 
emit_literal_done_match_emit_encodeSnappyBlockAsm12B - -memmove_long_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm12B: -match_nolit_loop_encodeSnappyBlockAsm12B: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B - 
-matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_match8_match_nolit_encodeSnappyBlockAsm12B: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B - JB match_nolit_end_encodeSnappyBlockAsm12B - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeSnappyBlockAsm12B - -matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm12B - LEAL 1(R9), R9 - -match_nolit_end_encodeSnappyBlockAsm12B: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: - MOVL R9, SI 
- SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm12B - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm12B: - MOVQ $0x000000cf1bbcdcbb, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x18, DI - IMULQ R8, DI - SHRQ $0x34, DI - SHLQ $0x18, BX - IMULQ R8, BX - SHRQ $0x34, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm12B - INCL CX - JMP search_loop_encodeSnappyBlockAsm12B - -emit_remainder_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B - JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B - -three_bytes_emit_remainder_encodeSnappyBlockAsm12B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - 
-two_bytes_emit_remainder_encodeSnappyBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU 
-16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56 - MOVQ dst_base+0(FP), AX - MOVQ 
$0x00000020, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm10B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm10B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm10B - LEAL 1(CX), SI - MOVL 12(SP), BX - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm10B - -repeat_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL SI, BX - JBE repeat_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(DI*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B - -repeat_extend_back_end_encodeSnappyBlockAsm10B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, $0x3c - JB 
one_byte_repeat_emit_encodeSnappyBlockAsm10B - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B - JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B - -three_bytes_repeat_emit_encodeSnappyBlockAsm10B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -two_bytes_repeat_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm10B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B - -one_byte_repeat_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(DI*1), BX - - // genMemMoveShort - CMPQ DI, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (R8), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - 
-memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - -memmove_long_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(DI*1), BX - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B: - CMPL DI, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B - XORQ 8(BX)(R10*1), R11 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B - 
-matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B: - CMPL DI, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: - CMPL DI, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B - MOVL (R8)(R10*1), R9 - CMPL (BX)(R10*1), R9 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B: - CMPL DI, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B - JB repeat_extend_forward_end_encodeSnappyBlockAsm10B - MOVW (R8)(R10*1), R9 - CMPW (BX)(R10*1), R9 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B: - MOVB (R8)(R10*1), R9 - CMPB (BX)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B - LEAL 1(R10), R10 - -repeat_extend_forward_end_encodeSnappyBlockAsm10B: - ADDL R10, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - JMP 
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm10B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm10B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm10B - -no_repeat_found_encodeSnappyBlockAsm10B: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBlockAsm10B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm10B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm10B - -candidate3_match_encodeSnappyBlockAsm10B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm10B - -candidate2_match_encodeSnappyBlockAsm10B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeSnappyBlockAsm10B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBlockAsm10B - -match_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBlockAsm10B - -match_extend_back_end_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm10B - MOVQ $0x00000000, 
ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm10B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm10B - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm10B - JB three_bytes_match_emit_encodeSnappyBlockAsm10B - -three_bytes_match_emit_encodeSnappyBlockAsm10B: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -two_bytes_match_emit_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBlockAsm10B - -one_byte_match_emit_encodeSnappyBlockAsm10B: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU -16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - 
-emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B - -memmove_long_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm10B: -match_nolit_loop_encodeSnappyBlockAsm10B: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B - MOVQ (DI)(R9*1), R8 - 
MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_match8_match_nolit_encodeSnappyBlockAsm10B: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm10B: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B - JB match_nolit_end_encodeSnappyBlockAsm10B - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeSnappyBlockAsm10B - -matchlen_match1_match_nolit_encodeSnappyBlockAsm10B: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm10B - LEAL 1(R9), R9 - -match_nolit_end_encodeSnappyBlockAsm10B: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: - 
CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm10B - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm10B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x36, DI - SHLQ $0x20, BX - IMULQ R8, BX - SHRQ $0x36, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm10B - INCL CX - JMP search_loop_encodeSnappyBlockAsm10B - -emit_remainder_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm10B - CMPL DX, $0x00000100 - JB 
two_bytes_emit_remainder_encodeSnappyBlockAsm10B - JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B - -three_bytes_emit_remainder_encodeSnappyBlockAsm10B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - 
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - 
MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000008, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBlockAsm8B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm8B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x38, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x38, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_encodeSnappyBlockAsm8B - LEAL 1(CX), SI - MOVL 12(SP), BX - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_encodeSnappyBlockAsm8B - -repeat_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL SI, BX - JBE repeat_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(DI*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(SI), SI - DECL DI - JNZ 
repeat_extend_back_loop_encodeSnappyBlockAsm8B - -repeat_extend_back_end_encodeSnappyBlockAsm8B: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_encodeSnappyBlockAsm8B - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B - JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B - -three_bytes_repeat_emit_encodeSnappyBlockAsm8B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -two_bytes_repeat_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_encodeSnappyBlockAsm8B - JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B - -one_byte_repeat_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(DI*1), BX - - // genMemMoveShort - CMPQ DI, $0x08 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ DI, $0x10 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ DI, $0x20 - JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (R8), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (R8), R9 - MOVQ -8(R8)(DI*1), R8 - MOVQ R9, (AX) - MOVQ R8, -8(AX)(DI*1) - JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R8), X0 - MOVOU -16(R8)(DI*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DI*1) - JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - -memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: - MOVQ BX, AX - JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - -memmove_long_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(DI*1), BX - - // genMemMoveLong - MOVOU (R8), X0 - MOVOU 16(R8), X1 - MOVOU -32(R8)(DI*1), X2 - MOVOU -16(R8)(DI*1), X3 - MOVQ DI, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R8)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R8)(R11*1), X4 - MOVOU -16(R8)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ DI, R11 - JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DI*1) - MOVOU X3, -16(AX)(DI*1) - MOVQ BX, AX - -emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B: - CMPL DI, $0x10 - JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B - MOVQ (R8)(R10*1), R9 - MOVQ 
8(R8)(R10*1), R11 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B - XORQ 8(BX)(R10*1), R11 - JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B: - CMPL DI, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B - -matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: - CMPL DI, $0x04 - JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B - MOVL (R8)(R10*1), R9 - CMPL (BX)(R10*1), R9 - JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: - CMPL DI, $0x01 - JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B - JB repeat_extend_forward_end_encodeSnappyBlockAsm8B - MOVW (R8)(R10*1), R9 - CMPW (BX)(R10*1), R9 - JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B - -matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: - MOVB (R8)(R10*1), R9 - CMPB (BX)(R10*1), R9 - JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B - LEAL 1(R10), R10 - -repeat_extend_forward_end_encodeSnappyBlockAsm8B: - ADDL R10, CX - MOVL CX, BX - SUBL SI, BX - MOVL 
16(SP), SI - - // emitCopy -two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B - MOVB $0xee, (AX) - MOVW SI, 1(AX) - LEAL -60(BX), BX - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B - -two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B - LEAL -15(DI), DI - MOVB SI, 1(AX) - SHRL $0x08, SI - SHLL $0x05, SI - ORL SI, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP repeat_end_emit_encodeSnappyBlockAsm8B - -emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, AX - -repeat_end_emit_encodeSnappyBlockAsm8B: - MOVL CX, 12(SP) - JMP search_loop_encodeSnappyBlockAsm8B - -no_repeat_found_encodeSnappyBlockAsm8B: - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBlockAsm8B - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_encodeSnappyBlockAsm8B - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_encodeSnappyBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBlockAsm8B - -candidate3_match_encodeSnappyBlockAsm8B: - ADDL $0x02, CX - JMP candidate_match_encodeSnappyBlockAsm8B - -candidate2_match_encodeSnappyBlockAsm8B: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_encodeSnappyBlockAsm8B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBlockAsm8B - -match_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBlockAsm8B - -match_extend_back_end_encodeSnappyBlockAsm8B: - 
MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBlockAsm8B: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), DI - CMPL DI, $0x3c - JB one_byte_match_emit_encodeSnappyBlockAsm8B - CMPL DI, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBlockAsm8B - JB three_bytes_match_emit_encodeSnappyBlockAsm8B - -three_bytes_match_emit_encodeSnappyBlockAsm8B: - MOVB $0xf4, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -two_bytes_match_emit_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DI, 1(AX) - ADDQ $0x02, AX - CMPL DI, $0x40 - JB memmove_match_emit_encodeSnappyBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBlockAsm8B - -one_byte_match_emit_encodeSnappyBlockAsm8B: - SHLB $0x02, DI - MOVB DI, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), DI - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (SI), R9 - MOVQ R9, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (SI), R9 - MOVQ -8(SI)(R8*1), SI - MOVQ R9, (AX) - MOVQ SI, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (SI), X0 - MOVOU 
-16(SI)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: - MOVQ DI, AX - JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B - -memmove_long_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(R8*1), DI - - // genMemMoveLong - MOVOU (SI), X0 - MOVOU 16(SI), X1 - MOVOU -32(SI)(R8*1), X2 - MOVOU -16(SI)(R8*1), X3 - MOVQ R8, R10 - SHRQ $0x05, R10 - MOVQ AX, R9 - ANDL $0x0000001f, R9 - MOVQ $0x00000040, R11 - SUBQ R9, R11 - DECQ R10 - JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(SI)(R11*1), R9 - LEAQ -32(AX)(R11*1), R12 - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (R9), X4 - MOVOU 16(R9), X5 - MOVOA X4, (R12) - MOVOA X5, 16(R12) - ADDQ $0x20, R12 - ADDQ $0x20, R9 - ADDQ $0x20, R11 - DECQ R10 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(SI)(R11*1), X4 - MOVOU -16(SI)(R11*1), X5 - MOVOA X4, -32(AX)(R11*1) - MOVOA X5, -16(AX)(R11*1) - ADDQ $0x20, R11 - CMPQ R8, R11 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ DI, AX - -emit_literal_done_match_emit_encodeSnappyBlockAsm8B: -match_nolit_loop_encodeSnappyBlockAsm8B: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - 
-matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B - -matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_match8_match_nolit_encodeSnappyBlockAsm8B: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B - JB match_nolit_end_encodeSnappyBlockAsm8B - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_encodeSnappyBlockAsm8B - -matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_encodeSnappyBlockAsm8B - LEAL 1(R9), R9 - -match_nolit_end_encodeSnappyBlockAsm8B: - ADDL 
R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B - MOVB $0xee, (AX) - MOVW BX, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B - -two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B - LEAL -15(SI), SI - MOVB BL, 1(AX) - SHRL $0x08, BX - SHLL $0x05, BX - ORL BX, SI - MOVB SI, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBlockAsm8B - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBlockAsm8B: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x38, DI - SHLQ $0x20, BX - IMULQ R8, BX - SHRQ $0x38, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_encodeSnappyBlockAsm8B - INCL CX - JMP search_loop_encodeSnappyBlockAsm8B - -emit_remainder_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBlockAsm8B - CMPL 
DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B - JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B - -three_bytes_emit_remainder_encodeSnappyBlockAsm8B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - 
-emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, 
-16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00001200, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x07, BX - CMPL BX, $0x63 - JBE check_maxskip_ok_encodeSnappyBetterBlockAsm - LEAL 100(CX), BX - JMP check_maxskip_cont_encodeSnappyBetterBlockAsm - -check_maxskip_ok_encodeSnappyBetterBlockAsm: - LEAL 1(CX)(BX*1), BX - -check_maxskip_cont_encodeSnappyBetterBlockAsm: - CMPL BX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x00cf1bbcdcbfa563, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x2f, R9 - SHLQ $0x20, R10 - IMULQ BX, R10 - SHRQ $0x32, R10 - MOVL 24(SP)(R9*4), BX - MOVL 524312(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 524312(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPQ R10, SI - JNE no_short_found_encodeSnappyBetterBlockAsm - MOVL DI, BX - JMP candidate_match_encodeSnappyBetterBlockAsm - -no_short_found_encodeSnappyBetterBlockAsm: - CMPL R9, SI - JEQ candidate_match_encodeSnappyBetterBlockAsm - CMPL R10, SI - 
JEQ candidateS_match_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm - -candidateS_match_encodeSnappyBetterBlockAsm: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x08, R9 - IMULQ R8, R9 - SHRQ $0x2f, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBetterBlockAsm - DECL CX - MOVL DI, BX - -candidate_match_encodeSnappyBetterBlockAsm: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - -match_extend_back_loop_encodeSnappyBetterBlockAsm: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm - -match_extend_back_end_encodeSnappyBetterBlockAsm: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP 
match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm - JB match_nolit_end_encodeSnappyBetterBlockAsm - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBetterBlockAsm - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeSnappyBetterBlockAsm - LEAL 1(R11), R11 - -match_nolit_end_encodeSnappyBetterBlockAsm: - MOVL CX, DI - SUBL BX, DI - - // Check if repeat - CMPL R11, $0x01 - JA match_length_ok_encodeSnappyBetterBlockAsm - CMPL DI, $0x0000ffff - JBE match_length_ok_encodeSnappyBetterBlockAsm - MOVL 20(SP), CX - INCL CX - JMP search_loop_encodeSnappyBetterBlockAsm - -match_length_ok_encodeSnappyBetterBlockAsm: - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - 
CMPL BX, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL BX, $0x00010000 - JB three_bytes_match_emit_encodeSnappyBetterBlockAsm - CMPL BX, $0x01000000 - JB four_bytes_match_emit_encodeSnappyBetterBlockAsm - MOVB $0xfc, (AX) - MOVL BX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -four_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVL BX, R10 - SHRL $0x10, R10 - MOVB $0xf8, (AX) - MOVW BX, 1(AX) - MOVB R10, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -three_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -two_bytes_match_emit_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm - -one_byte_match_emit_encodeSnappyBetterBlockAsm: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - 
-emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm - -memmove_long_match_emit_encodeSnappyBetterBlockAsm: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy - CMPL DI, $0x00010000 - JB 
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R11, $0x40 - JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xff, (AX) - MOVL DI, 1(AX) - LEAL -64(R11), R11 - ADDQ $0x05, AX - CMPL R11, $0x04 - JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm - -four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm: - TESTL R11, R11 - JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - XORL BX, BX - LEAL -1(BX)(R11*4), R11 - MOVB R11, (AX) - MOVL DI, 1(AX) - ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm: - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - CMPL DI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm: - MOVQ $0x00cf1bbcdcbfa563, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), 
R12 - SHLQ $0x08, R9 - IMULQ BX, R9 - SHRQ $0x2f, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x32, R10 - SHLQ $0x08, R11 - IMULQ BX, R11 - SHRQ $0x2f, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x32, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 524312(SP)(R10*4) - MOVL R13, 524312(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeSnappyBetterBlockAsm: - CMPQ DI, R8 - JAE search_loop_encodeSnappyBetterBlockAsm - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x08, R9 - IMULQ BX, R9 - SHRQ $0x2f, R9 - SHLQ $0x08, R10 - IMULQ BX, R10 - SHRQ $0x2f, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeSnappyBetterBlockAsm - -emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x00010000 - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm - CMPL DX, $0x01000000 - JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm - MOVB $0xfc, (AX) - MOVL DX, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -four_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVL DX, BX - SHRL $0x10, BX - MOVB $0xf8, (AX) - MOVW DX, 1(AX) - MOVB BL, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf4, 
(AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP 
memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm: - MOVQ dst_base+0(FP), CX - SUBQ 
CX, AX
-	MOVQ AX, ret+48(FP)
-	RET
-
-// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
-// Requires: BMI, SSE2
-//
-// Encodes src into dst and returns the number of bytes written (final write
-// pointer minus dst_base). Returns 0 whenever the output would reach the
-// limit dst_base + len(src) - 9 - len(src)/32 computed in the prologue.
-// NOTE(review): the caller presumably treats 0 as "could not compress";
-// confirm against the Go wrapper.
-//
-// Frame layout: 24 bytes of scalars followed by two tables of uint32 src
-// offsets, zeroed on entry:
-//   (SP)        dst limit (pointer)
-//   8(SP)       len(src) - 8, upper bound for search positions
-//   12(SP)      src offset of the first byte not yet written to dst
-//   16(SP)      offset (distance) of the most recent match
-//   20(SP)      next search position to use when no candidate matches
-//   24(SP)      table 1: 65536 entries, indexed by the top 16 hash bits
-//   262168(SP)  table 2: 16384 entries, indexed by the top 14 hash bits
-// Main-loop registers: AX = dst write pointer, DX = src base, CX = current
-// src offset.
-TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
-	MOVQ dst_base+0(FP), AX // AX = dst write pointer
-	MOVQ $0x00000a00, CX // 0xa00 iterations * 128 bytes = 327680 bytes (both tables)
-	LEAQ 24(SP), DX // tables start right after the scalar slots
-	PXOR X0, X0
-
-zero_loop_encodeSnappyBetterBlockAsm64K: // clear 128 bytes of table per iteration
-	MOVOU X0, (DX)
-	MOVOU X0, 16(DX)
-	MOVOU X0, 32(DX)
-	MOVOU X0, 48(DX)
-	MOVOU X0, 64(DX)
-	MOVOU X0, 80(DX)
-	MOVOU X0, 96(DX)
-	MOVOU X0, 112(DX)
-	ADDQ $0x80, DX
-	DECQ CX
-	JNZ zero_loop_encodeSnappyBetterBlockAsm64K
-	MOVL $0x00000000, 12(SP) // nothing emitted yet
-	MOVQ src_len+32(FP), CX
-	LEAQ -9(CX), DX
-	LEAQ -8(CX), BX
-	MOVL BX, 8(SP) // search limit = len(src) - 8
-	SHRQ $0x05, CX
-	SUBL CX, DX // DX = len(src) - 9 - len(src)/32
-	LEAQ (AX)(DX*1), DX
-	MOVQ DX, (SP) // dst limit
-	MOVL $0x00000001, CX // first search position is src offset 1
-	MOVL $0x00000000, 16(SP)
-	MOVQ src_base+24(FP), DX // DX = src base from here on
-
-search_loop_encodeSnappyBetterBlockAsm64K:
-	MOVL CX, BX
-	SUBL 12(SP), BX
-	SHRL $0x07, BX
-	LEAL 1(CX)(BX*1), BX // next position = s + 1 + (s - 12(SP))>>7
-	CMPL BX, 8(SP)
-	JAE emit_remainder_encodeSnappyBetterBlockAsm64K
-	MOVQ (DX)(CX*1), SI // SI = 8 bytes at src[s]
-	MOVL BX, 20(SP)
-	MOVQ $0x00cf1bbcdcbfa563, R8 // multiplier for the table-1 hash
-	MOVQ $0x9e3779b1, BX // multiplier for the table-2 hash
-	MOVQ SI, R9
-	MOVQ SI, R10
-	SHLQ $0x08, R9 // discard the top byte: hash the low 7 bytes
-	IMULQ R8, R9
-	SHRQ $0x30, R9 // keep 16 bits
-	SHLQ $0x20, R10 // keep only the low 4 bytes
-	IMULQ BX, R10
-	SHRQ $0x32, R10 // keep 14 bits
-	MOVL 24(SP)(R9*4), BX // BX = table-1 candidate offset
-	MOVL 262168(SP)(R10*4), DI // DI = table-2 candidate offset
-	MOVL CX, 24(SP)(R9*4) // record s in both tables
-	MOVL CX, 262168(SP)(R10*4)
-	MOVQ (DX)(BX*1), R9
-	MOVQ (DX)(DI*1), R10
-	CMPQ R9, SI // full 8-byte match at the table-1 candidate?
-	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
-	CMPQ R10, SI // full 8-byte match at the table-2 candidate?
-	JNE no_short_found_encodeSnappyBetterBlockAsm64K
-	MOVL DI, BX
-	JMP candidate_match_encodeSnappyBetterBlockAsm64K
-
-no_short_found_encodeSnappyBetterBlockAsm64K: // no 8-byte match; retry with 4-byte compares
-	CMPL R9, SI
-	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
-	CMPL R10, SI
-	JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
-	MOVL 20(SP), CX // no match: advance to the saved next position
-	JMP search_loop_encodeSnappyBetterBlockAsm64K
-
-candidateS_match_encodeSnappyBetterBlockAsm64K: // table-2 candidate matched 4 bytes; first probe table 1 at s+1
-	SHRQ $0x08, SI // SI now holds the bytes starting at s+1
-	MOVQ SI, R9
-	SHLQ $0x08, R9
-	IMULQ R8, R9
-	SHRQ $0x30, R9
-	MOVL 24(SP)(R9*4), BX
-	INCL CX
-	MOVL CX, 24(SP)(R9*4) // record s+1 in table 1
-	CMPL (DX)(BX*1), SI
-	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
-	DECL CX // probe failed: back to s and use the table-2 candidate
-	MOVL DI, BX
-
-candidate_match_encodeSnappyBetterBlockAsm64K: // CX = match position, BX = candidate position
-	MOVL 12(SP), SI
-	TESTL BX, BX
-	JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
-
-match_extend_back_loop_encodeSnappyBetterBlockAsm64K: // extend backwards while the preceding bytes are equal
-	CMPL CX, SI // stop at the first not-yet-emitted byte
-	JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K
-	MOVB -1(DX)(BX*1), DI
-	MOVB -1(DX)(CX*1), R8
-	CMPB DI, R8
-	JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
-	LEAL -1(CX), CX
-	DECL BX
-	JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
-	JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
-
-match_extend_back_end_encodeSnappyBetterBlockAsm64K:
-	MOVL CX, SI
-	SUBL 12(SP), SI // pending literal length
-	LEAQ 3(AX)(SI*1), SI // dst + literal length + 3
-	CMPQ SI, (SP)
-	JB match_dst_size_check_encodeSnappyBetterBlockAsm64K
-	MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
-	RET
-
-match_dst_size_check_encodeSnappyBetterBlockAsm64K:
-	MOVL CX, SI // SI = match start
-	ADDL $0x04, CX // skip 4 bytes on both sides before measuring the rest
-	ADDL $0x04, BX
-	MOVQ src_len+32(FP), DI
-	SUBL CX, DI // DI = bytes left in src after s+4
-	LEAQ (DX)(CX*1), R8
-	LEAQ (DX)(BX*1), R9
-
-	// matchLen
-	XORL R11, R11 // R11 = number of further matching bytes
-
-matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K: // compare 16 bytes per iteration
-	CMPL DI, $0x10
-	JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
-	MOVQ (R8)(R11*1), R10
-	MOVQ 8(R8)(R11*1), R12
-	XORQ (R9)(R11*1), R10
-	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
-	XORQ 8(R9)(R11*1), R12
-	JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
-	LEAL -16(DI), DI
-	LEAL 16(R11), R11
-	JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K
-
-matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K: // mismatch in the second 8 bytes
-#ifdef GOAMD64_v3
-	TZCNTQ R12, R12
-
-#else
-	BSFQ R12, R12
-
-#endif
-	SARQ $0x03, R12 // index of lowest differing bit / 8 = equal bytes
-	LEAL 8(R11)(R12*1), R11
-	JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
-
-matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
-	CMPL DI, $0x08
-	JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
-	MOVQ (R8)(R11*1), R10
-	XORQ (R9)(R11*1), R10
-	JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
-	LEAL -8(DI), DI
-	LEAL 8(R11), R11
-	JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
-
-matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K: // mismatch inside the 8 bytes held in R10
-#ifdef GOAMD64_v3
-	TZCNTQ R10, R10
-
-#else
-	BSFQ R10, R10
-
-#endif
-	SARQ $0x03, R10 // index of lowest differing bit / 8 = equal bytes
-	LEAL (R11)(R10*1), R11
-	JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
-
-matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
-	CMPL DI, $0x04
-	JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
-	MOVL (R8)(R11*1), R10
-	CMPL (R9)(R11*1), R10
-	JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
-	LEAL -4(DI), DI
-	LEAL 4(R11), R11
-
-matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
-	CMPL DI, $0x01
-	JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K // exactly one byte left
-	JB match_nolit_end_encodeSnappyBetterBlockAsm64K // nothing left
-	MOVW (R8)(R11*1), R10
-	CMPW (R9)(R11*1), R10
-	JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
-	LEAL 2(R11), R11
-	SUBL $0x02, DI
-	JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
-
-matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
-	MOVB (R8)(R11*1), R10
-	CMPB (R9)(R11*1), R10
-	JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
-	LEAL 1(R11), R11
-
-match_nolit_end_encodeSnappyBetterBlockAsm64K:
-	MOVL CX, DI
-	SUBL BX, DI // DI = match offset (distance back to the candidate)
-
-	// Check if repeat
-	MOVL DI, 16(SP)
-	MOVL 12(SP), BX
-	CMPL BX, SI
-	JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K // no pending literal bytes
-	MOVL SI, R8
-	MOVL SI, 12(SP)
-	LEAQ (DX)(BX*1), R9 // R9 = first literal byte in src
-	SUBL BX, R8 // R8 = literal length
-	LEAL -1(R8), BX // BX = length - 1, the value written into the header
-	CMPL BX, $0x3c
-	JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K
-	CMPL BX, $0x00000100
-	JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
-	JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K // never taken (same flags as the JB above); falls through to the same label
-
-three_bytes_match_emit_encodeSnappyBetterBlockAsm64K: // header: 0xf4 + 16-bit (length-1)
-	MOVB $0xf4, (AX)
-	MOVW BX, 1(AX)
-	ADDQ $0x03, AX
-	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
-
-two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: // header: 0xf0 + 8-bit (length-1)
-	MOVB $0xf0, (AX)
-	MOVB BL, 1(AX)
-	ADDQ $0x02, AX
-	CMPL BX, $0x40
-	JB memmove_match_emit_encodeSnappyBetterBlockAsm64K // length <= 64 fits the short copy
-	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
-
-one_byte_match_emit_encodeSnappyBetterBlockAsm64K: // header: single byte (length-1)<<2
-	SHLB $0x02, BL
-	MOVB BL, (AX)
-	ADDQ $0x01, AX
-
-memmove_match_emit_encodeSnappyBetterBlockAsm64K:
-	LEAQ (AX)(R8*1), BX // BX = dst pointer after the literal
-
-	// genMemMoveShort
-	CMPQ R8, $0x08
-	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
-	CMPQ R8, $0x10
-	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
-	CMPQ R8, $0x20
-	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
-	JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
-
-emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: // always moves 8 bytes even when R8 < 8; NOTE(review): presumably relies on slack in src/dst - confirm
-	MOVQ (R9), R10
-	MOVQ R10, (AX)
-	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: // two overlapping 8-byte moves (head and tail)
-	MOVQ (R9), R10
-	MOVQ -8(R9)(R8*1), R9
-	MOVQ R10, (AX)
-	MOVQ R9, -8(AX)(R8*1)
-	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: // two overlapping 16-byte moves
-	MOVOU (R9), X0
-	MOVOU -16(R9)(R8*1), X1
-	MOVOU X0, (AX)
-	MOVOU X1, -16(AX)(R8*1)
-	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: // first 32 and last 32 bytes, overlapping
-	MOVOU (R9), X0
-	MOVOU 16(R9), X1
-	MOVOU -32(R9)(R8*1), X2
-	MOVOU -16(R9)(R8*1), X3
-	MOVOU X0, (AX)
-	MOVOU X1, 16(AX)
-	MOVOU X2, -32(AX)(R8*1)
-	MOVOU X3, -16(AX)(R8*1)
-
-memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
-	MOVQ BX, AX // advance dst past the literal
-	JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
-
-memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
-	LEAQ (AX)(R8*1), BX // BX = dst pointer after the literal
-
-	// genMemMoveLong
-	MOVOU (R9), X0 // X0-X3 keep the first and last 32 bytes; they are stored after the loop
-	MOVOU 16(R9), X1
-	MOVOU -32(R9)(R8*1), X2
-	MOVOU -16(R9)(R8*1), X3
-	MOVQ R8, R12
-	SHRQ $0x05, R12 // R12 = length / 32
-	MOVQ AX, R10
-	ANDL $0x0000001f, R10
-	MOVQ $0x00000040, R13
-	SUBQ R10, R13 // R13 = 64 - (dst & 31), so dst-32+R13 is 32-byte aligned
-	DECQ R12
-	JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
-	LEAQ -32(R9)(R13*1), R10
-	LEAQ -32(AX)(R13*1), R14
-
-emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
-	MOVOU (R10), X4
-	MOVOU 16(R10), X5
-	MOVOA X4, (R14) // aligned stores to dst
-	MOVOA X5, 16(R14)
-	ADDQ $0x20, R14
-	ADDQ $0x20, R10
-	ADDQ $0x20, R13
-	DECQ R12
-	JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
-
-emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: // 32 bytes per iteration while R13 <= length
-	MOVOU -32(R9)(R13*1), X4
-	MOVOU -16(R9)(R13*1), X5
-	MOVOA X4, -32(AX)(R13*1)
-	MOVOA X5, -16(AX)(R13*1)
-	ADDQ $0x20, R13
-	CMPQ R8, R13
-	JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
-	MOVOU X0, (AX) // finally write the saved unaligned head and tail
-	MOVOU X1, 16(AX)
-	MOVOU X2, -32(AX)(R8*1)
-	MOVOU X3, -16(AX)(R8*1)
-	MOVQ BX, AX
-
-emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
-	ADDL R11, CX // s now points just past the match
-	ADDL $0x04, R11 // total match length = 4 + extension
-	MOVL CX, 12(SP) // everything up to s counts as emitted
-
-	// emitCopy
-two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: // while length > 64: emit a 3-byte copy of 60 bytes
-	CMPL R11, $0x40
-	JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
-	MOVB $0xee, (AX) // 0xee = (60-1)<<2 | 2
-	MOVW DI, 1(AX) // 16-bit offset
-	LEAL -60(R11), R11
-	ADDQ $0x03, AX
-	JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
-
-two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
-	MOVL R11, BX
-	SHLL $0x02, BX // BX = length * 4
-	CMPL R11, $0x0c
-	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K // length >= 12 needs the 3-byte form
-	CMPL DI, $0x00000800
-	JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K // offset >= 2048 needs the 3-byte form
-	LEAL -15(BX), BX // (length-4)<<2 | 1
-	MOVB DI, 1(AX) // low 8 bits of the offset
-	SHRL $0x08, DI
-	SHLL $0x05, DI // high offset bits go into bits 5-7 of the tag
-	ORL DI, BX
-	MOVB BL, (AX)
-	ADDQ $0x02, AX
-	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
-
-emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
-	LEAL -2(BX), BX // (length-1)<<2 | 2
-	MOVB BL, (AX)
-	MOVW DI, 1(AX) // 16-bit offset
-	ADDQ $0x03, AX
-
-match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
-	CMPL CX, 8(SP)
-	JAE emit_remainder_encodeSnappyBetterBlockAsm64K // past the search limit: flush the tail
-	CMPQ AX, (SP)
-	JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
-	MOVQ $0x00000000, ret+48(FP) // reached the dst limit: return 0
-	RET
-
-match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: // add positions inside the match to the tables
-	MOVQ $0x00cf1bbcdcbfa563, BX // table-1 multiplier
-	MOVQ $0x9e3779b1, DI // table-2 multiplier
-	LEAQ 1(SI), SI // SI = match start + 1
-	LEAQ -2(CX), R8 // R8 = s - 2
-	MOVQ (DX)(SI*1), R9
-	MOVQ 1(DX)(SI*1), R10
-	MOVQ (DX)(R8*1), R11
-	MOVQ 1(DX)(R8*1), R12
-	SHLQ $0x08, R9
-	IMULQ BX, R9
-	SHRQ $0x30, R9
-	SHLQ $0x20, R10
-	IMULQ DI, R10
-	SHRQ $0x32, R10
-	SHLQ $0x08, R11
-	IMULQ BX, R11
-	SHRQ $0x30, R11
-	SHLQ $0x20, R12
-	IMULQ DI, R12
-	SHRQ $0x32, R12
-	LEAQ 1(SI), DI
-	LEAQ 1(R8), R13
-	MOVL SI, 24(SP)(R9*4) // table 1: start+1 and s-2
-	MOVL R8, 24(SP)(R11*4)
-	MOVL DI, 262168(SP)(R10*4) // table 2: start+2 and s-1
-	MOVL R13, 262168(SP)(R12*4)
-	LEAQ 1(R8)(SI*1), DI
-	SHRQ $0x01, DI // DI = (SI + R8 + 1) / 2, a second cursor starting mid-range
-	ADDQ $0x01, SI
-	SUBQ $0x01, R8
-
-index_loop_encodeSnappyBetterBlockAsm64K: // store every second position from both cursors in table 1
-	CMPQ DI, R8
-	JAE search_loop_encodeSnappyBetterBlockAsm64K
-	MOVQ (DX)(SI*1), R9
-	MOVQ (DX)(DI*1), R10
-	SHLQ $0x08, R9
-	IMULQ BX, R9
-	SHRQ $0x30, R9
-	SHLQ $0x08, R10
-	IMULQ BX, R10
-	SHRQ $0x30, R10
-	MOVL SI, 24(SP)(R9*4)
-	MOVL DI, 24(SP)(R10*4)
-	ADDQ $0x02, SI
-	ADDQ $0x02, DI
-	JMP index_loop_encodeSnappyBetterBlockAsm64K
-
-emit_remainder_encodeSnappyBetterBlockAsm64K: // flush the bytes after the last match as one literal
-	MOVQ src_len+32(FP), CX
-	SUBL 12(SP), CX // remaining literal length
-	LEAQ 3(AX)(CX*1), CX
-	CMPQ CX, (SP)
-	JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K
-	MOVQ $0x00000000, ret+48(FP) // would reach the dst limit: return 0
-	RET
-
-emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
-	MOVQ src_len+32(FP), CX
-	MOVL 12(SP), BX
-	CMPL BX, CX
-	JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K // nothing left to emit
-	MOVL CX, SI
-	MOVL CX, 12(SP)
-	LEAQ (DX)(BX*1), CX // CX = first literal byte in src
-	SUBL BX, SI // SI = literal length
-	LEAL -1(SI), DX // DX = length - 1 (src base is no longer needed)
-	CMPL DX, $0x3c
-	JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
-	CMPL DX, $0x00000100
-	JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
-	JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K // never taken (same flags as the JB above); falls through to the same label
-
-three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: // header: 0xf4 + 16-bit (length-1)
-	MOVB $0xf4, (AX)
-	MOVW DX, 1(AX)
-	ADDQ $0x03, AX
-	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: // header: 0xf0 + 8-bit (length-1)
-	MOVB $0xf0, (AX)
-	MOVB DL, 1(AX)
-	ADDQ $0x02, AX
-	CMPL DX, $0x40
-	JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
-	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: // header: single byte (length-1)<<2
-	SHLB $0x02, DL
-	MOVB DL, (AX)
-	ADDQ $0x01, AX
-
-memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
-	LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal
-	MOVL SI, BX // BX = literal length
-
-	// genMemMoveShort
-	CMPQ BX, $0x03
-	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
-	JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
-	CMPQ BX, $0x08
-	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
-	CMPQ BX, $0x10
-	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
-	CMPQ BX, $0x20
-	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
-	JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
-
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: // first and last byte (the same byte when length is 1)
-	MOVB (CX), SI
-	MOVB -1(CX)(BX*1), CL
-	MOVB SI, (AX)
-	MOVB CL, -1(AX)(BX*1)
-	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
-	MOVW (CX), SI
-	MOVB 2(CX), CL
-	MOVW SI, (AX)
-	MOVB CL, 2(AX)
-	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: // two overlapping 4-byte moves
-	MOVL (CX), SI
-	MOVL -4(CX)(BX*1), CX
-	MOVL SI, (AX)
-	MOVL CX, -4(AX)(BX*1)
-	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: // two overlapping 8-byte moves
-	MOVQ (CX), SI
-	MOVQ -8(CX)(BX*1), CX
-	MOVQ SI, (AX)
-	MOVQ CX, -8(AX)(BX*1)
-	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: // two overlapping 16-byte moves
-	MOVOU (CX), X0
-	MOVOU -16(CX)(BX*1), X1
-	MOVOU X0, (AX)
-	MOVOU X1, -16(AX)(BX*1)
-	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: // first 32 and last 32 bytes, overlapping
-	MOVOU (CX), X0
-	MOVOU 16(CX), X1
-	MOVOU -32(CX)(BX*1), X2
-	MOVOU -16(CX)(BX*1), X3
-	MOVOU X0, (AX)
-	MOVOU X1, 16(AX)
-	MOVOU X2, -32(AX)(BX*1)
-	MOVOU X3, -16(AX)(BX*1)
-
-memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
-	MOVQ DX, AX // advance dst past the literal
-	JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
-
-memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
-	LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal
-	MOVL SI, BX // BX = literal length
-
-	// genMemMoveLong
-	MOVOU (CX), X0 // X0-X3 keep the first and last 32 bytes; they are stored after the loop
-	MOVOU 16(CX), X1
-	MOVOU -32(CX)(BX*1), X2
-	MOVOU -16(CX)(BX*1), X3
-	MOVQ BX, DI
-	SHRQ $0x05, DI // DI = length / 32
-	MOVQ AX, SI
-	ANDL $0x0000001f, SI
-	MOVQ $0x00000040, R8
-	SUBQ SI, R8 // R8 = 64 - (dst & 31), so dst-32+R8 is 32-byte aligned
-	DECQ DI
-	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
-	LEAQ -32(CX)(R8*1), SI
-	LEAQ -32(AX)(R8*1), R9
-
-emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
-	MOVOU (SI), X4
-	MOVOU 16(SI), X5
-	MOVOA X4, (R9) // aligned stores to dst
-	MOVOA X5, 16(R9)
-	ADDQ $0x20, R9
-	ADDQ $0x20, SI
-	ADDQ $0x20, R8
-	DECQ DI
-	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
-
-emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: // 32 bytes per iteration while R8 <= length
-	MOVOU -32(CX)(R8*1), X4
-	MOVOU -16(CX)(R8*1), X5
-	MOVOA X4, -32(AX)(R8*1)
-	MOVOA X5, -16(AX)(R8*1)
-	ADDQ $0x20, R8
-	CMPQ BX, R8
-	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
-	MOVOU X0, (AX) // finally write the saved unaligned head and tail
-	MOVOU X1, 16(AX)
-	MOVOU X2, -32(AX)(BX*1)
-	MOVOU X3, -16(AX)(BX*1)
-	MOVQ DX, AX
-
-emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
-	MOVQ dst_base+0(FP), CX
-	SUBQ CX, AX // bytes written = write pointer - dst base
-	MOVQ AX, ret+48(FP)
-	RET
-
-// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
-// Requires: BMI, SSE2
-TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
-	MOVQ dst_base+0(FP), AX
-	MOVQ $0x00000280, CX
-	LEAQ 24(SP), DX
-	PXOR X0, X0
-
-zero_loop_encodeSnappyBetterBlockAsm12B:
-	MOVOU X0, (DX)
-	MOVOU X0, 16(DX)
-	MOVOU X0, 32(DX)
-	MOVOU X0, 48(DX)
-	MOVOU X0, 64(DX)
-	MOVOU X0, 80(DX)
-	MOVOU X0, 96(DX)
-	MOVOU X0, 112(DX)
-	ADDQ $0x80, DX
-	DECQ CX
-	JNZ zero_loop_encodeSnappyBetterBlockAsm12B
-	MOVL $0x00000000, 12(SP)
-	MOVQ src_len+32(FP), CX
-	LEAQ -9(CX), DX
-	LEAQ -8(CX), BX
-	MOVL BX, 8(SP)
-	SHRQ $0x05, CX
-	SUBL CX, DX
-	LEAQ (AX)(DX*1), DX
-	MOVQ DX, (SP)
-	MOVL $0x00000001, CX
-	MOVL $0x00000000, 16(SP)
-	MOVQ src_base+24(FP), DX
-
-search_loop_encodeSnappyBetterBlockAsm12B:
-	MOVL CX, BX
-	SUBL 12(SP), BX
-	SHRL $0x06, BX
-	LEAL 1(CX)(BX*1), BX
-	CMPL BX, 8(SP)
-	JAE emit_remainder_encodeSnappyBetterBlockAsm12B
-	MOVQ (DX)(CX*1), SI
-	MOVL BX, 20(SP)
-	MOVQ $0x0000cf1bbcdcbf9b, R8
-	MOVQ $0x9e3779b1, BX
-	MOVQ SI, R9
-	MOVQ SI, R10
-	SHLQ $0x10, R9
-	IMULQ R8, R9
-	SHRQ $0x32, R9
-	SHLQ $0x20, R10
-	IMULQ BX, R10
-	SHRQ $0x34, R10
-	MOVL 24(SP)(R9*4), BX
-	MOVL 65560(SP)(R10*4), DI
-	MOVL CX, 24(SP)(R9*4)
-	MOVL CX, 65560(SP)(R10*4)
-	MOVQ (DX)(BX*1), R9
-	MOVQ (DX)(DI*1), R10
-	CMPQ R9, SI
-	JEQ candidate_match_encodeSnappyBetterBlockAsm12B
-	CMPQ R10, SI
-	JNE no_short_found_encodeSnappyBetterBlockAsm12B
-	MOVL DI, BX
-	JMP candidate_match_encodeSnappyBetterBlockAsm12B
-
-no_short_found_encodeSnappyBetterBlockAsm12B:
-	CMPL R9, SI
-	JEQ candidate_match_encodeSnappyBetterBlockAsm12B
-	CMPL R10, SI
-	JEQ
candidateS_match_encodeSnappyBetterBlockAsm12B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm12B - -candidateS_match_encodeSnappyBetterBlockAsm12B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x32, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBetterBlockAsm12B - DECL CX - MOVL DI, BX - -candidate_match_encodeSnappyBetterBlockAsm12B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - -match_extend_back_loop_encodeSnappyBetterBlockAsm12B: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B - -match_extend_back_end_encodeSnappyBetterBlockAsm12B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm12B: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, 
R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B - JB match_nolit_end_encodeSnappyBetterBlockAsm12B - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBetterBlockAsm12B - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeSnappyBetterBlockAsm12B - LEAL 1(R11), R11 - -match_nolit_end_encodeSnappyBetterBlockAsm12B: - MOVL CX, DI - SUBL BX, DI - - // Check if repeat - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B - JB 
three_bytes_match_emit_encodeSnappyBetterBlockAsm12B - -three_bytes_match_emit_encodeSnappyBetterBlockAsm12B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm12B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B - -one_byte_match_emit_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - 
-memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - CMPL DI, $0x00000800 - JAE 
emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm12B - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: - MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x32, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x34, R10 - SHLQ $0x10, R11 - IMULQ BX, R11 - SHRQ $0x32, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x34, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 65560(SP)(R10*4) - MOVL R13, 65560(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeSnappyBetterBlockAsm12B: - CMPQ DI, R8 - JAE search_loop_encodeSnappyBetterBlockAsm12B - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x32, R9 - SHLQ $0x10, R10 - IMULQ BX, R10 - SHRQ $0x32, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeSnappyBetterBlockAsm12B - -emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm12B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ 
emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 
2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back - 
-emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x000000a0, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm10B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm10B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm10B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 1(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm10B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ BX, R10 - SHRQ $0x36, R10 - MOVL 24(SP)(R9*4), BX - MOVL 16408(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 16408(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPQ R10, SI - JNE 
no_short_found_encodeSnappyBetterBlockAsm10B - MOVL DI, BX - JMP candidate_match_encodeSnappyBetterBlockAsm10B - -no_short_found_encodeSnappyBetterBlockAsm10B: - CMPL R9, SI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - CMPL R10, SI - JEQ candidateS_match_encodeSnappyBetterBlockAsm10B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm10B - -candidateS_match_encodeSnappyBetterBlockAsm10B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x34, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBetterBlockAsm10B - DECL CX - MOVL DI, BX - -candidate_match_encodeSnappyBetterBlockAsm10B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - -match_extend_back_loop_encodeSnappyBetterBlockAsm10B: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B - -match_extend_back_end_encodeSnappyBetterBlockAsm10B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm10B: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL DI, $0x10 - JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B - LEAL 
-16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B - JB match_nolit_end_encodeSnappyBetterBlockAsm10B - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBetterBlockAsm10B - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE match_nolit_end_encodeSnappyBetterBlockAsm10B - LEAL 1(R11), R11 - -match_nolit_end_encodeSnappyBetterBlockAsm10B: - MOVL CX, DI - SUBL BX, DI - - // Check if repeat - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - MOVL 
SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B - JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B - -three_bytes_match_emit_encodeSnappyBetterBlockAsm10B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm10B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B - -one_byte_match_emit_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B - 
-emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - JMP 
two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - CMPL DI, $0x00000800 - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm10B - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: - MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x34, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x36, R10 - SHLQ $0x10, R11 - IMULQ BX, R11 - SHRQ $0x34, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x36, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 16408(SP)(R10*4) - MOVL R13, 16408(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeSnappyBetterBlockAsm10B: - CMPQ DI, R8 - JAE search_loop_encodeSnappyBetterBlockAsm10B - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x34, R9 - SHLQ $0x10, R10 - IMULQ BX, R10 - SHRQ $0x34, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP index_loop_encodeSnappyBetterBlockAsm10B - -emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX 
- LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm10B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - 
MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - 
-emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int -// Requires: BMI, SSE2 -TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 - MOVQ dst_base+0(FP), AX - MOVQ $0x00000028, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_encodeSnappyBetterBlockAsm8B: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_encodeSnappyBetterBlockAsm8B - MOVL $0x00000000, 12(SP) - MOVQ src_len+32(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL $0x00000000, 16(SP) - MOVQ src_base+24(FP), DX - -search_loop_encodeSnappyBetterBlockAsm8B: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 1(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm8B - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ $0x9e3779b1, BX - MOVQ SI, R9 - MOVQ SI, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - 
IMULQ BX, R10 - SHRQ $0x38, R10 - MOVL 24(SP)(R9*4), BX - MOVL 4120(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - MOVL CX, 4120(SP)(R10*4) - MOVQ (DX)(BX*1), R9 - MOVQ (DX)(DI*1), R10 - CMPQ R9, SI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPQ R10, SI - JNE no_short_found_encodeSnappyBetterBlockAsm8B - MOVL DI, BX - JMP candidate_match_encodeSnappyBetterBlockAsm8B - -no_short_found_encodeSnappyBetterBlockAsm8B: - CMPL R9, SI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - CMPL R10, SI - JEQ candidateS_match_encodeSnappyBetterBlockAsm8B - MOVL 20(SP), CX - JMP search_loop_encodeSnappyBetterBlockAsm8B - -candidateS_match_encodeSnappyBetterBlockAsm8B: - SHRQ $0x08, SI - MOVQ SI, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x36, R9 - MOVL 24(SP)(R9*4), BX - INCL CX - MOVL CX, 24(SP)(R9*4) - CMPL (DX)(BX*1), SI - JEQ candidate_match_encodeSnappyBetterBlockAsm8B - DECL CX - MOVL DI, BX - -candidate_match_encodeSnappyBetterBlockAsm8B: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - -match_extend_back_loop_encodeSnappyBetterBlockAsm8B: - CMPL CX, SI - JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B - JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B - -match_extend_back_end_encodeSnappyBetterBlockAsm8B: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_dst_size_check_encodeSnappyBetterBlockAsm8B: - MOVL CX, SI - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), R9 - - // matchLen - XORL R11, R11 - -matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL DI, $0x10 - JB 
matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B - MOVQ (R8)(R11*1), R10 - MOVQ 8(R8)(R11*1), R12 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B - XORQ 8(R9)(R11*1), R12 - JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -16(DI), DI - LEAL 16(R11), R11 - JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R12, R12 - -#else - BSFQ R12, R12 - -#endif - SARQ $0x03, R12 - LEAL 8(R11)(R12*1), R11 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL DI, $0x08 - JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -8(DI), DI - LEAL 8(R11), R11 - JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B - -matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL (R11)(R10*1), R11 - JMP match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL DI, $0x04 - JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B - MOVL (R8)(R11*1), R10 - CMPL (R9)(R11*1), R10 - JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -4(DI), DI - LEAL 4(R11), R11 - -matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL DI, $0x01 - JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B - JB match_nolit_end_encodeSnappyBetterBlockAsm8B - MOVW (R8)(R11*1), R10 - CMPW (R9)(R11*1), R10 - JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL 2(R11), R11 - SUBL $0x02, DI - JZ match_nolit_end_encodeSnappyBetterBlockAsm8B - -matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVB (R8)(R11*1), R10 - CMPB (R9)(R11*1), R10 - JNE 
match_nolit_end_encodeSnappyBetterBlockAsm8B - LEAL 1(R11), R11 - -match_nolit_end_encodeSnappyBetterBlockAsm8B: - MOVL CX, DI - SUBL BX, DI - - // Check if repeat - MOVL DI, 16(SP) - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R9 - SUBL BX, R8 - LEAL -1(R8), BX - CMPL BX, $0x3c - JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B - CMPL BX, $0x00000100 - JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B - JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B - -three_bytes_match_emit_encodeSnappyBetterBlockAsm8B: - MOVB $0xf4, (AX) - MOVW BX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB BL, 1(AX) - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_match_emit_encodeSnappyBetterBlockAsm8B - JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B - -one_byte_match_emit_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, BL - MOVB BL, (AX) - ADDQ $0x01, AX - -memmove_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(R8*1), BX - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: - MOVQ (R9), R10 - MOVQ R10, (AX) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (R9), R10 - MOVQ -8(R9)(R8*1), R9 - MOVQ R10, (AX) - MOVQ R9, -8(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - 
-emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: - MOVQ BX, AX - JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B - -memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(R8*1), BX - - // genMemMoveLong - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R10 - ANDL $0x0000001f, R10 - MOVQ $0x00000040, R13 - SUBQ R10, R13 - DECQ R12 - JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ -32(R9)(R13*1), R10 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (R10), X4 - MOVOU 16(R10), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R10 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(R9)(R13*1), X4 - MOVOU -16(R9)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ BX, AX - -emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: - ADDL R11, CX - ADDL $0x04, R11 - MOVL CX, 12(SP) - - // emitCopy 
-two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: - CMPL R11, $0x40 - JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B - MOVB $0xee, (AX) - MOVW DI, 1(AX) - LEAL -60(R11), R11 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B - -two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVL R11, BX - SHLL $0x02, BX - CMPL R11, $0x0c - JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B - LEAL -15(BX), BX - MOVB DI, 1(AX) - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, BX - MOVB BL, (AX) - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B - -emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: - LEAL -2(BX), BX - MOVB BL, (AX) - MOVW DI, 1(AX) - ADDQ $0x03, AX - -match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: - CMPL CX, 8(SP) - JAE emit_remainder_encodeSnappyBetterBlockAsm8B - CMPQ AX, (SP) - JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: - MOVQ $0x0000cf1bbcdcbf9b, BX - MOVQ $0x9e3779b1, DI - LEAQ 1(SI), SI - LEAQ -2(CX), R8 - MOVQ (DX)(SI*1), R9 - MOVQ 1(DX)(SI*1), R10 - MOVQ (DX)(R8*1), R11 - MOVQ 1(DX)(R8*1), R12 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x36, R9 - SHLQ $0x20, R10 - IMULQ DI, R10 - SHRQ $0x38, R10 - SHLQ $0x10, R11 - IMULQ BX, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ DI, R12 - SHRQ $0x38, R12 - LEAQ 1(SI), DI - LEAQ 1(R8), R13 - MOVL SI, 24(SP)(R9*4) - MOVL R8, 24(SP)(R11*4) - MOVL DI, 4120(SP)(R10*4) - MOVL R13, 4120(SP)(R12*4) - LEAQ 1(R8)(SI*1), DI - SHRQ $0x01, DI - ADDQ $0x01, SI - SUBQ $0x01, R8 - -index_loop_encodeSnappyBetterBlockAsm8B: - CMPQ DI, R8 - JAE search_loop_encodeSnappyBetterBlockAsm8B - MOVQ (DX)(SI*1), R9 - MOVQ (DX)(DI*1), R10 - SHLQ $0x10, R9 - IMULQ BX, R9 - SHRQ $0x36, R9 - SHLQ $0x10, R10 - IMULQ BX, R10 - SHRQ $0x36, R10 - MOVL SI, 24(SP)(R9*4) - MOVL DI, 24(SP)(R10*4) - ADDQ $0x02, SI - ADDQ $0x02, DI - JMP 
index_loop_encodeSnappyBetterBlockAsm8B - -emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B - MOVQ $0x00000000, ret+48(FP) - RET - -emit_remainder_ok_encodeSnappyBetterBlockAsm8B: - MOVQ src_len+32(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), DX - CMPL DX, $0x3c - JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B - CMPL DX, $0x00000100 - JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B - JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B - -three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVB $0xf4, (AX) - MOVW DX, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVB $0xf0, (AX) - MOVB DL, 1(AX) - ADDQ $0x02, AX - CMPL DX, $0x40 - JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B - JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B - -one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: - SHLB $0x02, DL - MOVB DL, (AX) - ADDQ $0x01, AX - -memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveShort - CMPQ BX, $0x03 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 - JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 - CMPQ BX, $0x08 - JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 - CMPQ BX, $0x10 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 - CMPQ BX, $0x20 - JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 - JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 - 
-emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(BX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(BX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(BX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(BX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(BX*1) - JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B - -emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - -memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ DX, AX - JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B - -memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: - LEAQ (AX)(SI*1), DX - MOVL SI, BX - - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(BX*1), X2 - MOVOU -16(CX)(BX*1), X3 - MOVQ BX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - LEAQ 
-32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back - -emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ BX, R8 - JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BX*1) - MOVOU X3, -16(AX)(BX*1) - MOVQ DX, AX - -emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ AX, ret+48(FP) - RET - -// func calcBlockSize(src []byte) int -// Requires: BMI, SSE2 -TEXT ·calcBlockSize(SB), $32792-32 - XORQ AX, AX - MOVQ $0x00000100, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_calcBlockSize: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_calcBlockSize - MOVL $0x00000000, 12(SP) - MOVQ src_len+8(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+0(FP), DX - -search_loop_calcBlockSize: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_calcBlockSize - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x33, R9 - SHLQ $0x10, R10 - IMULQ R8, R10 - SHRQ $0x33, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 
24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x10, R9 - IMULQ R8, R9 - SHRQ $0x33, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_calcBlockSize - LEAL 1(CX), SI - MOVL 12(SP), BX - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_calcBlockSize - -repeat_extend_back_loop_calcBlockSize: - CMPL SI, BX - JBE repeat_extend_back_end_calcBlockSize - MOVB -1(DX)(DI*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_calcBlockSize - LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_calcBlockSize - -repeat_extend_back_end_calcBlockSize: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_calcBlockSize - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_calcBlockSize - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_calcBlockSize - CMPL BX, $0x00010000 - JB three_bytes_repeat_emit_calcBlockSize - CMPL BX, $0x01000000 - JB four_bytes_repeat_emit_calcBlockSize - ADDQ $0x05, AX - JMP memmove_long_repeat_emit_calcBlockSize - -four_bytes_repeat_emit_calcBlockSize: - ADDQ $0x04, AX - JMP memmove_long_repeat_emit_calcBlockSize - -three_bytes_repeat_emit_calcBlockSize: - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_calcBlockSize - -two_bytes_repeat_emit_calcBlockSize: - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_calcBlockSize - JMP memmove_long_repeat_emit_calcBlockSize - -one_byte_repeat_emit_calcBlockSize: - ADDQ $0x01, AX - -memmove_repeat_emit_calcBlockSize: - LEAQ (AX)(DI*1), AX - JMP emit_literal_done_repeat_emit_calcBlockSize - -memmove_long_repeat_emit_calcBlockSize: - LEAQ (AX)(DI*1), AX - -emit_literal_done_repeat_emit_calcBlockSize: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+8(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R10, R10 - 
-matchlen_loopback_16_repeat_extend_calcBlockSize: - CMPL DI, $0x10 - JB matchlen_match8_repeat_extend_calcBlockSize - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSize - XORQ 8(BX)(R10*1), R11 - JNZ matchlen_bsf_16repeat_extend_calcBlockSize - LEAL -16(DI), DI - LEAL 16(R10), R10 - JMP matchlen_loopback_16_repeat_extend_calcBlockSize - -matchlen_bsf_16repeat_extend_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP repeat_extend_forward_end_calcBlockSize - -matchlen_match8_repeat_extend_calcBlockSize: - CMPL DI, $0x08 - JB matchlen_match4_repeat_extend_calcBlockSize - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSize - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_repeat_extend_calcBlockSize - -matchlen_bsf_8_repeat_extend_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_calcBlockSize - -matchlen_match4_repeat_extend_calcBlockSize: - CMPL DI, $0x04 - JB matchlen_match2_repeat_extend_calcBlockSize - MOVL (R8)(R10*1), R9 - CMPL (BX)(R10*1), R9 - JNE matchlen_match2_repeat_extend_calcBlockSize - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_repeat_extend_calcBlockSize: - CMPL DI, $0x01 - JE matchlen_match1_repeat_extend_calcBlockSize - JB repeat_extend_forward_end_calcBlockSize - MOVW (R8)(R10*1), R9 - CMPW (BX)(R10*1), R9 - JNE matchlen_match1_repeat_extend_calcBlockSize - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ repeat_extend_forward_end_calcBlockSize - -matchlen_match1_repeat_extend_calcBlockSize: - MOVB (R8)(R10*1), R9 - CMPB (BX)(R10*1), R9 - JNE repeat_extend_forward_end_calcBlockSize - LEAL 1(R10), R10 - -repeat_extend_forward_end_calcBlockSize: - ADDL R10, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - - // emitCopy - CMPL SI, $0x00010000 - 
JB two_byte_offset_repeat_as_copy_calcBlockSize - -four_bytes_loop_back_repeat_as_copy_calcBlockSize: - CMPL BX, $0x40 - JBE four_bytes_remain_repeat_as_copy_calcBlockSize - LEAL -64(BX), BX - ADDQ $0x05, AX - CMPL BX, $0x04 - JB four_bytes_remain_repeat_as_copy_calcBlockSize - JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize - -four_bytes_remain_repeat_as_copy_calcBlockSize: - TESTL BX, BX - JZ repeat_end_emit_calcBlockSize - XORL BX, BX - ADDQ $0x05, AX - JMP repeat_end_emit_calcBlockSize - -two_byte_offset_repeat_as_copy_calcBlockSize: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_calcBlockSize - LEAL -60(BX), BX - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_calcBlockSize - -two_byte_offset_short_repeat_as_copy_calcBlockSize: - MOVL BX, DI - SHLL $0x02, DI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_calcBlockSize - CMPL SI, $0x00000800 - JAE emit_copy_three_repeat_as_copy_calcBlockSize - ADDQ $0x02, AX - JMP repeat_end_emit_calcBlockSize - -emit_copy_three_repeat_as_copy_calcBlockSize: - ADDQ $0x03, AX - -repeat_end_emit_calcBlockSize: - MOVL CX, 12(SP) - JMP search_loop_calcBlockSize - -no_repeat_found_calcBlockSize: - CMPL (DX)(BX*1), SI - JEQ candidate_match_calcBlockSize - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_calcBlockSize - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_calcBlockSize - MOVL 20(SP), CX - JMP search_loop_calcBlockSize - -candidate3_match_calcBlockSize: - ADDL $0x02, CX - JMP candidate_match_calcBlockSize - -candidate2_match_calcBlockSize: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_calcBlockSize: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_calcBlockSize - -match_extend_back_loop_calcBlockSize: - CMPL CX, SI - JBE match_extend_back_end_calcBlockSize - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_calcBlockSize - LEAL -1(CX), CX 
- DECL BX - JZ match_extend_back_end_calcBlockSize - JMP match_extend_back_loop_calcBlockSize - -match_extend_back_end_calcBlockSize: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 5(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_calcBlockSize - MOVQ $0x00000000, ret+24(FP) - RET - -match_dst_size_check_calcBlockSize: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_calcBlockSize - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_match_emit_calcBlockSize - CMPL SI, $0x00000100 - JB two_bytes_match_emit_calcBlockSize - CMPL SI, $0x00010000 - JB three_bytes_match_emit_calcBlockSize - CMPL SI, $0x01000000 - JB four_bytes_match_emit_calcBlockSize - ADDQ $0x05, AX - JMP memmove_long_match_emit_calcBlockSize - -four_bytes_match_emit_calcBlockSize: - ADDQ $0x04, AX - JMP memmove_long_match_emit_calcBlockSize - -three_bytes_match_emit_calcBlockSize: - ADDQ $0x03, AX - JMP memmove_long_match_emit_calcBlockSize - -two_bytes_match_emit_calcBlockSize: - ADDQ $0x02, AX - CMPL SI, $0x40 - JB memmove_match_emit_calcBlockSize - JMP memmove_long_match_emit_calcBlockSize - -one_byte_match_emit_calcBlockSize: - ADDQ $0x01, AX - -memmove_match_emit_calcBlockSize: - LEAQ (AX)(R8*1), AX - JMP emit_literal_done_match_emit_calcBlockSize - -memmove_long_match_emit_calcBlockSize: - LEAQ (AX)(R8*1), AX - -emit_literal_done_match_emit_calcBlockSize: -match_nolit_loop_calcBlockSize: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+8(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_calcBlockSize: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_calcBlockSize - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_calcBlockSize - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_calcBlockSize - LEAL -16(SI), SI - LEAL 
16(R9), R9 - JMP matchlen_loopback_16_match_nolit_calcBlockSize - -matchlen_bsf_16match_nolit_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_calcBlockSize - -matchlen_match8_match_nolit_calcBlockSize: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_calcBlockSize - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_calcBlockSize - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_calcBlockSize - -matchlen_bsf_8_match_nolit_calcBlockSize: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_calcBlockSize - -matchlen_match4_match_nolit_calcBlockSize: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_calcBlockSize - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_calcBlockSize - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_calcBlockSize: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_calcBlockSize - JB match_nolit_end_calcBlockSize - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_calcBlockSize - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_calcBlockSize - -matchlen_match1_match_nolit_calcBlockSize: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_calcBlockSize - LEAL 1(R9), R9 - -match_nolit_end_calcBlockSize: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy - CMPL BX, $0x00010000 - JB two_byte_offset_match_nolit_calcBlockSize - -four_bytes_loop_back_match_nolit_calcBlockSize: - CMPL R9, $0x40 - JBE four_bytes_remain_match_nolit_calcBlockSize - LEAL -64(R9), R9 - ADDQ $0x05, AX - CMPL R9, $0x04 - JB four_bytes_remain_match_nolit_calcBlockSize - JMP four_bytes_loop_back_match_nolit_calcBlockSize - -four_bytes_remain_match_nolit_calcBlockSize: - TESTL R9, R9 - JZ match_nolit_emitcopy_end_calcBlockSize - XORL BX, BX - 
ADDQ $0x05, AX - JMP match_nolit_emitcopy_end_calcBlockSize - -two_byte_offset_match_nolit_calcBlockSize: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_calcBlockSize - LEAL -60(R9), R9 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_calcBlockSize - -two_byte_offset_short_match_nolit_calcBlockSize: - MOVL R9, SI - SHLL $0x02, SI - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_calcBlockSize - CMPL BX, $0x00000800 - JAE emit_copy_three_match_nolit_calcBlockSize - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_calcBlockSize - -emit_copy_three_match_nolit_calcBlockSize: - ADDQ $0x03, AX - -match_nolit_emitcopy_end_calcBlockSize: - CMPL CX, 8(SP) - JAE emit_remainder_calcBlockSize - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_calcBlockSize - MOVQ $0x00000000, ret+24(FP) - RET - -match_nolit_dst_ok_calcBlockSize: - MOVQ $0x0000cf1bbcdcbf9b, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x10, DI - IMULQ R8, DI - SHRQ $0x33, DI - SHLQ $0x10, BX - IMULQ R8, BX - SHRQ $0x33, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_calcBlockSize - INCL CX - JMP search_loop_calcBlockSize - -emit_remainder_calcBlockSize: - MOVQ src_len+8(FP), CX - SUBL 12(SP), CX - LEAQ 5(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_calcBlockSize - MOVQ $0x00000000, ret+24(FP) - RET - -emit_remainder_ok_calcBlockSize: - MOVQ src_len+8(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_calcBlockSize - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), CX - CMPL CX, $0x3c - JB one_byte_emit_remainder_calcBlockSize - CMPL CX, $0x00000100 - JB two_bytes_emit_remainder_calcBlockSize - CMPL CX, $0x00010000 - JB three_bytes_emit_remainder_calcBlockSize - CMPL CX, $0x01000000 - JB four_bytes_emit_remainder_calcBlockSize - ADDQ $0x05, AX - JMP memmove_long_emit_remainder_calcBlockSize - 
-four_bytes_emit_remainder_calcBlockSize: - ADDQ $0x04, AX - JMP memmove_long_emit_remainder_calcBlockSize - -three_bytes_emit_remainder_calcBlockSize: - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_calcBlockSize - -two_bytes_emit_remainder_calcBlockSize: - ADDQ $0x02, AX - CMPL CX, $0x40 - JB memmove_emit_remainder_calcBlockSize - JMP memmove_long_emit_remainder_calcBlockSize - -one_byte_emit_remainder_calcBlockSize: - ADDQ $0x01, AX - -memmove_emit_remainder_calcBlockSize: - LEAQ (AX)(SI*1), AX - JMP emit_literal_done_emit_remainder_calcBlockSize - -memmove_long_emit_remainder_calcBlockSize: - LEAQ (AX)(SI*1), AX - -emit_literal_done_emit_remainder_calcBlockSize: - MOVQ AX, ret+24(FP) - RET - -// func calcBlockSizeSmall(src []byte) int -// Requires: BMI, SSE2 -TEXT ·calcBlockSizeSmall(SB), $2072-32 - XORQ AX, AX - MOVQ $0x00000010, CX - LEAQ 24(SP), DX - PXOR X0, X0 - -zero_loop_calcBlockSizeSmall: - MOVOU X0, (DX) - MOVOU X0, 16(DX) - MOVOU X0, 32(DX) - MOVOU X0, 48(DX) - MOVOU X0, 64(DX) - MOVOU X0, 80(DX) - MOVOU X0, 96(DX) - MOVOU X0, 112(DX) - ADDQ $0x80, DX - DECQ CX - JNZ zero_loop_calcBlockSizeSmall - MOVL $0x00000000, 12(SP) - MOVQ src_len+8(FP), CX - LEAQ -9(CX), DX - LEAQ -8(CX), BX - MOVL BX, 8(SP) - SHRQ $0x05, CX - SUBL CX, DX - LEAQ (AX)(DX*1), DX - MOVQ DX, (SP) - MOVL $0x00000001, CX - MOVL CX, 16(SP) - MOVQ src_base+0(FP), DX - -search_loop_calcBlockSizeSmall: - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 4(CX)(BX*1), BX - CMPL BX, 8(SP) - JAE emit_remainder_calcBlockSizeSmall - MOVQ (DX)(CX*1), SI - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R8 - MOVQ SI, R9 - MOVQ SI, R10 - SHRQ $0x08, R10 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x37, R9 - SHLQ $0x20, R10 - IMULQ R8, R10 - SHRQ $0x37, R10 - MOVL 24(SP)(R9*4), BX - MOVL 24(SP)(R10*4), DI - MOVL CX, 24(SP)(R9*4) - LEAL 1(CX), R9 - MOVL R9, 24(SP)(R10*4) - MOVQ SI, R9 - SHRQ $0x10, R9 - SHLQ $0x20, R9 - IMULQ R8, R9 - SHRQ $0x37, R9 - MOVL CX, R8 - SUBL 16(SP), R8 - MOVL 1(DX)(R8*1), R10 - 
MOVQ SI, R8 - SHRQ $0x08, R8 - CMPL R8, R10 - JNE no_repeat_found_calcBlockSizeSmall - LEAL 1(CX), SI - MOVL 12(SP), BX - MOVL SI, DI - SUBL 16(SP), DI - JZ repeat_extend_back_end_calcBlockSizeSmall - -repeat_extend_back_loop_calcBlockSizeSmall: - CMPL SI, BX - JBE repeat_extend_back_end_calcBlockSizeSmall - MOVB -1(DX)(DI*1), R8 - MOVB -1(DX)(SI*1), R9 - CMPB R8, R9 - JNE repeat_extend_back_end_calcBlockSizeSmall - LEAL -1(SI), SI - DECL DI - JNZ repeat_extend_back_loop_calcBlockSizeSmall - -repeat_extend_back_end_calcBlockSizeSmall: - MOVL 12(SP), BX - CMPL BX, SI - JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall - MOVL SI, DI - MOVL SI, 12(SP) - LEAQ (DX)(BX*1), R8 - SUBL BX, DI - LEAL -1(DI), BX - CMPL BX, $0x3c - JB one_byte_repeat_emit_calcBlockSizeSmall - CMPL BX, $0x00000100 - JB two_bytes_repeat_emit_calcBlockSizeSmall - JB three_bytes_repeat_emit_calcBlockSizeSmall - -three_bytes_repeat_emit_calcBlockSizeSmall: - ADDQ $0x03, AX - JMP memmove_long_repeat_emit_calcBlockSizeSmall - -two_bytes_repeat_emit_calcBlockSizeSmall: - ADDQ $0x02, AX - CMPL BX, $0x40 - JB memmove_repeat_emit_calcBlockSizeSmall - JMP memmove_long_repeat_emit_calcBlockSizeSmall - -one_byte_repeat_emit_calcBlockSizeSmall: - ADDQ $0x01, AX - -memmove_repeat_emit_calcBlockSizeSmall: - LEAQ (AX)(DI*1), AX - JMP emit_literal_done_repeat_emit_calcBlockSizeSmall - -memmove_long_repeat_emit_calcBlockSizeSmall: - LEAQ (AX)(DI*1), AX - -emit_literal_done_repeat_emit_calcBlockSizeSmall: - ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+8(FP), DI - SUBL CX, DI - LEAQ (DX)(CX*1), R8 - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R10, R10 - -matchlen_loopback_16_repeat_extend_calcBlockSizeSmall: - CMPL DI, $0x10 - JB matchlen_match8_repeat_extend_calcBlockSizeSmall - MOVQ (R8)(R10*1), R9 - MOVQ 8(R8)(R10*1), R11 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall - XORQ 8(BX)(R10*1), R11 - JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall - LEAL -16(DI), DI 
- LEAL 16(R10), R10 - JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall - -matchlen_bsf_16repeat_extend_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R11, R11 - -#else - BSFQ R11, R11 - -#endif - SARQ $0x03, R11 - LEAL 8(R10)(R11*1), R10 - JMP repeat_extend_forward_end_calcBlockSizeSmall - -matchlen_match8_repeat_extend_calcBlockSizeSmall: - CMPL DI, $0x08 - JB matchlen_match4_repeat_extend_calcBlockSizeSmall - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall - LEAL -8(DI), DI - LEAL 8(R10), R10 - JMP matchlen_match4_repeat_extend_calcBlockSizeSmall - -matchlen_bsf_8_repeat_extend_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R9, R9 - -#else - BSFQ R9, R9 - -#endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 - JMP repeat_extend_forward_end_calcBlockSizeSmall - -matchlen_match4_repeat_extend_calcBlockSizeSmall: - CMPL DI, $0x04 - JB matchlen_match2_repeat_extend_calcBlockSizeSmall - MOVL (R8)(R10*1), R9 - CMPL (BX)(R10*1), R9 - JNE matchlen_match2_repeat_extend_calcBlockSizeSmall - LEAL -4(DI), DI - LEAL 4(R10), R10 - -matchlen_match2_repeat_extend_calcBlockSizeSmall: - CMPL DI, $0x01 - JE matchlen_match1_repeat_extend_calcBlockSizeSmall - JB repeat_extend_forward_end_calcBlockSizeSmall - MOVW (R8)(R10*1), R9 - CMPW (BX)(R10*1), R9 - JNE matchlen_match1_repeat_extend_calcBlockSizeSmall - LEAL 2(R10), R10 - SUBL $0x02, DI - JZ repeat_extend_forward_end_calcBlockSizeSmall - -matchlen_match1_repeat_extend_calcBlockSizeSmall: - MOVB (R8)(R10*1), R9 - CMPB (BX)(R10*1), R9 - JNE repeat_extend_forward_end_calcBlockSizeSmall - LEAL 1(R10), R10 - -repeat_extend_forward_end_calcBlockSizeSmall: - ADDL R10, CX - MOVL CX, BX - SUBL SI, BX - MOVL 16(SP), SI - - // emitCopy -two_byte_offset_repeat_as_copy_calcBlockSizeSmall: - CMPL BX, $0x40 - JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall - LEAL -60(BX), BX - ADDQ $0x03, AX - JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall - 
-two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: - MOVL BX, SI - SHLL $0x02, SI - CMPL BX, $0x0c - JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall - ADDQ $0x02, AX - JMP repeat_end_emit_calcBlockSizeSmall - -emit_copy_three_repeat_as_copy_calcBlockSizeSmall: - ADDQ $0x03, AX - -repeat_end_emit_calcBlockSizeSmall: - MOVL CX, 12(SP) - JMP search_loop_calcBlockSizeSmall - -no_repeat_found_calcBlockSizeSmall: - CMPL (DX)(BX*1), SI - JEQ candidate_match_calcBlockSizeSmall - SHRQ $0x08, SI - MOVL 24(SP)(R9*4), BX - LEAL 2(CX), R8 - CMPL (DX)(DI*1), SI - JEQ candidate2_match_calcBlockSizeSmall - MOVL R8, 24(SP)(R9*4) - SHRQ $0x08, SI - CMPL (DX)(BX*1), SI - JEQ candidate3_match_calcBlockSizeSmall - MOVL 20(SP), CX - JMP search_loop_calcBlockSizeSmall - -candidate3_match_calcBlockSizeSmall: - ADDL $0x02, CX - JMP candidate_match_calcBlockSizeSmall - -candidate2_match_calcBlockSizeSmall: - MOVL R8, 24(SP)(R9*4) - INCL CX - MOVL DI, BX - -candidate_match_calcBlockSizeSmall: - MOVL 12(SP), SI - TESTL BX, BX - JZ match_extend_back_end_calcBlockSizeSmall - -match_extend_back_loop_calcBlockSizeSmall: - CMPL CX, SI - JBE match_extend_back_end_calcBlockSizeSmall - MOVB -1(DX)(BX*1), DI - MOVB -1(DX)(CX*1), R8 - CMPB DI, R8 - JNE match_extend_back_end_calcBlockSizeSmall - LEAL -1(CX), CX - DECL BX - JZ match_extend_back_end_calcBlockSizeSmall - JMP match_extend_back_loop_calcBlockSizeSmall - -match_extend_back_end_calcBlockSizeSmall: - MOVL CX, SI - SUBL 12(SP), SI - LEAQ 3(AX)(SI*1), SI - CMPQ SI, (SP) - JB match_dst_size_check_calcBlockSizeSmall - MOVQ $0x00000000, ret+24(FP) - RET - -match_dst_size_check_calcBlockSizeSmall: - MOVL CX, SI - MOVL 12(SP), DI - CMPL DI, SI - JEQ emit_literal_done_match_emit_calcBlockSizeSmall - MOVL SI, R8 - MOVL SI, 12(SP) - LEAQ (DX)(DI*1), SI - SUBL DI, R8 - LEAL -1(R8), SI - CMPL SI, $0x3c - JB one_byte_match_emit_calcBlockSizeSmall - CMPL SI, $0x00000100 - JB two_bytes_match_emit_calcBlockSizeSmall - JB 
three_bytes_match_emit_calcBlockSizeSmall - -three_bytes_match_emit_calcBlockSizeSmall: - ADDQ $0x03, AX - JMP memmove_long_match_emit_calcBlockSizeSmall - -two_bytes_match_emit_calcBlockSizeSmall: - ADDQ $0x02, AX - CMPL SI, $0x40 - JB memmove_match_emit_calcBlockSizeSmall - JMP memmove_long_match_emit_calcBlockSizeSmall - -one_byte_match_emit_calcBlockSizeSmall: - ADDQ $0x01, AX - -memmove_match_emit_calcBlockSizeSmall: - LEAQ (AX)(R8*1), AX - JMP emit_literal_done_match_emit_calcBlockSizeSmall - -memmove_long_match_emit_calcBlockSizeSmall: - LEAQ (AX)(R8*1), AX - -emit_literal_done_match_emit_calcBlockSizeSmall: -match_nolit_loop_calcBlockSizeSmall: - MOVL CX, SI - SUBL BX, SI - MOVL SI, 16(SP) - ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+8(FP), SI - SUBL CX, SI - LEAQ (DX)(CX*1), DI - LEAQ (DX)(BX*1), BX - - // matchLen - XORL R9, R9 - -matchlen_loopback_16_match_nolit_calcBlockSizeSmall: - CMPL SI, $0x10 - JB matchlen_match8_match_nolit_calcBlockSizeSmall - MOVQ (DI)(R9*1), R8 - MOVQ 8(DI)(R9*1), R10 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall - XORQ 8(BX)(R9*1), R10 - JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall - LEAL -16(SI), SI - LEAL 16(R9), R9 - JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall - -matchlen_bsf_16match_nolit_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R10, R10 - -#else - BSFQ R10, R10 - -#endif - SARQ $0x03, R10 - LEAL 8(R9)(R10*1), R9 - JMP match_nolit_end_calcBlockSizeSmall - -matchlen_match8_match_nolit_calcBlockSizeSmall: - CMPL SI, $0x08 - JB matchlen_match4_match_nolit_calcBlockSizeSmall - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall - LEAL -8(SI), SI - LEAL 8(R9), R9 - JMP matchlen_match4_match_nolit_calcBlockSizeSmall - -matchlen_bsf_8_match_nolit_calcBlockSizeSmall: -#ifdef GOAMD64_v3 - TZCNTQ R8, R8 - -#else - BSFQ R8, R8 - -#endif - SARQ $0x03, R8 - LEAL (R9)(R8*1), R9 - JMP match_nolit_end_calcBlockSizeSmall - 
-matchlen_match4_match_nolit_calcBlockSizeSmall: - CMPL SI, $0x04 - JB matchlen_match2_match_nolit_calcBlockSizeSmall - MOVL (DI)(R9*1), R8 - CMPL (BX)(R9*1), R8 - JNE matchlen_match2_match_nolit_calcBlockSizeSmall - LEAL -4(SI), SI - LEAL 4(R9), R9 - -matchlen_match2_match_nolit_calcBlockSizeSmall: - CMPL SI, $0x01 - JE matchlen_match1_match_nolit_calcBlockSizeSmall - JB match_nolit_end_calcBlockSizeSmall - MOVW (DI)(R9*1), R8 - CMPW (BX)(R9*1), R8 - JNE matchlen_match1_match_nolit_calcBlockSizeSmall - LEAL 2(R9), R9 - SUBL $0x02, SI - JZ match_nolit_end_calcBlockSizeSmall - -matchlen_match1_match_nolit_calcBlockSizeSmall: - MOVB (DI)(R9*1), R8 - CMPB (BX)(R9*1), R8 - JNE match_nolit_end_calcBlockSizeSmall - LEAL 1(R9), R9 - -match_nolit_end_calcBlockSizeSmall: - ADDL R9, CX - MOVL 16(SP), BX - ADDL $0x04, R9 - MOVL CX, 12(SP) - - // emitCopy -two_byte_offset_match_nolit_calcBlockSizeSmall: - CMPL R9, $0x40 - JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall - LEAL -60(R9), R9 - ADDQ $0x03, AX - JMP two_byte_offset_match_nolit_calcBlockSizeSmall - -two_byte_offset_short_match_nolit_calcBlockSizeSmall: - MOVL R9, BX - SHLL $0x02, BX - CMPL R9, $0x0c - JAE emit_copy_three_match_nolit_calcBlockSizeSmall - ADDQ $0x02, AX - JMP match_nolit_emitcopy_end_calcBlockSizeSmall - -emit_copy_three_match_nolit_calcBlockSizeSmall: - ADDQ $0x03, AX - -match_nolit_emitcopy_end_calcBlockSizeSmall: - CMPL CX, 8(SP) - JAE emit_remainder_calcBlockSizeSmall - MOVQ -2(DX)(CX*1), SI - CMPQ AX, (SP) - JB match_nolit_dst_ok_calcBlockSizeSmall - MOVQ $0x00000000, ret+24(FP) - RET - -match_nolit_dst_ok_calcBlockSizeSmall: - MOVQ $0x9e3779b1, R8 - MOVQ SI, DI - SHRQ $0x10, SI - MOVQ SI, BX - SHLQ $0x20, DI - IMULQ R8, DI - SHRQ $0x37, DI - SHLQ $0x20, BX - IMULQ R8, BX - SHRQ $0x37, BX - LEAL -2(CX), R8 - LEAQ 24(SP)(BX*4), R9 - MOVL (R9), BX - MOVL R8, 24(SP)(DI*4) - MOVL CX, (R9) - CMPL (DX)(BX*1), SI - JEQ match_nolit_loop_calcBlockSizeSmall - INCL CX - JMP 
search_loop_calcBlockSizeSmall - -emit_remainder_calcBlockSizeSmall: - MOVQ src_len+8(FP), CX - SUBL 12(SP), CX - LEAQ 3(AX)(CX*1), CX - CMPQ CX, (SP) - JB emit_remainder_ok_calcBlockSizeSmall - MOVQ $0x00000000, ret+24(FP) - RET - -emit_remainder_ok_calcBlockSizeSmall: - MOVQ src_len+8(FP), CX - MOVL 12(SP), BX - CMPL BX, CX - JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall - MOVL CX, SI - MOVL CX, 12(SP) - LEAQ (DX)(BX*1), CX - SUBL BX, SI - LEAL -1(SI), CX - CMPL CX, $0x3c - JB one_byte_emit_remainder_calcBlockSizeSmall - CMPL CX, $0x00000100 - JB two_bytes_emit_remainder_calcBlockSizeSmall - JB three_bytes_emit_remainder_calcBlockSizeSmall - -three_bytes_emit_remainder_calcBlockSizeSmall: - ADDQ $0x03, AX - JMP memmove_long_emit_remainder_calcBlockSizeSmall - -two_bytes_emit_remainder_calcBlockSizeSmall: - ADDQ $0x02, AX - CMPL CX, $0x40 - JB memmove_emit_remainder_calcBlockSizeSmall - JMP memmove_long_emit_remainder_calcBlockSizeSmall - -one_byte_emit_remainder_calcBlockSizeSmall: - ADDQ $0x01, AX - -memmove_emit_remainder_calcBlockSizeSmall: - LEAQ (AX)(SI*1), AX - JMP emit_literal_done_emit_remainder_calcBlockSizeSmall - -memmove_long_emit_remainder_calcBlockSizeSmall: - LEAQ (AX)(SI*1), AX - -emit_literal_done_emit_remainder_calcBlockSizeSmall: - MOVQ AX, ret+24(FP) - RET - -// func emitLiteral(dst []byte, lit []byte) int -// Requires: SSE2 -TEXT ·emitLiteral(SB), NOSPLIT, $0-56 - MOVQ lit_len+32(FP), DX - MOVQ dst_base+0(FP), AX - MOVQ lit_base+24(FP), CX - TESTQ DX, DX - JZ emit_literal_end_standalone_skip - MOVL DX, BX - LEAL -1(DX), SI - CMPL SI, $0x3c - JB one_byte_standalone - CMPL SI, $0x00000100 - JB two_bytes_standalone - CMPL SI, $0x00010000 - JB three_bytes_standalone - CMPL SI, $0x01000000 - JB four_bytes_standalone - MOVB $0xfc, (AX) - MOVL SI, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP memmove_long_standalone - -four_bytes_standalone: - MOVL SI, DI - SHRL $0x10, DI - MOVB $0xf8, (AX) - MOVW SI, 1(AX) - MOVB DI, 3(AX) - ADDQ $0x04, 
BX - ADDQ $0x04, AX - JMP memmove_long_standalone - -three_bytes_standalone: - MOVB $0xf4, (AX) - MOVW SI, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP memmove_long_standalone - -two_bytes_standalone: - MOVB $0xf0, (AX) - MOVB SI, 1(AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - CMPL SI, $0x40 - JB memmove_standalone - JMP memmove_long_standalone - -one_byte_standalone: - SHLB $0x02, SI - MOVB SI, (AX) - ADDQ $0x01, BX - ADDQ $0x01, AX - -memmove_standalone: - // genMemMoveShort - CMPQ DX, $0x03 - JB emit_lit_memmove_standalone_memmove_move_1or2 - JE emit_lit_memmove_standalone_memmove_move_3 - CMPQ DX, $0x08 - JB emit_lit_memmove_standalone_memmove_move_4through7 - CMPQ DX, $0x10 - JBE emit_lit_memmove_standalone_memmove_move_8through16 - CMPQ DX, $0x20 - JBE emit_lit_memmove_standalone_memmove_move_17through32 - JMP emit_lit_memmove_standalone_memmove_move_33through64 - -emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), SI - MOVB -1(CX)(DX*1), CL - MOVB SI, (AX) - MOVB CL, -1(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), SI - MOVB 2(CX), CL - MOVW SI, (AX) - MOVB CL, 2(AX) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_4through7: - MOVL (CX), SI - MOVL -4(CX)(DX*1), CX - MOVL SI, (AX) - MOVL CX, -4(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_8through16: - MOVQ (CX), SI - MOVQ -8(CX)(DX*1), CX - MOVQ SI, (AX) - MOVQ CX, -8(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_17through32: - MOVOU (CX), X0 - MOVOU -16(CX)(DX*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(DX*1) - JMP emit_literal_end_standalone - -emit_lit_memmove_standalone_memmove_move_33through64: - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP 
emit_literal_end_standalone - -memmove_long_standalone: - // genMemMoveLong - MOVOU (CX), X0 - MOVOU 16(CX), X1 - MOVOU -32(CX)(DX*1), X2 - MOVOU -16(CX)(DX*1), X3 - MOVQ DX, DI - SHRQ $0x05, DI - MOVQ AX, SI - ANDL $0x0000001f, SI - MOVQ $0x00000040, R8 - SUBQ SI, R8 - DECQ DI - JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - LEAQ -32(CX)(R8*1), SI - LEAQ -32(AX)(R8*1), R9 - -emit_lit_memmove_long_standalonelarge_big_loop_back: - MOVOU (SI), X4 - MOVOU 16(SI), X5 - MOVOA X4, (R9) - MOVOA X5, 16(R9) - ADDQ $0x20, R9 - ADDQ $0x20, SI - ADDQ $0x20, R8 - DECQ DI - JNA emit_lit_memmove_long_standalonelarge_big_loop_back - -emit_lit_memmove_long_standalonelarge_forward_sse_loop_32: - MOVOU -32(CX)(R8*1), X4 - MOVOU -16(CX)(R8*1), X5 - MOVOA X4, -32(AX)(R8*1) - MOVOA X5, -16(AX)(R8*1) - ADDQ $0x20, R8 - CMPQ DX, R8 - JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(DX*1) - MOVOU X3, -16(AX)(DX*1) - JMP emit_literal_end_standalone - JMP emit_literal_end_standalone - -emit_literal_end_standalone_skip: - XORQ BX, BX - -emit_literal_end_standalone: - MOVQ BX, ret+48(FP) - RET - -// func emitRepeat(dst []byte, offset int, length int) int -TEXT ·emitRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitRepeat -emit_repeat_again_standalone: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone - -cant_repeat_two_offset_standalone: - CMPL DX, $0x00000104 - JB repeat_three_standalone - CMPL DX, $0x00010100 - JB repeat_four_standalone - CMPL DX, $0x0100ffff - JB repeat_five_standalone - LEAL -16842747(DX), DX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone - -repeat_five_standalone: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW 
$0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_repeat_end - -repeat_four_standalone: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_repeat_end - -repeat_three_standalone: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_repeat_end - -repeat_two_standalone: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_repeat_end - -repeat_two_offset_standalone: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - -gen_emit_repeat_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopy(dst []byte, offset int, length int) int -TEXT ·emitCopy(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JB two_byte_offset_standalone - CMPL DX, $0x40 - JBE four_bytes_remain_standalone - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JB four_bytes_remain_standalone - - // emitRepeat -emit_repeat_again_standalone_emit_copy: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone_emit_copy - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone_emit_copy - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone_emit_copy - -cant_repeat_two_offset_standalone_emit_copy: - CMPL DX, $0x00000104 - JB repeat_three_standalone_emit_copy - CMPL DX, $0x00010100 - JB repeat_four_standalone_emit_copy - CMPL DX, $0x0100ffff - JB repeat_five_standalone_emit_copy - LEAL -16842747(DX), DX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy - -repeat_five_standalone_emit_copy: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 
2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -four_bytes_remain_standalone: - TESTL DX, DX - JZ gen_emit_copy_end - XORL SI, SI - LEAL -1(SI)(DX*4), DX - MOVB DL, (AX) - MOVL CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -two_byte_offset_standalone: - CMPL DX, $0x40 - JBE two_byte_offset_short_standalone - CMPL CX, $0x00000800 - JAE long_offset_short_standalone - MOVL $0x00000001, SI - LEAL 16(SI), SI - MOVB CL, 1(AX) - MOVL CX, DI - SHRL $0x08, DI - SHLL $0x05, DI - ORL DI, SI - MOVB SI, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - SUBL $0x08, DX - - // emitRepeat - LEAL -4(DX), DX - JMP cant_repeat_two_offset_standalone_emit_copy_short_2b - -emit_repeat_again_standalone_emit_copy_short_2b: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone_emit_copy_short_2b - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone_emit_copy_short_2b - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone_emit_copy_short_2b - -cant_repeat_two_offset_standalone_emit_copy_short_2b: - CMPL DX, $0x00000104 - JB repeat_three_standalone_emit_copy_short_2b - CMPL DX, $0x00010100 - JB repeat_four_standalone_emit_copy_short_2b - CMPL DX, $0x0100ffff - JB repeat_five_standalone_emit_copy_short_2b - LEAL -16842747(DX), DX - MOVL 
$0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy_short_2b - -repeat_five_standalone_emit_copy_short_2b: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy_short_2b: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy_short_2b: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy_short_2b: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy_short_2b: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -long_offset_short_standalone: - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - - // emitRepeat -emit_repeat_again_standalone_emit_copy_short: - MOVL DX, SI - LEAL -4(DX), DX - CMPL SI, $0x08 - JBE repeat_two_standalone_emit_copy_short - CMPL SI, $0x0c - JAE cant_repeat_two_offset_standalone_emit_copy_short - CMPL CX, $0x00000800 - JB repeat_two_offset_standalone_emit_copy_short - -cant_repeat_two_offset_standalone_emit_copy_short: - CMPL DX, $0x00000104 - JB repeat_three_standalone_emit_copy_short - CMPL DX, $0x00010100 - JB repeat_four_standalone_emit_copy_short - CMPL DX, $0x0100ffff - JB repeat_five_standalone_emit_copy_short - LEAL -16842747(DX), DX - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - ADDQ $0x05, BX - JMP emit_repeat_again_standalone_emit_copy_short - -repeat_five_standalone_emit_copy_short: - LEAL -65536(DX), DX - MOVL DX, CX - MOVW $0x001d, (AX) - MOVW DX, 2(AX) - 
SARL $0x10, CX - MOVB CL, 4(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end - -repeat_four_standalone_emit_copy_short: - LEAL -256(DX), DX - MOVW $0x0019, (AX) - MOVW DX, 2(AX) - ADDQ $0x04, BX - ADDQ $0x04, AX - JMP gen_emit_copy_end - -repeat_three_standalone_emit_copy_short: - LEAL -4(DX), DX - MOVW $0x0015, (AX) - MOVB DL, 2(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - JMP gen_emit_copy_end - -repeat_two_standalone_emit_copy_short: - SHLL $0x02, DX - ORL $0x01, DX - MOVW DX, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -repeat_two_offset_standalone_emit_copy_short: - XORQ SI, SI - LEAL 1(SI)(DX*4), DX - MOVB CL, 1(AX) - SARL $0x08, CX - SHLL $0x05, CX - ORL CX, DX - MOVB DL, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -two_byte_offset_short_standalone: - MOVL DX, SI - SHLL $0x02, SI - CMPL DX, $0x0c - JAE emit_copy_three_standalone - CMPL CX, $0x00000800 - JAE emit_copy_three_standalone - LEAL -15(SI), SI - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, SI - MOVB SI, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end - -emit_copy_three_standalone: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end: - MOVQ BX, ret+40(FP) - RET - -// func emitCopyNoRepeat(dst []byte, offset int, length int) int -TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 - XORQ BX, BX - MOVQ dst_base+0(FP), AX - MOVQ offset+24(FP), CX - MOVQ length+32(FP), DX - - // emitCopy - CMPL CX, $0x00010000 - JB two_byte_offset_standalone_snappy - -four_bytes_loop_back_standalone_snappy: - CMPL DX, $0x40 - JBE four_bytes_remain_standalone_snappy - MOVB $0xff, (AX) - MOVL CX, 1(AX) - LEAL -64(DX), DX - ADDQ $0x05, BX - ADDQ $0x05, AX - CMPL DX, $0x04 - JB four_bytes_remain_standalone_snappy - JMP four_bytes_loop_back_standalone_snappy - -four_bytes_remain_standalone_snappy: - TESTL DX, DX - JZ gen_emit_copy_end_snappy - XORL SI, SI - LEAL -1(SI)(DX*4), DX - MOVB DL, (AX) - MOVL 
CX, 1(AX) - ADDQ $0x05, BX - ADDQ $0x05, AX - JMP gen_emit_copy_end_snappy - -two_byte_offset_standalone_snappy: - CMPL DX, $0x40 - JBE two_byte_offset_short_standalone_snappy - MOVB $0xee, (AX) - MOVW CX, 1(AX) - LEAL -60(DX), DX - ADDQ $0x03, AX - ADDQ $0x03, BX - JMP two_byte_offset_standalone_snappy - -two_byte_offset_short_standalone_snappy: - MOVL DX, SI - SHLL $0x02, SI - CMPL DX, $0x0c - JAE emit_copy_three_standalone_snappy - CMPL CX, $0x00000800 - JAE emit_copy_three_standalone_snappy - LEAL -15(SI), SI - MOVB CL, 1(AX) - SHRL $0x08, CX - SHLL $0x05, CX - ORL CX, SI - MOVB SI, (AX) - ADDQ $0x02, BX - ADDQ $0x02, AX - JMP gen_emit_copy_end_snappy - -emit_copy_three_standalone_snappy: - LEAL -2(SI), SI - MOVB SI, (AX) - MOVW CX, 1(AX) - ADDQ $0x03, BX - ADDQ $0x03, AX - -gen_emit_copy_end_snappy: - MOVQ BX, ret+40(FP) - RET - -// func matchLen(a []byte, b []byte) int -// Requires: BMI -TEXT ·matchLen(SB), NOSPLIT, $0-56 - MOVQ a_base+0(FP), AX - MOVQ b_base+24(FP), CX - MOVQ a_len+8(FP), DX - - // matchLen - XORL SI, SI - -matchlen_loopback_16_standalone: - CMPL DX, $0x10 - JB matchlen_match8_standalone - MOVQ (AX)(SI*1), BX - MOVQ 8(AX)(SI*1), DI - XORQ (CX)(SI*1), BX - JNZ matchlen_bsf_8_standalone - XORQ 8(CX)(SI*1), DI - JNZ matchlen_bsf_16standalone - LEAL -16(DX), DX - LEAL 16(SI), SI - JMP matchlen_loopback_16_standalone - -matchlen_bsf_16standalone: -#ifdef GOAMD64_v3 - TZCNTQ DI, DI - -#else - BSFQ DI, DI - -#endif - SARQ $0x03, DI - LEAL 8(SI)(DI*1), SI - JMP gen_match_len_end - -matchlen_match8_standalone: - CMPL DX, $0x08 - JB matchlen_match4_standalone - MOVQ (AX)(SI*1), BX - XORQ (CX)(SI*1), BX - JNZ matchlen_bsf_8_standalone - LEAL -8(DX), DX - LEAL 8(SI), SI - JMP matchlen_match4_standalone - -matchlen_bsf_8_standalone: -#ifdef GOAMD64_v3 - TZCNTQ BX, BX - -#else - BSFQ BX, BX - -#endif - SARQ $0x03, BX - LEAL (SI)(BX*1), SI - JMP gen_match_len_end - -matchlen_match4_standalone: - CMPL DX, $0x04 - JB matchlen_match2_standalone - MOVL 
(AX)(SI*1), BX - CMPL (CX)(SI*1), BX - JNE matchlen_match2_standalone - LEAL -4(DX), DX - LEAL 4(SI), SI - -matchlen_match2_standalone: - CMPL DX, $0x01 - JE matchlen_match1_standalone - JB gen_match_len_end - MOVW (AX)(SI*1), BX - CMPW (CX)(SI*1), BX - JNE matchlen_match1_standalone - LEAL 2(SI), SI - SUBL $0x02, DX - JZ gen_match_len_end - -matchlen_match1_standalone: - MOVB (AX)(SI*1), BL - CMPB (CX)(SI*1), BL - JNE gen_match_len_end - LEAL 1(SI), SI - -gen_match_len_end: - MOVQ SI, ret+48(FP) - RET - -// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - LEAQ -10(AX)(CX*1), CX - XORQ DI, DI - -lz4_s2_loop: - CMPQ DX, BX - JAE lz4_s2_corrupt - CMPQ AX, CX - JAE lz4_s2_dstfull - MOVBQZX (DX), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x04, R9 - ANDQ $0x0f, R10 - CMPQ R8, $0xf0 - JB lz4_s2_ll_end - -lz4_s2_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4_s2_corrupt - MOVBQZX (DX), R8 - ADDQ R8, R9 - CMPQ R8, $0xff - JEQ lz4_s2_ll_loop - -lz4_s2_ll_end: - LEAQ (DX)(R9*1), R8 - ADDQ $0x04, R10 - CMPQ R8, BX - JAE lz4_s2_corrupt - INCQ DX - INCQ R8 - TESTQ R9, R9 - JZ lz4_s2_lits_done - LEAQ (AX)(R9*1), R11 - CMPQ R11, CX - JAE lz4_s2_dstfull - ADDQ R9, SI - LEAL -1(R9), R11 - CMPL R11, $0x3c - JB one_byte_lz4_s2 - CMPL R11, $0x00000100 - JB two_bytes_lz4_s2 - CMPL R11, $0x00010000 - JB three_bytes_lz4_s2 - CMPL R11, $0x01000000 - JB four_bytes_lz4_s2 - MOVB $0xfc, (AX) - MOVL R11, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4_s2 - -four_bytes_lz4_s2: - MOVL R11, R12 - SHRL $0x10, R12 - MOVB $0xf8, (AX) - MOVW R11, 1(AX) - MOVB R12, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4_s2 - -three_bytes_lz4_s2: - MOVB $0xf4, (AX) - MOVW R11, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4_s2 - -two_bytes_lz4_s2: - MOVB $0xf0, (AX) - MOVB R11, 1(AX) - ADDQ 
$0x02, AX - CMPL R11, $0x40 - JB memmove_lz4_s2 - JMP memmove_long_lz4_s2 - -one_byte_lz4_s2: - SHLB $0x02, R11 - MOVB R11, (AX) - ADDQ $0x01, AX - -memmove_lz4_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_lz4_s2_memmove_move_8 - CMPQ R9, $0x10 - JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 - JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 - -emit_lit_memmove_lz4_s2_memmove_move_8: - MOVQ (DX), R12 - MOVQ R12, (AX) - JMP memmove_end_copy_lz4_s2 - -emit_lit_memmove_lz4_s2_memmove_move_8through16: - MOVQ (DX), R12 - MOVQ -8(DX)(R9*1), DX - MOVQ R12, (AX) - MOVQ DX, -8(AX)(R9*1) - JMP memmove_end_copy_lz4_s2 - -emit_lit_memmove_lz4_s2_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_lz4_s2 - -emit_lit_memmove_lz4_s2_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_lz4_s2: - MOVQ R11, AX - JMP lz4_s2_lits_emit_done - -memmove_long_lz4_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R12 - ANDL $0x0000001f, R12 - MOVQ $0x00000040, R14 - SUBQ R12, R14 - DECQ R13 - JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 - LEAQ -32(DX)(R14*1), R12 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_lz4_s2large_big_loop_back: - MOVOU (R12), X4 - MOVOU 16(R12), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R12 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_lz4_s2large_big_loop_back - -emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: - MOVOU -32(DX)(R14*1), X4 - MOVOU -16(DX)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA 
X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R11, AX - -lz4_s2_lits_emit_done: - MOVQ R8, DX - -lz4_s2_lits_done: - CMPQ DX, BX - JNE lz4_s2_match - CMPQ R10, $0x04 - JEQ lz4_s2_done - JMP lz4_s2_corrupt - -lz4_s2_match: - LEAQ 2(DX), R8 - CMPQ R8, BX - JAE lz4_s2_corrupt - MOVWQZX (DX), R9 - MOVQ R8, DX - TESTQ R9, R9 - JZ lz4_s2_corrupt - CMPQ R9, SI - JA lz4_s2_corrupt - CMPQ R10, $0x13 - JNE lz4_s2_ml_done - -lz4_s2_ml_loop: - MOVBQZX (DX), R8 - INCQ DX - ADDQ R8, R10 - CMPQ DX, BX - JAE lz4_s2_corrupt - CMPQ R8, $0xff - JEQ lz4_s2_ml_loop - -lz4_s2_ml_done: - ADDQ R10, SI - CMPQ R9, DI - JNE lz4_s2_docopy - - // emitRepeat -emit_repeat_again_lz4_s2: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2 - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2 - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2 - -cant_repeat_two_offset_lz4_s2: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2 - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2 - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2 - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2 - -repeat_five_lz4_s2: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4_s2_loop - -repeat_four_lz4_s2: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4_s2_loop - -repeat_three_lz4_s2: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4_s2_loop - -repeat_two_lz4_s2: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -repeat_two_offset_lz4_s2: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP 
lz4_s2_loop - -lz4_s2_docopy: - MOVQ R9, DI - - // emitCopy - CMPL R10, $0x40 - JBE two_byte_offset_short_lz4_s2 - CMPL R9, $0x00000800 - JAE long_offset_short_lz4_s2 - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB R9, 1(AX) - MOVL R9, R11 - SHRL $0x08, R11 - SHLL $0x05, R11 - ORL R11, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - -emit_repeat_again_lz4_s2_emit_copy_short_2b: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short_2b - -cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short_2b - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short_2b - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short_2b - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short_2b - -repeat_five_lz4_s2_emit_copy_short_2b: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4_s2_loop - -repeat_four_lz4_s2_emit_copy_short_2b: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4_s2_loop - -repeat_three_lz4_s2_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4_s2_loop - -repeat_two_lz4_s2_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -long_offset_short_lz4_s2: - MOVB $0xee, (AX) - MOVW R9, 1(AX) - LEAL -60(R10), R10 - 
ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_lz4_s2_emit_copy_short: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2_emit_copy_short - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short - -cant_repeat_two_offset_lz4_s2_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short - -repeat_five_lz4_s2_emit_copy_short: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4_s2_loop - -repeat_four_lz4_s2_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4_s2_loop - -repeat_three_lz4_s2_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4_s2_loop - -repeat_two_lz4_s2_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -two_byte_offset_short_lz4_s2: - MOVL R10, R8 - SHLL $0x02, R8 - CMPL R10, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R9, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(R8), R8 - MOVB R9, 1(AX) - SHRL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - JMP lz4_s2_loop - -emit_copy_three_lz4_s2: - LEAL -2(R8), R8 - MOVB R8, (AX) - MOVW R9, 1(AX) - ADDQ $0x03, AX - JMP lz4_s2_loop - -lz4_s2_done: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - 
-lz4_s2_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4_s2_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - LEAQ -10(AX)(CX*1), CX - XORQ DI, DI - -lz4s_s2_loop: - CMPQ DX, BX - JAE lz4s_s2_corrupt - CMPQ AX, CX - JAE lz4s_s2_dstfull - MOVBQZX (DX), R8 - MOVQ R8, R9 - MOVQ R8, R10 - SHRQ $0x04, R9 - ANDQ $0x0f, R10 - CMPQ R8, $0xf0 - JB lz4s_s2_ll_end - -lz4s_s2_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4s_s2_corrupt - MOVBQZX (DX), R8 - ADDQ R8, R9 - CMPQ R8, $0xff - JEQ lz4s_s2_ll_loop - -lz4s_s2_ll_end: - LEAQ (DX)(R9*1), R8 - ADDQ $0x03, R10 - CMPQ R8, BX - JAE lz4s_s2_corrupt - INCQ DX - INCQ R8 - TESTQ R9, R9 - JZ lz4s_s2_lits_done - LEAQ (AX)(R9*1), R11 - CMPQ R11, CX - JAE lz4s_s2_dstfull - ADDQ R9, SI - LEAL -1(R9), R11 - CMPL R11, $0x3c - JB one_byte_lz4s_s2 - CMPL R11, $0x00000100 - JB two_bytes_lz4s_s2 - CMPL R11, $0x00010000 - JB three_bytes_lz4s_s2 - CMPL R11, $0x01000000 - JB four_bytes_lz4s_s2 - MOVB $0xfc, (AX) - MOVL R11, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4s_s2 - -four_bytes_lz4s_s2: - MOVL R11, R12 - SHRL $0x10, R12 - MOVB $0xf8, (AX) - MOVW R11, 1(AX) - MOVB R12, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4s_s2 - -three_bytes_lz4s_s2: - MOVB $0xf4, (AX) - MOVW R11, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4s_s2 - -two_bytes_lz4s_s2: - MOVB $0xf0, (AX) - MOVB R11, 1(AX) - ADDQ $0x02, AX - CMPL R11, $0x40 - JB memmove_lz4s_s2 - JMP memmove_long_lz4s_s2 - -one_byte_lz4s_s2: - SHLB $0x02, R11 - MOVB R11, (AX) - ADDQ $0x01, AX - -memmove_lz4s_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveShort - CMPQ R9, $0x08 - JBE emit_lit_memmove_lz4s_s2_memmove_move_8 - CMPQ R9, $0x10 - JBE 
emit_lit_memmove_lz4s_s2_memmove_move_8through16 - CMPQ R9, $0x20 - JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32 - JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64 - -emit_lit_memmove_lz4s_s2_memmove_move_8: - MOVQ (DX), R12 - MOVQ R12, (AX) - JMP memmove_end_copy_lz4s_s2 - -emit_lit_memmove_lz4s_s2_memmove_move_8through16: - MOVQ (DX), R12 - MOVQ -8(DX)(R9*1), DX - MOVQ R12, (AX) - MOVQ DX, -8(AX)(R9*1) - JMP memmove_end_copy_lz4s_s2 - -emit_lit_memmove_lz4s_s2_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R9*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R9*1) - JMP memmove_end_copy_lz4s_s2 - -emit_lit_memmove_lz4s_s2_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - -memmove_end_copy_lz4s_s2: - MOVQ R11, AX - JMP lz4s_s2_lits_emit_done - -memmove_long_lz4s_s2: - LEAQ (AX)(R9*1), R11 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R9*1), X2 - MOVOU -16(DX)(R9*1), X3 - MOVQ R9, R13 - SHRQ $0x05, R13 - MOVQ AX, R12 - ANDL $0x0000001f, R12 - MOVQ $0x00000040, R14 - SUBQ R12, R14 - DECQ R13 - JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 - LEAQ -32(DX)(R14*1), R12 - LEAQ -32(AX)(R14*1), R15 - -emit_lit_memmove_long_lz4s_s2large_big_loop_back: - MOVOU (R12), X4 - MOVOU 16(R12), X5 - MOVOA X4, (R15) - MOVOA X5, 16(R15) - ADDQ $0x20, R15 - ADDQ $0x20, R12 - ADDQ $0x20, R14 - DECQ R13 - JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back - -emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32: - MOVOU -32(DX)(R14*1), X4 - MOVOU -16(DX)(R14*1), X5 - MOVOA X4, -32(AX)(R14*1) - MOVOA X5, -16(AX)(R14*1) - ADDQ $0x20, R14 - CMPQ R9, R14 - JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R9*1) - MOVOU X3, -16(AX)(R9*1) - MOVQ R11, AX - -lz4s_s2_lits_emit_done: - MOVQ R8, DX - -lz4s_s2_lits_done: - 
CMPQ DX, BX - JNE lz4s_s2_match - CMPQ R10, $0x03 - JEQ lz4s_s2_done - JMP lz4s_s2_corrupt - -lz4s_s2_match: - CMPQ R10, $0x03 - JEQ lz4s_s2_loop - LEAQ 2(DX), R8 - CMPQ R8, BX - JAE lz4s_s2_corrupt - MOVWQZX (DX), R9 - MOVQ R8, DX - TESTQ R9, R9 - JZ lz4s_s2_corrupt - CMPQ R9, SI - JA lz4s_s2_corrupt - CMPQ R10, $0x12 - JNE lz4s_s2_ml_done - -lz4s_s2_ml_loop: - MOVBQZX (DX), R8 - INCQ DX - ADDQ R8, R10 - CMPQ DX, BX - JAE lz4s_s2_corrupt - CMPQ R8, $0xff - JEQ lz4s_s2_ml_loop - -lz4s_s2_ml_done: - ADDQ R10, SI - CMPQ R9, DI - JNE lz4s_s2_docopy - - // emitRepeat -emit_repeat_again_lz4_s2: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2 - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2 - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2 - -cant_repeat_two_offset_lz4_s2: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2 - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2 - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2 - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2 - -repeat_five_lz4_s2: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4s_s2_loop - -repeat_four_lz4_s2: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4s_s2_loop - -repeat_three_lz4_s2: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -repeat_two_lz4_s2: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -repeat_two_offset_lz4_s2: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -lz4s_s2_docopy: - MOVQ R9, DI - - // emitCopy - CMPL R10, $0x40 - JBE two_byte_offset_short_lz4_s2 - CMPL R9, $0x00000800 - JAE long_offset_short_lz4_s2 - MOVL $0x00000001, R8 - LEAL 16(R8), R8 - MOVB R9, 1(AX) - 
MOVL R9, R11 - SHRL $0x08, R11 - SHLL $0x05, R11 - ORL R11, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - SUBL $0x08, R10 - - // emitRepeat - LEAL -4(R10), R10 - JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - -emit_repeat_again_lz4_s2_emit_copy_short_2b: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short_2b - CMPL R8, $0x0c - JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short_2b - -cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short_2b - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short_2b - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short_2b - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short_2b - -repeat_five_lz4_s2_emit_copy_short_2b: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4s_s2_loop - -repeat_four_lz4_s2_emit_copy_short_2b: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4s_s2_loop - -repeat_three_lz4_s2_emit_copy_short_2b: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -repeat_two_lz4_s2_emit_copy_short_2b: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short_2b: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -long_offset_short_lz4_s2: - MOVB $0xee, (AX) - MOVW R9, 1(AX) - LEAL -60(R10), R10 - ADDQ $0x03, AX - - // emitRepeat -emit_repeat_again_lz4_s2_emit_copy_short: - MOVL R10, R8 - LEAL -4(R10), R10 - CMPL R8, $0x08 - JBE repeat_two_lz4_s2_emit_copy_short - CMPL R8, $0x0c - JAE 
cant_repeat_two_offset_lz4_s2_emit_copy_short - CMPL R9, $0x00000800 - JB repeat_two_offset_lz4_s2_emit_copy_short - -cant_repeat_two_offset_lz4_s2_emit_copy_short: - CMPL R10, $0x00000104 - JB repeat_three_lz4_s2_emit_copy_short - CMPL R10, $0x00010100 - JB repeat_four_lz4_s2_emit_copy_short - CMPL R10, $0x0100ffff - JB repeat_five_lz4_s2_emit_copy_short - LEAL -16842747(R10), R10 - MOVL $0xfffb001d, (AX) - MOVB $0xff, 4(AX) - ADDQ $0x05, AX - JMP emit_repeat_again_lz4_s2_emit_copy_short - -repeat_five_lz4_s2_emit_copy_short: - LEAL -65536(R10), R10 - MOVL R10, R9 - MOVW $0x001d, (AX) - MOVW R10, 2(AX) - SARL $0x10, R9 - MOVB R9, 4(AX) - ADDQ $0x05, AX - JMP lz4s_s2_loop - -repeat_four_lz4_s2_emit_copy_short: - LEAL -256(R10), R10 - MOVW $0x0019, (AX) - MOVW R10, 2(AX) - ADDQ $0x04, AX - JMP lz4s_s2_loop - -repeat_three_lz4_s2_emit_copy_short: - LEAL -4(R10), R10 - MOVW $0x0015, (AX) - MOVB R10, 2(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -repeat_two_lz4_s2_emit_copy_short: - SHLL $0x02, R10 - ORL $0x01, R10 - MOVW R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -repeat_two_offset_lz4_s2_emit_copy_short: - XORQ R8, R8 - LEAL 1(R8)(R10*4), R10 - MOVB R9, 1(AX) - SARL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R10 - MOVB R10, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -two_byte_offset_short_lz4_s2: - MOVL R10, R8 - SHLL $0x02, R8 - CMPL R10, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R9, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(R8), R8 - MOVB R9, 1(AX) - SHRL $0x08, R9 - SHLL $0x05, R9 - ORL R9, R8 - MOVB R8, (AX) - ADDQ $0x02, AX - JMP lz4s_s2_loop - -emit_copy_three_lz4_s2: - LEAL -2(R8), R8 - MOVB R8, (AX) - MOVW R9, 1(AX) - ADDQ $0x03, AX - JMP lz4s_s2_loop - -lz4s_s2_done: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - -lz4s_s2_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4s_s2_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -// 
func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - LEAQ -10(AX)(CX*1), CX - -lz4_snappy_loop: - CMPQ DX, BX - JAE lz4_snappy_corrupt - CMPQ AX, CX - JAE lz4_snappy_dstfull - MOVBQZX (DX), DI - MOVQ DI, R8 - MOVQ DI, R9 - SHRQ $0x04, R8 - ANDQ $0x0f, R9 - CMPQ DI, $0xf0 - JB lz4_snappy_ll_end - -lz4_snappy_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4_snappy_corrupt - MOVBQZX (DX), DI - ADDQ DI, R8 - CMPQ DI, $0xff - JEQ lz4_snappy_ll_loop - -lz4_snappy_ll_end: - LEAQ (DX)(R8*1), DI - ADDQ $0x04, R9 - CMPQ DI, BX - JAE lz4_snappy_corrupt - INCQ DX - INCQ DI - TESTQ R8, R8 - JZ lz4_snappy_lits_done - LEAQ (AX)(R8*1), R10 - CMPQ R10, CX - JAE lz4_snappy_dstfull - ADDQ R8, SI - LEAL -1(R8), R10 - CMPL R10, $0x3c - JB one_byte_lz4_snappy - CMPL R10, $0x00000100 - JB two_bytes_lz4_snappy - CMPL R10, $0x00010000 - JB three_bytes_lz4_snappy - CMPL R10, $0x01000000 - JB four_bytes_lz4_snappy - MOVB $0xfc, (AX) - MOVL R10, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4_snappy - -four_bytes_lz4_snappy: - MOVL R10, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW R10, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4_snappy - -three_bytes_lz4_snappy: - MOVB $0xf4, (AX) - MOVW R10, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4_snappy - -two_bytes_lz4_snappy: - MOVB $0xf0, (AX) - MOVB R10, 1(AX) - ADDQ $0x02, AX - CMPL R10, $0x40 - JB memmove_lz4_snappy - JMP memmove_long_lz4_snappy - -one_byte_lz4_snappy: - SHLB $0x02, R10 - MOVB R10, (AX) - ADDQ $0x01, AX - -memmove_lz4_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_lz4_snappy_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16 - CMPQ R8, $0x20 - JBE 
emit_lit_memmove_lz4_snappy_memmove_move_17through32 - JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64 - -emit_lit_memmove_lz4_snappy_memmove_move_8: - MOVQ (DX), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_lz4_snappy - -emit_lit_memmove_lz4_snappy_memmove_move_8through16: - MOVQ (DX), R11 - MOVQ -8(DX)(R8*1), DX - MOVQ R11, (AX) - MOVQ DX, -8(AX)(R8*1) - JMP memmove_end_copy_lz4_snappy - -emit_lit_memmove_lz4_snappy_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP memmove_end_copy_lz4_snappy - -emit_lit_memmove_lz4_snappy_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_lz4_snappy: - MOVQ R10, AX - JMP lz4_snappy_lits_emit_done - -memmove_long_lz4_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 - LEAQ -32(DX)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_lz4_snappylarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back - -emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32: - MOVOU -32(DX)(R13*1), X4 - MOVOU -16(DX)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ R10, AX - -lz4_snappy_lits_emit_done: - MOVQ DI, DX - -lz4_snappy_lits_done: - CMPQ DX, BX - 
JNE lz4_snappy_match - CMPQ R9, $0x04 - JEQ lz4_snappy_done - JMP lz4_snappy_corrupt - -lz4_snappy_match: - LEAQ 2(DX), DI - CMPQ DI, BX - JAE lz4_snappy_corrupt - MOVWQZX (DX), R8 - MOVQ DI, DX - TESTQ R8, R8 - JZ lz4_snappy_corrupt - CMPQ R8, SI - JA lz4_snappy_corrupt - CMPQ R9, $0x13 - JNE lz4_snappy_ml_done - -lz4_snappy_ml_loop: - MOVBQZX (DX), DI - INCQ DX - ADDQ DI, R9 - CMPQ DX, BX - JAE lz4_snappy_corrupt - CMPQ DI, $0xff - JEQ lz4_snappy_ml_loop - -lz4_snappy_ml_done: - ADDQ R9, SI - - // emitCopy -two_byte_offset_lz4_s2: - CMPL R9, $0x40 - JBE two_byte_offset_short_lz4_s2 - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - CMPQ AX, CX - JAE lz4_snappy_loop - JMP two_byte_offset_lz4_s2 - -two_byte_offset_short_lz4_s2: - MOVL R9, DI - SHLL $0x02, DI - CMPL R9, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R8, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(DI), DI - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP lz4_snappy_loop - -emit_copy_three_lz4_s2: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP lz4_snappy_loop - -lz4_snappy_done: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - -lz4_snappy_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4_snappy_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) -// Requires: SSE2 -TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64 - XORQ SI, SI - MOVQ dst_base+0(FP), AX - MOVQ dst_len+8(FP), CX - MOVQ src_base+24(FP), DX - MOVQ src_len+32(FP), BX - LEAQ (DX)(BX*1), BX - LEAQ -10(AX)(CX*1), CX - -lz4s_snappy_loop: - CMPQ DX, BX - JAE lz4s_snappy_corrupt - CMPQ AX, CX - JAE lz4s_snappy_dstfull - MOVBQZX (DX), DI - MOVQ DI, R8 - MOVQ DI, R9 - SHRQ $0x04, R8 - ANDQ $0x0f, R9 - CMPQ DI, $0xf0 - JB lz4s_snappy_ll_end - 
-lz4s_snappy_ll_loop: - INCQ DX - CMPQ DX, BX - JAE lz4s_snappy_corrupt - MOVBQZX (DX), DI - ADDQ DI, R8 - CMPQ DI, $0xff - JEQ lz4s_snappy_ll_loop - -lz4s_snappy_ll_end: - LEAQ (DX)(R8*1), DI - ADDQ $0x03, R9 - CMPQ DI, BX - JAE lz4s_snappy_corrupt - INCQ DX - INCQ DI - TESTQ R8, R8 - JZ lz4s_snappy_lits_done - LEAQ (AX)(R8*1), R10 - CMPQ R10, CX - JAE lz4s_snappy_dstfull - ADDQ R8, SI - LEAL -1(R8), R10 - CMPL R10, $0x3c - JB one_byte_lz4s_snappy - CMPL R10, $0x00000100 - JB two_bytes_lz4s_snappy - CMPL R10, $0x00010000 - JB three_bytes_lz4s_snappy - CMPL R10, $0x01000000 - JB four_bytes_lz4s_snappy - MOVB $0xfc, (AX) - MOVL R10, 1(AX) - ADDQ $0x05, AX - JMP memmove_long_lz4s_snappy - -four_bytes_lz4s_snappy: - MOVL R10, R11 - SHRL $0x10, R11 - MOVB $0xf8, (AX) - MOVW R10, 1(AX) - MOVB R11, 3(AX) - ADDQ $0x04, AX - JMP memmove_long_lz4s_snappy - -three_bytes_lz4s_snappy: - MOVB $0xf4, (AX) - MOVW R10, 1(AX) - ADDQ $0x03, AX - JMP memmove_long_lz4s_snappy - -two_bytes_lz4s_snappy: - MOVB $0xf0, (AX) - MOVB R10, 1(AX) - ADDQ $0x02, AX - CMPL R10, $0x40 - JB memmove_lz4s_snappy - JMP memmove_long_lz4s_snappy - -one_byte_lz4s_snappy: - SHLB $0x02, R10 - MOVB R10, (AX) - ADDQ $0x01, AX - -memmove_lz4s_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveShort - CMPQ R8, $0x08 - JBE emit_lit_memmove_lz4s_snappy_memmove_move_8 - CMPQ R8, $0x10 - JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16 - CMPQ R8, $0x20 - JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32 - JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64 - -emit_lit_memmove_lz4s_snappy_memmove_move_8: - MOVQ (DX), R11 - MOVQ R11, (AX) - JMP memmove_end_copy_lz4s_snappy - -emit_lit_memmove_lz4s_snappy_memmove_move_8through16: - MOVQ (DX), R11 - MOVQ -8(DX)(R8*1), DX - MOVQ R11, (AX) - MOVQ DX, -8(AX)(R8*1) - JMP memmove_end_copy_lz4s_snappy - -emit_lit_memmove_lz4s_snappy_memmove_move_17through32: - MOVOU (DX), X0 - MOVOU -16(DX)(R8*1), X1 - MOVOU X0, (AX) - MOVOU X1, -16(AX)(R8*1) - JMP 
memmove_end_copy_lz4s_snappy - -emit_lit_memmove_lz4s_snappy_memmove_move_33through64: - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - -memmove_end_copy_lz4s_snappy: - MOVQ R10, AX - JMP lz4s_snappy_lits_emit_done - -memmove_long_lz4s_snappy: - LEAQ (AX)(R8*1), R10 - - // genMemMoveLong - MOVOU (DX), X0 - MOVOU 16(DX), X1 - MOVOU -32(DX)(R8*1), X2 - MOVOU -16(DX)(R8*1), X3 - MOVQ R8, R12 - SHRQ $0x05, R12 - MOVQ AX, R11 - ANDL $0x0000001f, R11 - MOVQ $0x00000040, R13 - SUBQ R11, R13 - DECQ R12 - JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 - LEAQ -32(DX)(R13*1), R11 - LEAQ -32(AX)(R13*1), R14 - -emit_lit_memmove_long_lz4s_snappylarge_big_loop_back: - MOVOU (R11), X4 - MOVOU 16(R11), X5 - MOVOA X4, (R14) - MOVOA X5, 16(R14) - ADDQ $0x20, R14 - ADDQ $0x20, R11 - ADDQ $0x20, R13 - DECQ R12 - JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back - -emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32: - MOVOU -32(DX)(R13*1), X4 - MOVOU -16(DX)(R13*1), X5 - MOVOA X4, -32(AX)(R13*1) - MOVOA X5, -16(AX)(R13*1) - ADDQ $0x20, R13 - CMPQ R8, R13 - JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(R8*1) - MOVOU X3, -16(AX)(R8*1) - MOVQ R10, AX - -lz4s_snappy_lits_emit_done: - MOVQ DI, DX - -lz4s_snappy_lits_done: - CMPQ DX, BX - JNE lz4s_snappy_match - CMPQ R9, $0x03 - JEQ lz4s_snappy_done - JMP lz4s_snappy_corrupt - -lz4s_snappy_match: - CMPQ R9, $0x03 - JEQ lz4s_snappy_loop - LEAQ 2(DX), DI - CMPQ DI, BX - JAE lz4s_snappy_corrupt - MOVWQZX (DX), R8 - MOVQ DI, DX - TESTQ R8, R8 - JZ lz4s_snappy_corrupt - CMPQ R8, SI - JA lz4s_snappy_corrupt - CMPQ R9, $0x12 - JNE lz4s_snappy_ml_done - -lz4s_snappy_ml_loop: - MOVBQZX (DX), DI - INCQ DX - ADDQ DI, R9 - CMPQ DX, BX - JAE lz4s_snappy_corrupt - CMPQ DI, $0xff - JEQ lz4s_snappy_ml_loop - -lz4s_snappy_ml_done: 
- ADDQ R9, SI - - // emitCopy -two_byte_offset_lz4_s2: - CMPL R9, $0x40 - JBE two_byte_offset_short_lz4_s2 - MOVB $0xee, (AX) - MOVW R8, 1(AX) - LEAL -60(R9), R9 - ADDQ $0x03, AX - CMPQ AX, CX - JAE lz4s_snappy_loop - JMP two_byte_offset_lz4_s2 - -two_byte_offset_short_lz4_s2: - MOVL R9, DI - SHLL $0x02, DI - CMPL R9, $0x0c - JAE emit_copy_three_lz4_s2 - CMPL R8, $0x00000800 - JAE emit_copy_three_lz4_s2 - LEAL -15(DI), DI - MOVB R8, 1(AX) - SHRL $0x08, R8 - SHLL $0x05, R8 - ORL R8, DI - MOVB DI, (AX) - ADDQ $0x02, AX - JMP lz4s_snappy_loop - -emit_copy_three_lz4_s2: - LEAL -2(DI), DI - MOVB DI, (AX) - MOVW R8, 1(AX) - ADDQ $0x03, AX - JMP lz4s_snappy_loop - -lz4s_snappy_done: - MOVQ dst_base+0(FP), CX - SUBQ CX, AX - MOVQ SI, uncompressed+48(FP) - MOVQ AX, dstUsed+56(FP) - RET - -lz4s_snappy_corrupt: - XORQ AX, AX - LEAQ -1(AX), SI - MOVQ SI, uncompressed+48(FP) - RET - -lz4s_snappy_dstfull: - XORQ AX, AX - LEAQ -2(AX), SI - MOVQ SI, uncompressed+48(FP) - RET -- cgit v1.2.3