diff options
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block16_amd64.s')
| -rw-r--r-- | vendor/github.com/minio/md5-simd/block16_amd64.s | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/vendor/github.com/minio/md5-simd/block16_amd64.s b/vendor/github.com/minio/md5-simd/block16_amd64.s new file mode 100644 index 0000000..be0a43a --- /dev/null +++ b/vendor/github.com/minio/md5-simd/block16_amd64.s | |||
| @@ -0,0 +1,228 @@ | |||
| 1 | // Copyright (c) 2020 MinIO Inc. All rights reserved. | ||
| 2 | // Use of this source code is governed by a license that can be | ||
| 3 | // found in the LICENSE file. | ||
| 4 | |||
| 5 | //+build !noasm,!appengine,gc | ||
| 6 | |||
| 7 | // This is the AVX512 implementation of the MD5 block function (16-way parallel) | ||
| 8 | |||
// prep(index): gather message word `index` for every lane into `mem`.
// Each lane reads the 32-bit value addressed by index*4(base)(ptrs*1), i.e.
// base + ptrs[lane] + index*4. `base`, `ptrs`, `mem`, `kmask` and `ktmp` are
// register aliases #defined further down in this file.
// The mask is copied into ktmp before every gather so that kmask itself is
// never handed to VPGATHERDD (AVX-512 gathers clear their mask operand as
// lanes complete - see the Intel SDM entry for VPGATHERDD).
| 9 | #define prep(index) \ | ||
| 10 | KMOVQ kmask, ktmp \ | ||
| 11 | VPGATHERDD index*4(base)(ptrs*1), ktmp, mem | ||
| 12 | |||
// ROUND1(a, b, c, d, index, const, shift): one MD5 round-1 step,
//   a = b + rotl32(a + F(b,c,d) + K[const] + mem, shift)
// evaluated on all lanes at once.
//   - Entry invariant: tmp holds d. XORing c into tmp and applying the
//     ternary-logic table 0x6C over (b, d, tmp) yields d ^ (b & (c ^ d)),
//     which is F(b,c,d) = (b & c) | (~b & d).
//   - mem holds the message word for THIS step (gathered by an earlier
//     prep). Once it has been added, prep(index) starts gathering the word
//     for the NEXT step, so `index` names the next step's word.
//   - K[const] is the 64-byte vector at offset 64*const from `consts`.
//   - Exit: tmp = c, which is the d argument of the following (rotated)
//     invocation, so the entry invariant holds again.
| 13 | #define ROUND1(a, b, c, d, index, const, shift) \ | ||
| 14 | VPXORQ c, tmp, tmp \ | ||
| 15 | VPADDD 64*const(consts), a, a \ | ||
| 16 | VPADDD mem, a, a \ | ||
| 17 | VPTERNLOGD $0x6C, b, d, tmp \ | ||
| 18 | prep(index) \ | ||
| 19 | VPADDD tmp, a, a \ | ||
| 20 | VPROLD $shift, a, a \ | ||
| 21 | VMOVAPD c, tmp \ | ||
| 22 | VPADDD b, a, a | ||
| 23 | |||
// ROUND1noload(a, b, c, d, const, shift): same computation as ROUND1
// (a = b + rotl32(a + F(b,c,d) + K[const] + mem, shift), tmp = d on entry,
// tmp = c on exit) but without the prep() gather. It is used for the final
// round-1 step, after which no further message word has to be fetched.
| 24 | #define ROUND1noload(a, b, c, d, const, shift) \ | ||
| 25 | VPXORQ c, tmp, tmp \ | ||
| 26 | VPADDD 64*const(consts), a, a \ | ||
| 27 | VPADDD mem, a, a \ | ||
| 28 | VPTERNLOGD $0x6C, b, d, tmp \ | ||
| 29 | VPADDD tmp, a, a \ | ||
| 30 | VPROLD $shift, a, a \ | ||
| 31 | VMOVAPD c, tmp \ | ||
| 32 | VPADDD b, a, a | ||
| 33 | |||
// ROUND2(a, b, c, d, zreg, const, shift): one MD5 round-2 step,
//   a = b + rotl32(a + G(b,c,d) + K[const] + zreg, shift)
// where zreg is the register holding the cached message word.
//   - Entry invariant: tmp and tmp2 both hold d (the d parameter itself is
//     not referenced in the body).
//   - VANDNPD turns tmp into (~d & c); the ternary-logic table 0xEC over
//     (b, tmp, tmp2) then yields tmp | (tmp2 & b) = (c & ~d) | (b & d),
//     which is G(b,c,d).
//   - Exit: tmp = tmp2 = c, the d argument of the following (rotated)
//     invocation.
| 34 | #define ROUND2(a, b, c, d, zreg, const, shift) \ | ||
| 35 | VPADDD 64*const(consts), a, a \ | ||
| 36 | VPADDD zreg, a, a \ | ||
| 37 | VANDNPD c, tmp, tmp \ | ||
| 38 | VPTERNLOGD $0xEC, b, tmp, tmp2 \ | ||
| 39 | VMOVAPD c, tmp \ | ||
| 40 | VPADDD tmp2, a, a \ | ||
| 41 | VMOVAPD c, tmp2 \ | ||
| 42 | VPROLD $shift, a, a \ | ||
| 43 | VPADDD b, a, a | ||
| 44 | |||
// ROUND3(a, b, c, d, zreg, const, shift): one MD5 round-3 step,
//   a = b + rotl32(a + H(b,c,d) + K[const] + zreg, shift)
//   - Entry invariant: tmp holds c (the c parameter itself is not
//     referenced in the body).
//   - Ternary-logic table 0x96 is a three-input XOR, so tmp becomes
//     b ^ c ^ d = H(b,c,d).
//   - Exit: tmp = b, which is the c argument of the following (rotated)
//     invocation.
| 45 | #define ROUND3(a, b, c, d, zreg, const, shift) \ | ||
| 46 | VPADDD 64*const(consts), a, a \ | ||
| 47 | VPADDD zreg, a, a \ | ||
| 48 | VPTERNLOGD $0x96, b, d, tmp \ | ||
| 49 | VPADDD tmp, a, a \ | ||
| 50 | VPROLD $shift, a, a \ | ||
| 51 | VMOVAPD b, tmp \ | ||
| 52 | VPADDD b, a, a | ||
| 53 | |||
// ROUND4(a, b, c, d, zreg, const, shift): one MD5 round-4 step,
//   a = b + rotl32(a + I(b,c,d) + K[const] + zreg, shift)
//   - Entry invariant: tmp holds ~d (the d parameter itself is not
//     referenced in the body).
//   - Ternary-logic table 0x36 over (b, c, tmp) yields c ^ (b | tmp),
//     i.e. c ^ (b | ~d) = I(b,c,d).
//   - Exit: tmp = c ^ ones = ~c, the complement of the d argument of the
//     following (rotated) invocation. `ones` is the all-bits-set register
//     alias #defined further down in this file.
| 54 | #define ROUND4(a, b, c, d, zreg, const, shift) \ | ||
| 55 | VPADDD 64*const(consts), a, a \ | ||
| 56 | VPADDD zreg, a, a \ | ||
| 57 | VPTERNLOGD $0x36, b, c, tmp \ | ||
| 58 | VPADDD tmp, a, a \ | ||
| 59 | VPROLD $shift, a, a \ | ||
| 60 | VPXORQ c, ones, tmp \ | ||
| 61 | VPADDD b, a, a | ||
| 62 | |||
// block16(state, base, ptrs, mask, n): 16-lane MD5 block function.
// Declared with 40 bytes of arguments and no local frame ($0-40); the flag
// value 4 corresponds to NOSPLIT in Go's textflag.h.
//   state+0(FP) -> BX: pointer to the digest state, kept as four 64-byte
//                      vectors a, b, c, d at offsets 0x00/0x40/0x80/0xc0
//   base+8(FP)  -> SI: base address the per-lane offsets are added to
//   ptrs+16(FP) -> AX: pointer to 64 bytes that are loaded into Z10 and used
//                      as the gather index vector (one 32-bit offset per lane)
//   mask+24(FP) -> K1: lane mask applied (via a copy) to every gather
//   n+32(FP)    -> DX: byte count, decremented by 64 per loop iteration
// NOTE(review): the loop body runs before the count is tested and exits only
// when the count reaches exactly zero (SUBQ $64 / JNE), so n is assumed to be
// a non-zero multiple of 64 - confirm the Go caller guarantees this.
| 63 | TEXT ·block16(SB), 4, $0-40 | ||
| 64 | |||
| 65 | MOVQ state+0(FP), BX | ||
| 66 | MOVQ base+8(FP), SI | ||
| 67 | MOVQ ptrs+16(FP), AX | ||
| 68 | KMOVQ mask+24(FP), K1 | ||
| 69 | MOVQ n+32(FP), DX | ||
// DI receives the first 8 bytes stored AT ·avx512md5consts (a load, not an
// address-of); presumably that is the data pointer of a Go slice holding the
// constant table - confirm against the Go declaration.
| 70 | MOVQ ·avx512md5consts+0(SB), DI | ||
| 71 | |||
// Working state: each ZMM register holds one MD5 state word for all lanes.
| 72 | #define a Z0 | ||
| 73 | #define b Z1 | ||
| 74 | #define c Z2 | ||
| 75 | #define d Z3 | ||
| 76 | |||
// Snapshot of the state taken at the top of each block and added back at
// the end of it.
| 77 | #define sa Z4 | ||
| 78 | #define sb Z5 | ||
| 79 | #define sc Z6 | ||
| 80 | #define sd Z7 | ||
| 81 | |||
// Scratch and helper registers used by the prep/ROUNDn macros.
| 82 | #define tmp Z8 | ||
| 83 | #define tmp2 Z9 | ||
| 84 | #define ptrs Z10 | ||
| 85 | #define ones Z12 | ||
| 86 | #define mem Z15 | ||
| 87 | |||
// kmask is the caller-supplied lane mask; ktmp is the per-gather copy.
| 88 | #define kmask K1 | ||
| 89 | #define ktmp K3 | ||
| 90 | |||
| 91 | // ---------------------------------------------------------- | ||
| 92 | // Registers Z16 through to Z31 are used for caching purposes | ||
| 93 | // ---------------------------------------------------------- | ||
| 94 | |||
// General-purpose register aliases for the values loaded from the frame.
| 95 | #define dig BX | ||
| 96 | #define count DX | ||
| 97 | #define base SI | ||
| 98 | #define consts DI | ||
| 99 | |||
| 100 | // load digest into state registers | ||
| 101 | VMOVUPD (dig), a | ||
| 102 | VMOVUPD 0x40(dig), b | ||
| 103 | VMOVUPD 0x80(dig), c | ||
| 104 | VMOVUPD 0xc0(dig), d | ||
| 105 | |||
| 106 | // load source pointers | ||
| 107 | VMOVUPD 0x00(AX), ptrs | ||
| 108 | |||
// ones = all bits set; ROUND4 XORs against it to form bitwise complements.
// AX can be reused here because ptrs has already been loaded through it.
| 109 | MOVQ $-1, AX | ||
| 110 | VPBROADCASTQ AX, ones | ||
| 111 | |||
// One iteration per 64-byte block: snapshot the state first so it can be
// added back after the 64 steps.
| 112 | loop: | ||
| 113 | VMOVAPD a, sa | ||
| 114 | VMOVAPD b, sb | ||
| 115 | VMOVAPD c, sc | ||
| 116 | VMOVAPD d, sd | ||
| 117 | |||
// Gather word 0, seed tmp with d (ROUND1's entry invariant) and cache
// word 0 in Z16.
| 118 | prep(0) | ||
| 119 | VMOVAPD d, tmp | ||
| 120 | VMOVAPD mem, Z16 | ||
| 121 | |||
// Round 1. Each step consumes the word currently in mem and gathers the
// next one, which is then cached so that word i ends up in Z(16+i) for use
// by rounds 2-4.
| 122 | ROUND1(a,b,c,d, 1,0x00, 7) | ||
| 123 | VMOVAPD mem, Z17 | ||
| 124 | ROUND1(d,a,b,c, 2,0x01,12) | ||
| 125 | VMOVAPD mem, Z18 | ||
| 126 | ROUND1(c,d,a,b, 3,0x02,17) | ||
| 127 | VMOVAPD mem, Z19 | ||
| 128 | ROUND1(b,c,d,a, 4,0x03,22) | ||
| 129 | VMOVAPD mem, Z20 | ||
| 130 | ROUND1(a,b,c,d, 5,0x04, 7) | ||
| 131 | VMOVAPD mem, Z21 | ||
| 132 | ROUND1(d,a,b,c, 6,0x05,12) | ||
| 133 | VMOVAPD mem, Z22 | ||
| 134 | ROUND1(c,d,a,b, 7,0x06,17) | ||
| 135 | VMOVAPD mem, Z23 | ||
| 136 | ROUND1(b,c,d,a, 8,0x07,22) | ||
| 137 | VMOVAPD mem, Z24 | ||
| 138 | ROUND1(a,b,c,d, 9,0x08, 7) | ||
| 139 | VMOVAPD mem, Z25 | ||
| 140 | ROUND1(d,a,b,c,10,0x09,12) | ||
| 141 | VMOVAPD mem, Z26 | ||
| 142 | ROUND1(c,d,a,b,11,0x0a,17) | ||
| 143 | VMOVAPD mem, Z27 | ||
| 144 | ROUND1(b,c,d,a,12,0x0b,22) | ||
| 145 | VMOVAPD mem, Z28 | ||
| 146 | ROUND1(a,b,c,d,13,0x0c, 7) | ||
| 147 | VMOVAPD mem, Z29 | ||
| 148 | ROUND1(d,a,b,c,14,0x0d,12) | ||
| 149 | VMOVAPD mem, Z30 | ||
| 150 | ROUND1(c,d,a,b,15,0x0e,17) | ||
| 151 | VMOVAPD mem, Z31 | ||
| 152 | |||
// Last round-1 step: word 15 is already in mem, nothing further to gather.
| 153 | ROUND1noload(b,c,d,a, 0x0f,22) | ||
| 154 | |||
// Round 2 entry invariant: tmp = tmp2 = d.
| 155 | VMOVAPD d, tmp | ||
| 156 | VMOVAPD d, tmp2 | ||
| 157 | |||
| 158 | ROUND2(a,b,c,d, Z17,0x10, 5) | ||
| 159 | ROUND2(d,a,b,c, Z22,0x11, 9) | ||
| 160 | ROUND2(c,d,a,b, Z27,0x12,14) | ||
| 161 | ROUND2(b,c,d,a, Z16,0x13,20) | ||
| 162 | ROUND2(a,b,c,d, Z21,0x14, 5) | ||
| 163 | ROUND2(d,a,b,c, Z26,0x15, 9) | ||
| 164 | ROUND2(c,d,a,b, Z31,0x16,14) | ||
| 165 | ROUND2(b,c,d,a, Z20,0x17,20) | ||
| 166 | ROUND2(a,b,c,d, Z25,0x18, 5) | ||
| 167 | ROUND2(d,a,b,c, Z30,0x19, 9) | ||
| 168 | ROUND2(c,d,a,b, Z19,0x1a,14) | ||
| 169 | ROUND2(b,c,d,a, Z24,0x1b,20) | ||
| 170 | ROUND2(a,b,c,d, Z29,0x1c, 5) | ||
| 171 | ROUND2(d,a,b,c, Z18,0x1d, 9) | ||
| 172 | ROUND2(c,d,a,b, Z23,0x1e,14) | ||
| 173 | ROUND2(b,c,d,a, Z28,0x1f,20) | ||
| 174 | |||
// Round 3 entry invariant: tmp = c.
| 175 | VMOVAPD c, tmp | ||
| 176 | |||
| 177 | ROUND3(a,b,c,d, Z21,0x20, 4) | ||
| 178 | ROUND3(d,a,b,c, Z24,0x21,11) | ||
| 179 | ROUND3(c,d,a,b, Z27,0x22,16) | ||
| 180 | ROUND3(b,c,d,a, Z30,0x23,23) | ||
| 181 | ROUND3(a,b,c,d, Z17,0x24, 4) | ||
| 182 | ROUND3(d,a,b,c, Z20,0x25,11) | ||
| 183 | ROUND3(c,d,a,b, Z23,0x26,16) | ||
| 184 | ROUND3(b,c,d,a, Z26,0x27,23) | ||
| 185 | ROUND3(a,b,c,d, Z29,0x28, 4) | ||
| 186 | ROUND3(d,a,b,c, Z16,0x29,11) | ||
| 187 | ROUND3(c,d,a,b, Z19,0x2a,16) | ||
| 188 | ROUND3(b,c,d,a, Z22,0x2b,23) | ||
| 189 | ROUND3(a,b,c,d, Z25,0x2c, 4) | ||
| 190 | ROUND3(d,a,b,c, Z28,0x2d,11) | ||
| 191 | ROUND3(c,d,a,b, Z31,0x2e,16) | ||
| 192 | ROUND3(b,c,d,a, Z18,0x2f,23) | ||
| 193 | |||
// Round 4 entry invariant: tmp = ~d (d XOR all-ones).
| 194 | VPXORQ d, ones, tmp | ||
| 195 | |||
| 196 | ROUND4(a,b,c,d, Z16,0x30, 6) | ||
| 197 | ROUND4(d,a,b,c, Z23,0x31,10) | ||
| 198 | ROUND4(c,d,a,b, Z30,0x32,15) | ||
| 199 | ROUND4(b,c,d,a, Z21,0x33,21) | ||
| 200 | ROUND4(a,b,c,d, Z28,0x34, 6) | ||
| 201 | ROUND4(d,a,b,c, Z19,0x35,10) | ||
| 202 | ROUND4(c,d,a,b, Z26,0x36,15) | ||
| 203 | ROUND4(b,c,d,a, Z17,0x37,21) | ||
| 204 | ROUND4(a,b,c,d, Z24,0x38, 6) | ||
| 205 | ROUND4(d,a,b,c, Z31,0x39,10) | ||
| 206 | ROUND4(c,d,a,b, Z22,0x3a,15) | ||
| 207 | ROUND4(b,c,d,a, Z29,0x3b,21) | ||
| 208 | ROUND4(a,b,c,d, Z20,0x3c, 6) | ||
| 209 | ROUND4(d,a,b,c, Z27,0x3d,10) | ||
| 210 | ROUND4(c,d,a,b, Z18,0x3e,15) | ||
| 211 | ROUND4(b,c,d,a, Z25,0x3f,21) | ||
| 212 | |||
// Add the state snapshot taken at the top of the block back in.
| 213 | VPADDD sa, a, a | ||
| 214 | VPADDD sb, b, b | ||
| 215 | VPADDD sc, c, c | ||
| 216 | VPADDD sd, d, d | ||
| 217 | |||
// Advance every lane to its next 64-byte block (base is shared by all
// gathers) and repeat until the byte count reaches zero.
| 218 | LEAQ 64(base), base | ||
| 219 | SUBQ $64, count | ||
| 220 | JNE loop | ||
| 221 | |||
// Write the updated digest back using the same layout it was loaded from.
| 222 | VMOVUPD a, (dig) | ||
| 223 | VMOVUPD b, 0x40(dig) | ||
| 224 | VMOVUPD c, 0x80(dig) | ||
| 225 | VMOVUPD d, 0xc0(dig) | ||
| 226 | |||
// Clear the upper vector state before returning so that non-AVX code in the
// caller does not pay AVX/SSE transition penalties.
| 227 | VZEROUPPER | ||
| 228 | RET | ||