| author | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
|---|---|---|
| committer | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
| commit | 404aeae4545d2426c089a5f8d5e82dae56f5212b (patch) | |
| tree | 2d84e00af272b39fc04f3795ae06bc48970e57b5 /vendor/github.com/minio/md5-simd/block8_amd64.s | |
| parent | 209d8b0187ed025dec9ac149ebcced3462877bff (diff) | |
| download | gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.tar.gz gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.zip | |
Make Nix builds work
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block8_amd64.s')
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | vendor/github.com/minio/md5-simd/block8_amd64.s | 281 |

1 file changed, 281 insertions, 0 deletions
```diff
diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s
new file mode 100644
index 0000000..f57db17
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block8_amd64.s
@@ -0,0 +1,281 @@
```
```asm
//+build !noasm,!appengine,gc

// Copyright (c) 2018 Igneous Systems
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// This is the AVX2 implementation of the MD5 block function (8-way parallel)

// block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
TEXT ·block8(SB), 4, $0-40
    MOVQ state+0(FP), BX
    MOVQ base+8(FP), SI
    MOVQ bufs+16(FP), AX
    MOVQ cache+24(FP), CX
    MOVQ n+32(FP), DX
    MOVQ ·avx256md5consts+0(SB), DI

    // Align cache (which is stack allocated by the compiler)
    // to a 256 bit boundary (ymm register alignment)
    // The cache8 type is deliberately oversized to permit this.
    ADDQ $31, CX
    ANDB $-32, CL

#define a Y0
#define b Y1
#define c Y2
#define d Y3

#define sa Y4
#define sb Y5
#define sc Y6
#define sd Y7

#define tmp Y8
#define tmp2 Y9

#define mask Y10
#define off Y11

#define ones Y12

#define rtmp1 Y13
#define rtmp2 Y14

#define mem Y15

#define dig BX
#define cache CX
#define count DX
#define base SI
#define consts DI

#define prepmask \
    VPXOR mask, mask, mask \
    VPCMPGTD mask, off, mask

#define prep(index) \
    VMOVAPD mask, rtmp2 \
    VPGATHERDD rtmp2, index*4(base)(off*1), mem

#define load(index) \
    VMOVAPD index*32(cache), mem

#define store(index) \
    VMOVAPD mem, index*32(cache)

#define roll(shift, a) \
    VPSLLD $shift, a, rtmp1 \
    VPSRLD $32-shift, a, a \
    VPOR rtmp1, a, a

#define ROUND1(a, b, c, d, index, const, shift) \
    VPXOR c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    prep(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

#define ROUND1load(a, b, c, d, index, const, shift) \
    VXORPD c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    load(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

#define ROUND2(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp2, tmp2 \
    VANDNPD c, tmp, tmp \
    load(index) \
    VPOR tmp, tmp2, tmp2 \
    VMOVAPD c, tmp \
    VPADDD tmp2, a, a \
    VMOVAPD c, tmp2 \
    roll(shift,a) \
    VPADDD b, a, a

#define ROUND3(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    load(index) \
    VPXOR d, tmp, tmp \
    VPXOR b, tmp, tmp \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD b, tmp \
    VPADDD b, a, a

#define ROUND4(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPOR b, tmp, tmp \
    VPXOR c, tmp, tmp \
    VPADDD tmp, a, a \
    load(index) \
    roll(shift,a) \
    VPXOR c, ones, tmp \
    VPADDD b, a, a

    // load digest into state registers
    VMOVUPD (dig), a
    VMOVUPD 32(dig), b
    VMOVUPD 64(dig), c
    VMOVUPD 96(dig), d

    // load source buffer offsets
    VMOVUPD (AX), off

    prepmask
    VPCMPEQD ones, ones, ones

loop:
    VMOVAPD a, sa
    VMOVAPD b, sb
    VMOVAPD c, sc
    VMOVAPD d, sd

    prep(0)
    VMOVAPD d, tmp
    store(0)

    ROUND1(a,b,c,d, 1,0x00, 7)
    store(1)
    ROUND1(d,a,b,c, 2,0x01,12)
    store(2)
    ROUND1(c,d,a,b, 3,0x02,17)
    store(3)
    ROUND1(b,c,d,a, 4,0x03,22)
    store(4)
    ROUND1(a,b,c,d, 5,0x04, 7)
    store(5)
    ROUND1(d,a,b,c, 6,0x05,12)
    store(6)
    ROUND1(c,d,a,b, 7,0x06,17)
    store(7)
    ROUND1(b,c,d,a, 8,0x07,22)
    store(8)
    ROUND1(a,b,c,d, 9,0x08, 7)
    store(9)
    ROUND1(d,a,b,c,10,0x09,12)
    store(10)
    ROUND1(c,d,a,b,11,0x0a,17)
    store(11)
    ROUND1(b,c,d,a,12,0x0b,22)
    store(12)
    ROUND1(a,b,c,d,13,0x0c, 7)
    store(13)
    ROUND1(d,a,b,c,14,0x0d,12)
    store(14)
    ROUND1(c,d,a,b,15,0x0e,17)
    store(15)
    ROUND1load(b,c,d,a, 1,0x0f,22)

    VMOVAPD d, tmp
    VMOVAPD d, tmp2

    ROUND2(a,b,c,d, 6,0x10, 5)
    ROUND2(d,a,b,c,11,0x11, 9)
    ROUND2(c,d,a,b, 0,0x12,14)
    ROUND2(b,c,d,a, 5,0x13,20)
    ROUND2(a,b,c,d,10,0x14, 5)
    ROUND2(d,a,b,c,15,0x15, 9)
    ROUND2(c,d,a,b, 4,0x16,14)
    ROUND2(b,c,d,a, 9,0x17,20)
    ROUND2(a,b,c,d,14,0x18, 5)
    ROUND2(d,a,b,c, 3,0x19, 9)
    ROUND2(c,d,a,b, 8,0x1a,14)
    ROUND2(b,c,d,a,13,0x1b,20)
    ROUND2(a,b,c,d, 2,0x1c, 5)
    ROUND2(d,a,b,c, 7,0x1d, 9)
    ROUND2(c,d,a,b,12,0x1e,14)
    ROUND2(b,c,d,a, 0,0x1f,20)

    load(5)
    VMOVAPD c, tmp

    ROUND3(a,b,c,d, 8,0x20, 4)
    ROUND3(d,a,b,c,11,0x21,11)
    ROUND3(c,d,a,b,14,0x22,16)
    ROUND3(b,c,d,a, 1,0x23,23)
    ROUND3(a,b,c,d, 4,0x24, 4)
    ROUND3(d,a,b,c, 7,0x25,11)
    ROUND3(c,d,a,b,10,0x26,16)
    ROUND3(b,c,d,a,13,0x27,23)
    ROUND3(a,b,c,d, 0,0x28, 4)
    ROUND3(d,a,b,c, 3,0x29,11)
    ROUND3(c,d,a,b, 6,0x2a,16)
    ROUND3(b,c,d,a, 9,0x2b,23)
    ROUND3(a,b,c,d,12,0x2c, 4)
    ROUND3(d,a,b,c,15,0x2d,11)
    ROUND3(c,d,a,b, 2,0x2e,16)
    ROUND3(b,c,d,a, 0,0x2f,23)

    load(0)
    VPXOR d, ones, tmp

    ROUND4(a,b,c,d, 7,0x30, 6)
    ROUND4(d,a,b,c,14,0x31,10)
    ROUND4(c,d,a,b, 5,0x32,15)
    ROUND4(b,c,d,a,12,0x33,21)
    ROUND4(a,b,c,d, 3,0x34, 6)
    ROUND4(d,a,b,c,10,0x35,10)
    ROUND4(c,d,a,b, 1,0x36,15)
    ROUND4(b,c,d,a, 8,0x37,21)
    ROUND4(a,b,c,d,15,0x38, 6)
    ROUND4(d,a,b,c, 6,0x39,10)
    ROUND4(c,d,a,b,13,0x3a,15)
    ROUND4(b,c,d,a, 4,0x3b,21)
    ROUND4(a,b,c,d,11,0x3c, 6)
    ROUND4(d,a,b,c, 2,0x3d,10)
    ROUND4(c,d,a,b, 9,0x3e,15)
    ROUND4(b,c,d,a, 0,0x3f,21)

    VPADDD sa, a, a
    VPADDD sb, b, b
    VPADDD sc, c, c
    VPADDD sd, d, d

    LEAQ 64(base), base
    SUBQ $64, count
    JNE loop

    VMOVUPD a, (dig)
    VMOVUPD b, 32(dig)
    VMOVUPD c, 64(dig)
    VMOVUPD d, 96(dig)

    VZEROUPPER
    RET
```
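A `TEXT ·block8(SB), 4, $0-40` definition (the `4` is `NOSPLIT` from `textflag.h`) pairs with a body-less Go declaration in the same package; the `$0-40` frame size and the `+0(FP) … +32(FP)` loads in the prologue correspond to five 8-byte arguments and no locals. Below is a minimal, hypothetical sketch of such a stub, taking the pointer types from the signature comment in the assembly; the actual declaration shipped by md5-simd may differ in detail.

```go
// Hypothetical sketch of the Go-side stub this assembly routine pairs with;
// the md5-simd package ships its own declaration, which may differ.
package md5simd

// block8 has no Go body: the linker resolves it against TEXT ·block8(SB) in
// block8_amd64.s. Frame layout on amd64, matching $0-40 and the FP offsets
// read in the prologue (five 8-byte arguments, no locals):
//
//   state +0(FP)  -> BX  pointer to the digest state for the eight lanes
//   base  +8(FP)  -> SI  base address the per-lane buffer offsets are relative to
//   bufs  +16(FP) -> AX  pointer to 8 int32 offsets, one per lane
//   cache +24(FP) -> CX  scratch area, aligned up to 32 bytes before use
//   n     +32(FP) -> DX  byte count; the loop consumes 64 bytes per iteration
//
//go:noescape
func block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
```

Marking the stub `//go:noescape` lets the compiler assume the assembly does not retain the pointers it is handed, which matters here because the comment in the prologue notes that `cache` is stack allocated by the compiler.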
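Two smaller details are worth spelling out: the digest loads at the top of the routine suggest a lane-major, transposed state layout, and the scratch pointer is rounded up to a 32-byte boundary before use. A hedged Go illustration of both follows; the type and function names are mine, not part of md5-simd's API.

```go
// Hypothetical illustration of the memory layout the routine appears to expect;
// these names are not part of md5-simd's API.
package md5simd

// Transposed digest state: all eight lanes' A words first (32 bytes), then the
// B, C and D words. This is why the code loads (dig), 32(dig), 64(dig) and
// 96(dig) into four YMM registers: one VMOVUPD picks up the same MD5 word for
// all eight lanes at once.
type state8 [4][8]uint32

// Per-lane inputs: lane i reads its message starting at base+offsets[i], and
// the VPGATHERDD in prep(index) fetches message dword `index` from all eight
// lanes in a single instruction (address = base + offsets[lane] + index*4).
type input8 struct {
	base    uintptr
	offsets [8]int32
}

// alignUp32 mirrors the prologue's ADDQ $31 / ANDB $-32 sequence: round an
// address up to the next 32-byte (YMM) boundary. Masking only the low byte
// (CL) suffices in the assembly because alignment depends only on the low
// five bits.
func alignUp32(p uintptr) uintptr { return (p + 31) &^ 31 }
```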