author		Rutger Broekhoff	2023-12-29 21:31:53 +0100
committer	Rutger Broekhoff	2023-12-29 21:31:53 +0100
commit		404aeae4545d2426c089a5f8d5e82dae56f5212b (patch)
tree		2d84e00af272b39fc04f3795ae06bc48970e57b5 /vendor/github.com/minio/md5-simd/block8_amd64.s
parent		209d8b0187ed025dec9ac149ebcced3462877bff (diff)
download	gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.tar.gz gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.zip

Make Nix builds work

Diffstat (limited to 'vendor/github.com/minio/md5-simd/block8_amd64.s')

-rw-r--r--	vendor/github.com/minio/md5-simd/block8_amd64.s	281
1 file changed, 281 insertions, 0 deletions

diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s
new file mode 100644
index 0000000..f57db17
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block8_amd64.s
@@ -0,0 +1,281 @@
//+build !noasm,!appengine,gc

// Copyright (c) 2018 Igneous Systems
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// This is the AVX2 implementation of the MD5 block function (8-way parallel)

// block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
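//
// As a hedged illustration only (this file does not contain it), the Go-side
// stub that such a routine pairs with would look roughly like:
//
//	//go:noescape
//	func block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
//
// Each of the 8 lanes hashes an independent input: bufs supplies 8 int32
// offsets relative to base, and n is the number of bytes to hash per lane.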
TEXT ·block8(SB), 4, $0-40
	MOVQ state+0(FP), BX
	MOVQ base+8(FP), SI
	MOVQ bufs+16(FP), AX
	MOVQ cache+24(FP), CX
	MOVQ n+32(FP), DX
	MOVQ ·avx256md5consts+0(SB), DI

	// Align cache (which is stack allocated by the compiler)
	// to a 256 bit boundary (ymm register alignment)
	// The cache8 type is deliberately oversized to permit this.
	ADDQ $31, CX
	ANDB $-32, CL
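	// This is the standard align-up idiom; in Go terms, roughly
	// cache = (cache + 31) &^ 31. Masking only the low byte (CL) suffices,
	// since 32-byte alignment only concerns the pointer's low 5 bits.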

#define a Y0
#define b Y1
#define c Y2
#define d Y3

#define sa Y4
#define sb Y5
#define sc Y6
#define sd Y7

#define tmp  Y8
#define tmp2 Y9

#define mask Y10
#define off  Y11

#define ones Y12

#define rtmp1 Y13
#define rtmp2 Y14

#define mem Y15

#define dig    BX
#define cache  CX
#define count  DX
#define base   SI
#define consts DI

#define prepmask \
	VPXOR    mask, mask, mask \
	VPCMPGTD mask, off, mask

#define prep(index) \
	VMOVAPD    mask, rtmp2 \
	VPGATHERDD rtmp2, index*4(base)(off*1), mem
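// prep gathers the next 32-bit message word for all 8 lanes with a single
// VPGATHERDD from base+off+index*4; the lane mask is copied to rtmp2 first
// because VPGATHERDD clears its mask register as it completes.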

#define load(index) \
	VMOVAPD index*32(cache), mem

#define store(index) \
	VMOVAPD mem, index*32(cache)
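// The gathered words for the current block are spilled to the 32-byte-aligned
// cache, so rounds 2-4 can reload each word with an aligned VMOVAPD instead of
// issuing another (much slower) gather.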

#define roll(shift, a) \
	VPSLLD $shift, a, rtmp1 \
	VPSRLD $32-shift, a, a  \
	VPOR   rtmp1, a, a
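// roll rotates each 32-bit lane of a left by a constant shift:
// (a << shift) | (a >> (32-shift)), i.e. a per-lane bits.RotateLeft32.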

#define ROUND1(a, b, c, d, index, const, shift) \
	VPXOR   c, tmp, tmp            \
	VPADDD  32*const(consts), a, a \
	VPADDD  mem, a, a              \
	VPAND   b, tmp, tmp            \
	VPXOR   d, tmp, tmp            \
	prep(index)                    \
	VPADDD  tmp, a, a              \
	roll(shift,a)                  \
	VMOVAPD c, tmp                 \
	VPADDD  b, a, a
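// ROUND1 is one step of MD5 round 1. With tmp holding d on entry, the
// XOR/AND/XOR sequence computes F(b,c,d) = d ^ (b & (c ^ d)), the branch-free
// form of (b & c) | (~b & d); prep(index) starts the gather of a later
// message word so it overlaps with the arithmetic.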

#define ROUND1load(a, b, c, d, index, const, shift) \
	VXORPD  c, tmp, tmp            \
	VPADDD  32*const(consts), a, a \
	VPADDD  mem, a, a              \
	VPAND   b, tmp, tmp            \
	VPXOR   d, tmp, tmp            \
	load(index)                    \
	VPADDD  tmp, a, a              \
	roll(shift,a)                  \
	VMOVAPD c, tmp                 \
	VPADDD  b, a, a
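// ROUND1load is the last round-1 step: identical to ROUND1, except that it
// reloads the word needed by the first round-2 step from the cache, since all
// 16 words of the block have been gathered by this point.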

#define ROUND2(a, b, c, d, index, const, shift) \
	VPADDD  32*const(consts), a, a \
	VPADDD  mem, a, a              \
	VPAND   b, tmp2, tmp2          \
	VANDNPD c, tmp, tmp            \
	load(index)                    \
	VPOR    tmp, tmp2, tmp2        \
	VMOVAPD c, tmp                 \
	VPADDD  tmp2, a, a             \
	VMOVAPD c, tmp2                \
	roll(shift,a)                  \
	VPADDD  b, a, a
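// ROUND2 is one step of MD5 round 2. With tmp and tmp2 holding d on entry,
// the VPAND/VANDNPD/VPOR sequence computes G(b,c,d) = (d & b) | (~d & c).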

#define ROUND3(a, b, c, d, index, const, shift) \
	VPADDD  32*const(consts), a, a \
	VPADDD  mem, a, a              \
	load(index)                    \
	VPXOR   d, tmp, tmp            \
	VPXOR   b, tmp, tmp            \
	VPADDD  tmp, a, a              \
	roll(shift,a)                  \
	VMOVAPD b, tmp                 \
	VPADDD  b, a, a
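// ROUND3 is one step of MD5 round 3. With tmp holding c on entry, the two
// XORs compute H(b,c,d) = b ^ c ^ d.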

#define ROUND4(a, b, c, d, index, const, shift) \
	VPADDD 32*const(consts), a, a \
	VPADDD mem, a, a              \
	VPOR   b, tmp, tmp            \
	VPXOR  c, tmp, tmp            \
	VPADDD tmp, a, a              \
	load(index)                   \
	roll(shift,a)                 \
	VPXOR  c, ones, tmp           \
	VPADDD b, a, a
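// ROUND4 is one step of MD5 round 4. With tmp holding ~d on entry, it
// computes I(b,c,d) = c ^ (b | ~d); the trailing VPXOR against ones
// pre-computes the ~d needed by the next step.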

	// load digest into state registers
	VMOVUPD (dig), a
	VMOVUPD 32(dig), b
	VMOVUPD 64(dig), c
	VMOVUPD 96(dig), d

	// load source buffer offsets
	VMOVUPD (AX), off

	prepmask
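	// comparing ones with itself sets all of its bits; XORing against it
	// is how the round-4 code forms a bitwise NOT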
	VPCMPEQD ones, ones, ones

loop:
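	// snapshot the running state; it is added back in after the 64 rounds
	// (MD5's feed-forward step)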
	VMOVAPD a, sa
	VMOVAPD b, sb
	VMOVAPD c, sc
	VMOVAPD d, sd

	prep(0)
	VMOVAPD d, tmp
	store(0)

	ROUND1(a,b,c,d, 1,0x00, 7)
	store(1)
	ROUND1(d,a,b,c, 2,0x01,12)
	store(2)
	ROUND1(c,d,a,b, 3,0x02,17)
	store(3)
	ROUND1(b,c,d,a, 4,0x03,22)
	store(4)
	ROUND1(a,b,c,d, 5,0x04, 7)
	store(5)
	ROUND1(d,a,b,c, 6,0x05,12)
	store(6)
	ROUND1(c,d,a,b, 7,0x06,17)
	store(7)
	ROUND1(b,c,d,a, 8,0x07,22)
	store(8)
	ROUND1(a,b,c,d, 9,0x08, 7)
	store(9)
	ROUND1(d,a,b,c,10,0x09,12)
	store(10)
	ROUND1(c,d,a,b,11,0x0a,17)
	store(11)
	ROUND1(b,c,d,a,12,0x0b,22)
	store(12)
	ROUND1(a,b,c,d,13,0x0c, 7)
	store(13)
	ROUND1(d,a,b,c,14,0x0d,12)
	store(14)
	ROUND1(c,d,a,b,15,0x0e,17)
	store(15)
	ROUND1load(b,c,d,a, 1,0x0f,22)

	VMOVAPD d, tmp
	VMOVAPD d, tmp2

	ROUND2(a,b,c,d, 6,0x10, 5)
	ROUND2(d,a,b,c,11,0x11, 9)
	ROUND2(c,d,a,b, 0,0x12,14)
	ROUND2(b,c,d,a, 5,0x13,20)
	ROUND2(a,b,c,d,10,0x14, 5)
	ROUND2(d,a,b,c,15,0x15, 9)
	ROUND2(c,d,a,b, 4,0x16,14)
	ROUND2(b,c,d,a, 9,0x17,20)
	ROUND2(a,b,c,d,14,0x18, 5)
	ROUND2(d,a,b,c, 3,0x19, 9)
	ROUND2(c,d,a,b, 8,0x1a,14)
	ROUND2(b,c,d,a,13,0x1b,20)
	ROUND2(a,b,c,d, 2,0x1c, 5)
	ROUND2(d,a,b,c, 7,0x1d, 9)
	ROUND2(c,d,a,b,12,0x1e,14)
	ROUND2(b,c,d,a, 0,0x1f,20)

	load(5)
	VMOVAPD c, tmp

	ROUND3(a,b,c,d, 8,0x20, 4)
	ROUND3(d,a,b,c,11,0x21,11)
	ROUND3(c,d,a,b,14,0x22,16)
	ROUND3(b,c,d,a, 1,0x23,23)
	ROUND3(a,b,c,d, 4,0x24, 4)
	ROUND3(d,a,b,c, 7,0x25,11)
	ROUND3(c,d,a,b,10,0x26,16)
	ROUND3(b,c,d,a,13,0x27,23)
	ROUND3(a,b,c,d, 0,0x28, 4)
	ROUND3(d,a,b,c, 3,0x29,11)
	ROUND3(c,d,a,b, 6,0x2a,16)
	ROUND3(b,c,d,a, 9,0x2b,23)
	ROUND3(a,b,c,d,12,0x2c, 4)
	ROUND3(d,a,b,c,15,0x2d,11)
	ROUND3(c,d,a,b, 2,0x2e,16)
	ROUND3(b,c,d,a, 0,0x2f,23)

	load(0)
	VPXOR d, ones, tmp

	ROUND4(a,b,c,d, 7,0x30, 6)
	ROUND4(d,a,b,c,14,0x31,10)
	ROUND4(c,d,a,b, 5,0x32,15)
	ROUND4(b,c,d,a,12,0x33,21)
	ROUND4(a,b,c,d, 3,0x34, 6)
	ROUND4(d,a,b,c,10,0x35,10)
	ROUND4(c,d,a,b, 1,0x36,15)
	ROUND4(b,c,d,a, 8,0x37,21)
	ROUND4(a,b,c,d,15,0x38, 6)
	ROUND4(d,a,b,c, 6,0x39,10)
	ROUND4(c,d,a,b,13,0x3a,15)
	ROUND4(b,c,d,a, 4,0x3b,21)
	ROUND4(a,b,c,d,11,0x3c, 6)
	ROUND4(d,a,b,c, 2,0x3d,10)
	ROUND4(c,d,a,b, 9,0x3e,15)
	ROUND4(b,c,d,a, 0,0x3f,21)

	VPADDD sa, a, a
	VPADDD sb, b, b
	VPADDD sc, c, c
	VPADDD sd, d, d

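	// advance every lane to its next 64-byte block; n is assumed to be a
	// positive multiple of 64, so the loop runs once per MD5 block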
	LEAQ 64(base), base
	SUBQ $64, count
	JNE  loop

	VMOVUPD a, (dig)
	VMOVUPD b, 32(dig)
	VMOVUPD c, 64(dig)
	VMOVUPD d, 96(dig)

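	// clear the upper YMM halves to avoid AVX-SSE transition penalties
	// in the caller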
	VZEROUPPER
	RET