diff options
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block16_amd64.s')
-rw-r--r-- | vendor/github.com/minio/md5-simd/block16_amd64.s | 228 |
1 files changed, 228 insertions, 0 deletions
diff --git a/vendor/github.com/minio/md5-simd/block16_amd64.s b/vendor/github.com/minio/md5-simd/block16_amd64.s new file mode 100644 index 0000000..be0a43a --- /dev/null +++ b/vendor/github.com/minio/md5-simd/block16_amd64.s | |||
@@ -0,0 +1,228 @@ | |||
1 | // Copyright (c) 2020 MinIO Inc. All rights reserved. | ||
2 | // Use of this source code is governed by a license that can be | ||
3 | // found in the LICENSE file. | ||
4 | |||
5 | //+build !noasm,!appengine,gc | ||
6 | |||
7 | // This is the AVX512 implementation of the MD5 block function (16-way parallel) | ||
8 | |||
// prep(index): gather message word `index` (one 32-bit dword per lane) into
// `mem`. The address used for a lane is base + index*4 + ptrs[lane].
// kmask is copied into ktmp first and only ktmp is handed to VPGATHERDD, so
// kmask itself is never modified by the gather (per the Intel SDM a gather
// clears its mask operand as elements complete) and can be reused next time.
// Relies on the kmask/ktmp/base/ptrs/mem aliases #defined inside block16.
9 | #define prep(index) \ | ||
10 | KMOVQ kmask, ktmp \ | ||
11 | VPGATHERDD index*4(base)(ptrs*1), ktmp, mem | ||
12 | |||
// ROUND1: one step of MD5 round 1 on all lanes:
//   a = b + rol(a + F(b,c,d) + K[const] + mem, shift)
//   F(b,c,d) = d ^ (b & (c ^ d))
// Entry: tmp must hold d, mem must hold the message word for this step.
// tmp is first turned into c^d; VPTERNLOGD $0x6C then combines it with b and d
// to yield F in tmp.
// prep(index) is issued after mem has been consumed, so on exit mem holds
// message word `index` (the word the following step needs); presumably it is
// placed mid-step so the gather overlaps the remaining arithmetic.
// Exit: tmp holds c, which is the `d` argument of the next step because the
// call sites rotate the arguments (a,b,c,d) -> (d,a,b,c).
// K[const] is read at byte offset 64*const from `consts`, i.e. one 64-byte
// vector per constant (presumably the constant replicated in every lane; the
// table is defined outside this file).
13 | #define ROUND1(a, b, c, d, index, const, shift) \ | ||
14 | VPXORQ c, tmp, tmp \ | ||
15 | VPADDD 64*const(consts), a, a \ | ||
16 | VPADDD mem, a, a \ | ||
17 | VPTERNLOGD $0x6C, b, d, tmp \ | ||
18 | prep(index) \ | ||
19 | VPADDD tmp, a, a \ | ||
20 | VPROLD $shift, a, a \ | ||
21 | VMOVAPD c, tmp \ | ||
22 | VPADDD b, a, a | ||
23 | |||
// ROUND1noload: identical to ROUND1 except that it does not invoke prep, so no
// further message word is gathered and mem is left unchanged.
//   a = b + rol(a + F(b,c,d) + K[const] + mem, shift)
//   F(b,c,d) = d ^ (b & (c ^ d))
// Entry: tmp must hold d, mem must hold the message word for this step.
// Exit: tmp holds c.
24 | #define ROUND1noload(a, b, c, d, const, shift) \ | ||
25 | VPXORQ c, tmp, tmp \ | ||
26 | VPADDD 64*const(consts), a, a \ | ||
27 | VPADDD mem, a, a \ | ||
28 | VPTERNLOGD $0x6C, b, d, tmp \ | ||
29 | VPADDD tmp, a, a \ | ||
30 | VPROLD $shift, a, a \ | ||
31 | VMOVAPD c, tmp \ | ||
32 | VPADDD b, a, a | ||
33 | |||
// ROUND2: one step of MD5 round 2 on all lanes:
//   a = b + rol(a + G(b,c,d) + K[const] + zreg, shift)
//   G(b,c,d) = (b & d) | (c & ~d)
// zreg is the register holding the message word for this step.
// Entry: tmp and tmp2 must BOTH hold d.
// VANDNPD turns tmp into (~d & c); VPTERNLOGD $0xEC then computes
// tmp | (tmp2 & b) into tmp2, which is G.
// Exit: tmp and tmp2 both hold c, i.e. the `d` argument of the next step given
// the (a,b,c,d) -> (d,a,b,c) rotation used at the call sites.
34 | #define ROUND2(a, b, c, d, zreg, const, shift) \ | ||
35 | VPADDD 64*const(consts), a, a \ | ||
36 | VPADDD zreg, a, a \ | ||
37 | VANDNPD c, tmp, tmp \ | ||
38 | VPTERNLOGD $0xEC, b, tmp, tmp2 \ | ||
39 | VMOVAPD c, tmp \ | ||
40 | VPADDD tmp2, a, a \ | ||
41 | VMOVAPD c, tmp2 \ | ||
42 | VPROLD $shift, a, a \ | ||
43 | VPADDD b, a, a | ||
44 | |||
// ROUND3: one step of MD5 round 3 on all lanes:
//   a = b + rol(a + H(b,c,d) + K[const] + zreg, shift)
//   H(b,c,d) = b ^ c ^ d
// zreg is the register holding the message word for this step.
// Entry: tmp must hold c. VPTERNLOGD $0x96 is a three-way XOR of tmp, d and b.
// Exit: tmp holds b, which is the `c` argument of the next step given the
// (a,b,c,d) -> (d,a,b,c) rotation used at the call sites.
45 | #define ROUND3(a, b, c, d, zreg, const, shift) \ | ||
46 | VPADDD 64*const(consts), a, a \ | ||
47 | VPADDD zreg, a, a \ | ||
48 | VPTERNLOGD $0x96, b, d, tmp \ | ||
49 | VPADDD tmp, a, a \ | ||
50 | VPROLD $shift, a, a \ | ||
51 | VMOVAPD b, tmp \ | ||
52 | VPADDD b, a, a | ||
53 | |||
// ROUND4: one step of MD5 round 4 on all lanes:
//   a = b + rol(a + I(b,c,d) + K[const] + zreg, shift)
//   I(b,c,d) = c ^ (b | ~d)
// zreg is the register holding the message word for this step.
// Entry: tmp must hold ~d. VPTERNLOGD $0x36 computes c ^ (tmp | b) into tmp.
// Exit: tmp holds ~c (c XOR the all-ones register `ones`), which is ~d for the
// next step given the (a,b,c,d) -> (d,a,b,c) rotation used at the call sites.
54 | #define ROUND4(a, b, c, d, zreg, const, shift) \ | ||
55 | VPADDD 64*const(consts), a, a \ | ||
56 | VPADDD zreg, a, a \ | ||
57 | VPTERNLOGD $0x36, b, c, tmp \ | ||
58 | VPADDD tmp, a, a \ | ||
59 | VPROLD $shift, a, a \ | ||
60 | VPXORQ c, ones, tmp \ | ||
61 | VPADDD b, a, a | ||
62 | |||
// block16 runs the MD5 block function over 16 lanes in parallel.
// Arguments as read from the frame below (40 bytes of arguments, frame size 0):
//   state+0(FP)  pointer to the digest state: four 64-byte vectors a, b, c, d
//                at offsets 0x00, 0x40, 0x80 and 0xc0
//   base+8(FP)   base address added to every lane offset by the gathers
//   ptrs+16(FP)  pointer to 64 bytes of per-lane 32-bit offsets
//   mask+24(FP)  lane mask, kept in K1 and used for every gather
//   n+32(FP)     byte count; 64 is subtracted per iteration and the loop exits
//                only when the result is exactly zero, so n must be a positive
//                multiple of 64
// The TEXT flag value 4 is NOSPLIT in Go's textflag.h.
63 | TEXT ·block16(SB), 4, $0-40 | ||
64 | |||
65 | MOVQ state+0(FP), BX | ||
66 | MOVQ base+8(FP), SI | ||
67 | MOVQ ptrs+16(FP), AX | ||
68 | KMOVQ mask+24(FP), K1 | ||
69 | MOVQ n+32(FP), DX | ||
// Loads the 8-byte value stored AT the symbol (not the symbol's address), so
// the symbol is expected to hold a pointer to the constant table.
// NOTE(review): confirm against the Go declaration of avx512md5consts.
70 | MOVQ ·avx512md5consts+0(SB), DI | ||
71 | |||
// Working MD5 state, one 32-bit value per lane in each register.
72 | #define a Z0 | ||
73 | #define b Z1 | ||
74 | #define c Z2 | ||
75 | #define d Z3 | ||
76 | |||
// Copy of the state taken at the top of each iteration; added back at the end.
77 | #define sa Z4 | ||
78 | #define sb Z5 | ||
79 | #define sc Z6 | ||
80 | #define sd Z7 | ||
81 | |||
// tmp/tmp2: scratch carried between round macros (see each macro's entry/exit
// contract). ptrs: per-lane offsets for the gathers. ones: all bits set.
// mem: destination of the most recent gather.
82 | #define tmp Z8 | ||
83 | #define tmp2 Z9 | ||
84 | #define ptrs Z10 | ||
85 | #define ones Z12 | ||
86 | #define mem Z15 | ||
87 | |||
// kmask is the caller's lane mask; ktmp is the copy handed to each gather.
88 | #define kmask K1 | ||
89 | #define ktmp K3 | ||
90 | |||
91 | // ---------------------------------------------------------- | ||
92 | // Registers Z16 through to Z31 are used for caching purposes | ||
93 | // ---------------------------------------------------------- | ||
94 | |||
// Scalar register aliases for the values loaded from the frame above.
95 | #define dig BX | ||
96 | #define count DX | ||
97 | #define base SI | ||
98 | #define consts DI | ||
99 | |||
100 | // load digest into state registers | ||
101 | VMOVUPD (dig), a | ||
102 | VMOVUPD 0x40(dig), b | ||
103 | VMOVUPD 0x80(dig), c | ||
104 | VMOVUPD 0xc0(dig), d | ||
105 | |||
106 | // load source pointers | ||
107 | VMOVUPD 0x00(AX), ptrs | ||
108 | |||
// AX is free again after the load above; use it to build the all-ones vector
// that ROUND4 XORs with to form bitwise NOT.
109 | MOVQ $-1, AX | ||
110 | VPBROADCASTQ AX, ones | ||
111 | |||
// One iteration consumes 64 bytes per lane, starting at `base`.
112 | loop: | ||
// Save the incoming state so it can be added back after the 64 steps.
113 | VMOVAPD a, sa | ||
114 | VMOVAPD b, sb | ||
115 | VMOVAPD c, sc | ||
116 | VMOVAPD d, sd | ||
117 | |||
// Gather message word 0, set up ROUND1's entry condition (tmp = d) and cache
// word 0 in Z16.
118 | prep(0) | ||
119 | VMOVAPD d, tmp | ||
120 | VMOVAPD mem, Z16 | ||
121 | |||
// Round 1. Each ROUND1 consumes the word currently in mem and gathers the
// word named by its index argument; the VMOVAPD that follows caches that newly
// gathered word, so Z16..Z31 end up holding message words 0..15.
122 | ROUND1(a,b,c,d, 1,0x00, 7) | ||
123 | VMOVAPD mem, Z17 | ||
124 | ROUND1(d,a,b,c, 2,0x01,12) | ||
125 | VMOVAPD mem, Z18 | ||
126 | ROUND1(c,d,a,b, 3,0x02,17) | ||
127 | VMOVAPD mem, Z19 | ||
128 | ROUND1(b,c,d,a, 4,0x03,22) | ||
129 | VMOVAPD mem, Z20 | ||
130 | ROUND1(a,b,c,d, 5,0x04, 7) | ||
131 | VMOVAPD mem, Z21 | ||
132 | ROUND1(d,a,b,c, 6,0x05,12) | ||
133 | VMOVAPD mem, Z22 | ||
134 | ROUND1(c,d,a,b, 7,0x06,17) | ||
135 | VMOVAPD mem, Z23 | ||
136 | ROUND1(b,c,d,a, 8,0x07,22) | ||
137 | VMOVAPD mem, Z24 | ||
138 | ROUND1(a,b,c,d, 9,0x08, 7) | ||
139 | VMOVAPD mem, Z25 | ||
140 | ROUND1(d,a,b,c,10,0x09,12) | ||
141 | VMOVAPD mem, Z26 | ||
142 | ROUND1(c,d,a,b,11,0x0a,17) | ||
143 | VMOVAPD mem, Z27 | ||
144 | ROUND1(b,c,d,a,12,0x0b,22) | ||
145 | VMOVAPD mem, Z28 | ||
146 | ROUND1(a,b,c,d,13,0x0c, 7) | ||
147 | VMOVAPD mem, Z29 | ||
148 | ROUND1(d,a,b,c,14,0x0d,12) | ||
149 | VMOVAPD mem, Z30 | ||
150 | ROUND1(c,d,a,b,15,0x0e,17) | ||
151 | VMOVAPD mem, Z31 | ||
152 | |||
// Last step of round 1: uses word 15 (still in mem); nothing left to gather.
153 | ROUND1noload(b,c,d,a, 0x0f,22) | ||
154 | |||
// ROUND2 entry condition: tmp and tmp2 both hold d.
155 | VMOVAPD d, tmp | ||
156 | VMOVAPD d, tmp2 | ||
157 | |||
// Round 2: message words 1,6,11,0,5,10,15,4,9,14,3,8,13,2,7,12 (register
// Z(16+k) holds word k).
158 | ROUND2(a,b,c,d, Z17,0x10, 5) | ||
159 | ROUND2(d,a,b,c, Z22,0x11, 9) | ||
160 | ROUND2(c,d,a,b, Z27,0x12,14) | ||
161 | ROUND2(b,c,d,a, Z16,0x13,20) | ||
162 | ROUND2(a,b,c,d, Z21,0x14, 5) | ||
163 | ROUND2(d,a,b,c, Z26,0x15, 9) | ||
164 | ROUND2(c,d,a,b, Z31,0x16,14) | ||
165 | ROUND2(b,c,d,a, Z20,0x17,20) | ||
166 | ROUND2(a,b,c,d, Z25,0x18, 5) | ||
167 | ROUND2(d,a,b,c, Z30,0x19, 9) | ||
168 | ROUND2(c,d,a,b, Z19,0x1a,14) | ||
169 | ROUND2(b,c,d,a, Z24,0x1b,20) | ||
170 | ROUND2(a,b,c,d, Z29,0x1c, 5) | ||
171 | ROUND2(d,a,b,c, Z18,0x1d, 9) | ||
172 | ROUND2(c,d,a,b, Z23,0x1e,14) | ||
173 | ROUND2(b,c,d,a, Z28,0x1f,20) | ||
174 | |||
// ROUND3 entry condition: tmp holds c.
175 | VMOVAPD c, tmp | ||
176 | |||
// Round 3: message words 5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2.
177 | ROUND3(a,b,c,d, Z21,0x20, 4) | ||
178 | ROUND3(d,a,b,c, Z24,0x21,11) | ||
179 | ROUND3(c,d,a,b, Z27,0x22,16) | ||
180 | ROUND3(b,c,d,a, Z30,0x23,23) | ||
181 | ROUND3(a,b,c,d, Z17,0x24, 4) | ||
182 | ROUND3(d,a,b,c, Z20,0x25,11) | ||
183 | ROUND3(c,d,a,b, Z23,0x26,16) | ||
184 | ROUND3(b,c,d,a, Z26,0x27,23) | ||
185 | ROUND3(a,b,c,d, Z29,0x28, 4) | ||
186 | ROUND3(d,a,b,c, Z16,0x29,11) | ||
187 | ROUND3(c,d,a,b, Z19,0x2a,16) | ||
188 | ROUND3(b,c,d,a, Z22,0x2b,23) | ||
189 | ROUND3(a,b,c,d, Z25,0x2c, 4) | ||
190 | ROUND3(d,a,b,c, Z28,0x2d,11) | ||
191 | ROUND3(c,d,a,b, Z31,0x2e,16) | ||
192 | ROUND3(b,c,d,a, Z18,0x2f,23) | ||
193 | |||
// ROUND4 entry condition: tmp holds ~d (d XOR all-ones).
194 | VPXORQ d, ones, tmp | ||
195 | |||
// Round 4: message words 0,7,14,5,12,3,10,1,8,15,6,13,4,11,2,9.
196 | ROUND4(a,b,c,d, Z16,0x30, 6) | ||
197 | ROUND4(d,a,b,c, Z23,0x31,10) | ||
198 | ROUND4(c,d,a,b, Z30,0x32,15) | ||
199 | ROUND4(b,c,d,a, Z21,0x33,21) | ||
200 | ROUND4(a,b,c,d, Z28,0x34, 6) | ||
201 | ROUND4(d,a,b,c, Z19,0x35,10) | ||
202 | ROUND4(c,d,a,b, Z26,0x36,15) | ||
203 | ROUND4(b,c,d,a, Z17,0x37,21) | ||
204 | ROUND4(a,b,c,d, Z24,0x38, 6) | ||
205 | ROUND4(d,a,b,c, Z31,0x39,10) | ||
206 | ROUND4(c,d,a,b, Z22,0x3a,15) | ||
207 | ROUND4(b,c,d,a, Z29,0x3b,21) | ||
208 | ROUND4(a,b,c,d, Z20,0x3c, 6) | ||
209 | ROUND4(d,a,b,c, Z27,0x3d,10) | ||
210 | ROUND4(c,d,a,b, Z18,0x3e,15) | ||
211 | ROUND4(b,c,d,a, Z25,0x3f,21) | ||
212 | |||
// Add the state saved at the top of the iteration back into the result.
213 | VPADDD sa, a, a | ||
214 | VPADDD sb, b, b | ||
215 | VPADDD sc, c, c | ||
216 | VPADDD sd, d, d | ||
217 | |||
// Advance to the next 64-byte block. The loop repeats until count reaches
// exactly zero, so a count that is zero on entry or not a multiple of 64
// never terminates normally.
218 | LEAQ 64(base), base | ||
219 | SUBQ $64, count | ||
220 | JNE loop | ||
221 | |||
// Write the updated state back over the caller's digest.
222 | VMOVUPD a, (dig) | ||
223 | VMOVUPD b, 0x40(dig) | ||
224 | VMOVUPD c, 0x80(dig) | ||
225 | VMOVUPD d, 0xc0(dig) | ||
226 | |||
// Clear the upper vector state before returning to non-AVX code.
227 | VZEROUPPER | ||
228 | RET | ||