path: root/vendor/github.com/minio/md5-simd/block8_amd64.s
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block8_amd64.s')
-rw-r--r--  vendor/github.com/minio/md5-simd/block8_amd64.s | 281
1 file changed, 281 insertions, 0 deletions
diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s
new file mode 100644
index 0000000..f57db17
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block8_amd64.s
@@ -0,0 +1,281 @@
//+build !noasm,!appengine,gc

// Copyright (c) 2018 Igneous Systems
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// This is the AVX2 implementation of the MD5 block function (8-way parallel)

// block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
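//
// Each ymm register holds one MD5 state word for all 8 lanes: the digest at
// `state` is stored as four 32-byte vectors (a, b, c, d), `bufs` holds 8
// 32-bit byte offsets relative to `base` (one per lane), and `cache` is
// scratch space for the 16 gathered message words of the current block.
// `n` is the number of bytes to hash per lane and is assumed to be a multiple
// of 64, since the loop below consumes exactly 64 bytes per iteration.
// ·avx256md5consts holds the 64 round constants, each broadcast across a
// 32-byte vector (hence the 32*const indexing in the round macros).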
TEXT ·block8(SB), 4, $0-40
    MOVQ state+0(FP), BX
    MOVQ base+8(FP), SI
    MOVQ bufs+16(FP), AX
    MOVQ cache+24(FP), CX
    MOVQ n+32(FP), DX
    MOVQ ·avx256md5consts+0(SB), DI

    // Align cache (which is stack allocated by the compiler)
    // to a 256 bit boundary (ymm register alignment)
    // The cache8 type is deliberately oversized to permit this.
    ADDQ $31, CX
    ANDB $-32, CL

#define a Y0
#define b Y1
#define c Y2
#define d Y3

#define sa Y4
#define sb Y5
#define sc Y6
#define sd Y7

#define tmp Y8
#define tmp2 Y9

#define mask Y10
#define off Y11

#define ones Y12

#define rtmp1 Y13
#define rtmp2 Y14

#define mem Y15

#define dig BX
#define cache CX
#define count DX
#define base SI
#define consts DI

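// prepmask builds the gather mask: after zeroing, VPCMPGTD sets a lane to all
// ones when its buffer offset is greater than zero, so only those lanes take
// part in the gathers below; lanes with a non-positive offset (presumably the
// unused ones) are left untouched by VPGATHERDD.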
#define prepmask \
    VPXOR mask, mask, mask \
    VPCMPGTD mask, off, mask

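// prep(index) gathers message word `index` for all 8 lanes from
// base + off[lane] + index*4. VPGATHERDD zeroes its mask register on
// completion, so the mask is copied into rtmp2 before every gather.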
#define prep(index) \
    VMOVAPD mask, rtmp2 \
    VPGATHERDD rtmp2, index*4(base)(off*1), mem

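// load/store spill the gathered words to the cache so that each of the 16
// message words of a block is gathered only once and can be re-read cheaply
// in rounds 2-4; VMOVAPD needs 32-byte alignment, hence the aligned cache above.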
#define load(index) \
    VMOVAPD index*32(cache), mem

#define store(index) \
    VMOVAPD mem, index*32(cache)

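// roll(shift, a) rotates each 32-bit lane of a left by shift bits; AVX2 has no
// packed rotate instruction, so it is composed from two shifts and an OR.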
#define roll(shift, a) \
    VPSLLD $shift, a, rtmp1 \
    VPSRLD $32-shift, a, a \
    VPOR rtmp1, a, a

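// ROUND1 performs one step of MD5 round 1: F(b,c,d) = d ^ (b & (c ^ d)), which
// equals the textbook (b & c) | (~b & d). tmp enters holding d and leaves
// holding c (the next step's d). The gather of the next message word (prep) is
// interleaved with the arithmetic, presumably to hide the gather latency.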
#define ROUND1(a, b, c, d, index, const, shift) \
    VPXOR c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    prep(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

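// ROUND1load is the same round-1 step, but pulls the next message word from
// the cache (load) instead of gathering it (prep): by the last round-1 step
// all 16 words are cached, and word 1 is needed again as the first word of round 2.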
#define ROUND1load(a, b, c, d, index, const, shift) \
    VXORPD c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    load(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

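// ROUND2 performs one step of MD5 round 2: G(b,c,d) = (b & d) | (c & ~d).
// tmp and tmp2 both enter holding d and both leave holding c (the next step's
// d); index selects the cached message word for the following step.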
#define ROUND2(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp2, tmp2 \
    VANDNPD c, tmp, tmp \
    load(index) \
    VPOR tmp, tmp2, tmp2 \
    VMOVAPD c, tmp \
    VPADDD tmp2, a, a \
    VMOVAPD c, tmp2 \
    roll(shift,a) \
    VPADDD b, a, a

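// ROUND3 performs one step of MD5 round 3: H(b,c,d) = b ^ c ^ d.
// tmp enters holding c and leaves holding b (the next step's c).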
#define ROUND3(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    load(index) \
    VPXOR d, tmp, tmp \
    VPXOR b, tmp, tmp \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD b, tmp \
    VPADDD b, a, a

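// ROUND4 performs one step of MD5 round 4: I(b,c,d) = c ^ (b | ~d).
// tmp enters holding ~d (formed by XORing with the all-ones register) and
// leaves holding ~c, which is ~d for the next step.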
#define ROUND4(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPOR b, tmp, tmp \
    VPXOR c, tmp, tmp \
    VPADDD tmp, a, a \
    load(index) \
    roll(shift,a) \
    VPXOR c, ones, tmp \
    VPADDD b, a, a

    // load digest into state registers
    VMOVUPD (dig), a
    VMOVUPD 32(dig), b
    VMOVUPD 64(dig), c
    VMOVUPD 96(dig), d

    // load source buffer offsets
    VMOVUPD (AX), off

    prepmask
    VPCMPEQD ones, ones, ones

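    // ones is all 1s in every lane; round 4 XORs against it to form a bitwise NOT.
    // Each loop iteration hashes one 64-byte block for all 8 lanes: the state is
    // saved in sa-sd, the 64 steps are run, and the saved state is added back at the end.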
loop:
    VMOVAPD a, sa
    VMOVAPD b, sb
    VMOVAPD c, sc
    VMOVAPD d, sd

    prep(0)
    VMOVAPD d, tmp
    store(0)

    ROUND1(a,b,c,d, 1,0x00, 7)
    store(1)
    ROUND1(d,a,b,c, 2,0x01,12)
    store(2)
    ROUND1(c,d,a,b, 3,0x02,17)
    store(3)
    ROUND1(b,c,d,a, 4,0x03,22)
    store(4)
    ROUND1(a,b,c,d, 5,0x04, 7)
    store(5)
    ROUND1(d,a,b,c, 6,0x05,12)
    store(6)
    ROUND1(c,d,a,b, 7,0x06,17)
    store(7)
    ROUND1(b,c,d,a, 8,0x07,22)
    store(8)
    ROUND1(a,b,c,d, 9,0x08, 7)
    store(9)
    ROUND1(d,a,b,c,10,0x09,12)
    store(10)
    ROUND1(c,d,a,b,11,0x0a,17)
    store(11)
    ROUND1(b,c,d,a,12,0x0b,22)
    store(12)
    ROUND1(a,b,c,d,13,0x0c, 7)
    store(13)
    ROUND1(d,a,b,c,14,0x0d,12)
    store(14)
    ROUND1(c,d,a,b,15,0x0e,17)
    store(15)
    ROUND1load(b,c,d,a, 1,0x0f,22)

    VMOVAPD d, tmp
    VMOVAPD d, tmp2

    ROUND2(a,b,c,d, 6,0x10, 5)
    ROUND2(d,a,b,c,11,0x11, 9)
    ROUND2(c,d,a,b, 0,0x12,14)
    ROUND2(b,c,d,a, 5,0x13,20)
    ROUND2(a,b,c,d,10,0x14, 5)
    ROUND2(d,a,b,c,15,0x15, 9)
    ROUND2(c,d,a,b, 4,0x16,14)
    ROUND2(b,c,d,a, 9,0x17,20)
    ROUND2(a,b,c,d,14,0x18, 5)
    ROUND2(d,a,b,c, 3,0x19, 9)
    ROUND2(c,d,a,b, 8,0x1a,14)
    ROUND2(b,c,d,a,13,0x1b,20)
    ROUND2(a,b,c,d, 2,0x1c, 5)
    ROUND2(d,a,b,c, 7,0x1d, 9)
    ROUND2(c,d,a,b,12,0x1e,14)
    ROUND2(b,c,d,a, 0,0x1f,20)

    load(5)
    VMOVAPD c, tmp

    ROUND3(a,b,c,d, 8,0x20, 4)
    ROUND3(d,a,b,c,11,0x21,11)
    ROUND3(c,d,a,b,14,0x22,16)
    ROUND3(b,c,d,a, 1,0x23,23)
    ROUND3(a,b,c,d, 4,0x24, 4)
    ROUND3(d,a,b,c, 7,0x25,11)
    ROUND3(c,d,a,b,10,0x26,16)
    ROUND3(b,c,d,a,13,0x27,23)
    ROUND3(a,b,c,d, 0,0x28, 4)
    ROUND3(d,a,b,c, 3,0x29,11)
    ROUND3(c,d,a,b, 6,0x2a,16)
    ROUND3(b,c,d,a, 9,0x2b,23)
    ROUND3(a,b,c,d,12,0x2c, 4)
    ROUND3(d,a,b,c,15,0x2d,11)
    ROUND3(c,d,a,b, 2,0x2e,16)
    ROUND3(b,c,d,a, 0,0x2f,23)

    load(0)
    VPXOR d, ones, tmp

    ROUND4(a,b,c,d, 7,0x30, 6)
    ROUND4(d,a,b,c,14,0x31,10)
    ROUND4(c,d,a,b, 5,0x32,15)
    ROUND4(b,c,d,a,12,0x33,21)
    ROUND4(a,b,c,d, 3,0x34, 6)
    ROUND4(d,a,b,c,10,0x35,10)
    ROUND4(c,d,a,b, 1,0x36,15)
    ROUND4(b,c,d,a, 8,0x37,21)
    ROUND4(a,b,c,d,15,0x38, 6)
    ROUND4(d,a,b,c, 6,0x39,10)
    ROUND4(c,d,a,b,13,0x3a,15)
    ROUND4(b,c,d,a, 4,0x3b,21)
    ROUND4(a,b,c,d,11,0x3c, 6)
    ROUND4(d,a,b,c, 2,0x3d,10)
    ROUND4(c,d,a,b, 9,0x3e,15)
    ROUND4(b,c,d,a, 0,0x3f,21)

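    // add the saved input state back into the output state
    // (the feed-forward at the end of every MD5 block)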
    VPADDD sa, a, a
    VPADDD sb, b, b
    VPADDD sc, c, c
    VPADDD sd, d, d

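    // advance every lane to its next 64-byte block: base moves forward by 64
    // while the per-lane offsets stay fixed; count holds the remaining bytes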
    LEAQ 64(base), base
    SUBQ $64, count
    JNE  loop

    VMOVUPD a, (dig)
    VMOVUPD b, 32(dig)
    VMOVUPD c, 64(dig)
    VMOVUPD d, 96(dig)

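    // clear the upper ymm halves to avoid AVX-SSE transition penalties in
    // subsequent SSE code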
    VZEROUPPER
    RET