Diffstat (limited to 'vendor/github.com/minio/md5-simd/block16_amd64.s')
 -rw-r--r--  vendor/github.com/minio/md5-simd/block16_amd64.s  228
 1 file changed, 228 insertions, 0 deletions
diff --git a/vendor/github.com/minio/md5-simd/block16_amd64.s b/vendor/github.com/minio/md5-simd/block16_amd64.s
new file mode 100644
index 0000000..be0a43a
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block16_amd64.s
@@ -0,0 +1,228 @@
// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

//+build !noasm,!appengine,gc

// This is the AVX512 implementation of the MD5 block function (16-way parallel)

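// prep(index) gathers dword number `index` of the current 64-byte block from
// each of the 16 input buffers into `mem`, using a masked VPGATHERDD at
// base + ptrs[lane] + index*4. The gather clears its opmask register as lanes
// complete, so the mask is reloaded from kmask on every use; lanes whose mask
// bit is zero (inactive streams) are left untouched.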
#define prep(index) \
    KMOVQ      kmask, ktmp                      \
    VPGATHERDD index*4(base)(ptrs*1), ktmp, mem

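// ROUND1 performs one step of MD5 round 1: F(b,c,d) = (b AND c) OR (NOT b AND d).
// tmp holds d on entry; the leading VPXORQ turns it into c XOR d, and
// VPTERNLOGD with immediate 0x6C then yields F. The gather for message word
// `index` is interleaved with the arithmetic, and the caller copies each
// freshly gathered word into Z16-Z31 for reuse in rounds 2-4. The trailing
// VMOVAPD leaves c in tmp, which is the next step's d.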
#define ROUND1(a, b, c, d, index, const, shift) \
    VPXORQ     c, tmp, tmp            \
    VPADDD     64*const(consts), a, a \
    VPADDD     mem, a, a              \
    VPTERNLOGD $0x6C, b, d, tmp       \
    prep(index)                       \
    VPADDD     tmp, a, a              \
    VPROLD     $shift, a, a           \
    VMOVAPD    c, tmp                 \
    VPADDD     b, a, a

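// ROUND1noload is ROUND1 without the embedded gather; it is used for the last
// step of round 1, once all 16 message words of the block have been fetched
// and cached in Z16-Z31.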
#define ROUND1noload(a, b, c, d, const, shift) \
    VPXORQ     c, tmp, tmp            \
    VPADDD     64*const(consts), a, a \
    VPADDD     mem, a, a              \
    VPTERNLOGD $0x6C, b, d, tmp       \
    VPADDD     tmp, a, a              \
    VPROLD     $shift, a, a           \
    VMOVAPD    c, tmp                 \
    VPADDD     b, a, a

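// ROUND2 performs one step of MD5 round 2: G(b,c,d) = (b AND d) OR (c AND NOT d).
// tmp and tmp2 hold d on entry; VANDNPD produces c AND NOT d in tmp, and
// VPTERNLOGD with immediate 0xEC ORs in b AND d via tmp2, leaving G in tmp2.
// The message word comes from the cached zreg instead of memory, and the
// trailing moves reload tmp/tmp2 with c, the next step's d.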
#define ROUND2(a, b, c, d, zreg, const, shift) \
    VPADDD     64*const(consts), a, a \
    VPADDD     zreg, a, a             \
    VANDNPD    c, tmp, tmp            \
    VPTERNLOGD $0xEC, b, tmp, tmp2    \
    VMOVAPD    c, tmp                 \
    VPADDD     tmp2, a, a             \
    VMOVAPD    c, tmp2                \
    VPROLD     $shift, a, a           \
    VPADDD     b, a, a

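// ROUND3 performs one step of MD5 round 3: H(b,c,d) = b XOR c XOR d, computed
// with VPTERNLOGD immediate 0x96 (three-input XOR) on tmp, b and d, where tmp
// holds c on entry. The trailing VMOVAPD preloads tmp with b, which is the
// next step's c.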
#define ROUND3(a, b, c, d, zreg, const, shift) \
    VPADDD     64*const(consts), a, a \
    VPADDD     zreg, a, a             \
    VPTERNLOGD $0x96, b, d, tmp       \
    VPADDD     tmp, a, a              \
    VPROLD     $shift, a, a           \
    VMOVAPD    b, tmp                 \
    VPADDD     b, a, a

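// ROUND4 performs one step of MD5 round 4: I(b,c,d) = c XOR (b OR NOT d),
// computed with VPTERNLOGD immediate 0x36, where tmp holds NOT d on entry.
// The trailing VPXORQ against `ones` computes NOT c, which is the next step's
// NOT d.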
#define ROUND4(a, b, c, d, zreg, const, shift) \
    VPADDD     64*const(consts), a, a \
    VPADDD     zreg, a, a             \
    VPTERNLOGD $0x36, b, c, tmp       \
    VPADDD     tmp, a, a              \
    VPROLD     $shift, a, a           \
    VPXORQ     c, ones, tmp           \
    VPADDD     b, a, a

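// block16 hashes 64-byte blocks of up to 16 independent inputs in parallel,
// one input per 32-bit ZMM lane. Arguments (40 bytes on the frame):
//   state+0(FP)  pointer to the 16-way digest state (16 a's, then 16 b's, c's, d's)
//   base+8(FP)   base address the per-lane offsets are relative to
//   ptrs+16(FP)  pointer to 16 32-bit offsets, one per input stream
//   mask+24(FP)  opmask selecting the active lanes
//   n+32(FP)     number of bytes to process (the loop assumes a multiple of 64)
// The Go-side declaration is presumably along the lines of
//   func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)
// (parameter types are inferred from the frame layout, not taken from this file).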
TEXT ·block16(SB), 4, $0-40

    MOVQ  state+0(FP), BX
    MOVQ  base+8(FP), SI
    MOVQ  ptrs+16(FP), AX
    KMOVQ mask+24(FP), K1
    MOVQ  n+32(FP), DX
    MOVQ  ·avx512md5consts+0(SB), DI

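// Register roles: a-d hold the 16 interleaved working states (one 32-bit lane
// per input), sa-sd keep a copy of the state at the start of each block so it
// can be added back at the end, tmp/tmp2 are scratch for the round functions,
// ptrs holds the per-lane offsets used by the gathers, ones is all-ones (used
// to build bitwise NOT), and mem receives the most recently gathered word.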
#define a Z0
#define b Z1
#define c Z2
#define d Z3

#define sa Z4
#define sb Z5
#define sc Z6
#define sd Z7

#define tmp  Z8
#define tmp2 Z9
#define ptrs Z10
#define ones Z12
#define mem  Z15

#define kmask K1
#define ktmp  K3

// ----------------------------------------------------------
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------

#define dig    BX
#define count  DX
#define base   SI
#define consts DI

    // load digest into state registers
    VMOVUPD (dig), a
    VMOVUPD 0x40(dig), b
    VMOVUPD 0x80(dig), c
    VMOVUPD 0xc0(dig), d

    // load source pointers
    VMOVUPD 0x00(AX), ptrs

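    // ones = 0xffffffff in every lane; XOR-ing against it implements the
    // bitwise NOT needed by round 4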
    MOVQ         $-1, AX
    VPBROADCASTQ AX, ones

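    // each iteration processes one 64-byte block in every active lane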
loop:
    VMOVAPD a, sa
    VMOVAPD b, sb
    VMOVAPD c, sc
    VMOVAPD d, sd

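    // Round 1: each step adds the word fetched by the previous gather and
    // issues the gather for the next one; every gathered word is cached in
    // Z16-Z31 so rounds 2-4 can reuse it without touching memory again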
    prep(0)
    VMOVAPD d, tmp
    VMOVAPD mem, Z16

    ROUND1(a,b,c,d, 1,0x00, 7)
    VMOVAPD mem, Z17
    ROUND1(d,a,b,c, 2,0x01,12)
    VMOVAPD mem, Z18
    ROUND1(c,d,a,b, 3,0x02,17)
    VMOVAPD mem, Z19
    ROUND1(b,c,d,a, 4,0x03,22)
    VMOVAPD mem, Z20
    ROUND1(a,b,c,d, 5,0x04, 7)
    VMOVAPD mem, Z21
    ROUND1(d,a,b,c, 6,0x05,12)
    VMOVAPD mem, Z22
    ROUND1(c,d,a,b, 7,0x06,17)
    VMOVAPD mem, Z23
    ROUND1(b,c,d,a, 8,0x07,22)
    VMOVAPD mem, Z24
    ROUND1(a,b,c,d, 9,0x08, 7)
    VMOVAPD mem, Z25
    ROUND1(d,a,b,c,10,0x09,12)
    VMOVAPD mem, Z26
    ROUND1(c,d,a,b,11,0x0a,17)
    VMOVAPD mem, Z27
    ROUND1(b,c,d,a,12,0x0b,22)
    VMOVAPD mem, Z28
    ROUND1(a,b,c,d,13,0x0c, 7)
    VMOVAPD mem, Z29
    ROUND1(d,a,b,c,14,0x0d,12)
    VMOVAPD mem, Z30
    ROUND1(c,d,a,b,15,0x0e,17)
    VMOVAPD mem, Z31

    ROUND1noload(b,c,d,a, 0x0f,22)

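    // Round 2: message words come from the Z16-Z31 cache in the order
    // 1, 6, 11, 0, ... (the standard MD5 schedule (5*i + 1) mod 16);
    // tmp and tmp2 are seeded with d as ROUND2 expects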
    VMOVAPD d, tmp
    VMOVAPD d, tmp2

    ROUND2(a,b,c,d, Z17,0x10, 5)
    ROUND2(d,a,b,c, Z22,0x11, 9)
    ROUND2(c,d,a,b, Z27,0x12,14)
    ROUND2(b,c,d,a, Z16,0x13,20)
    ROUND2(a,b,c,d, Z21,0x14, 5)
    ROUND2(d,a,b,c, Z26,0x15, 9)
    ROUND2(c,d,a,b, Z31,0x16,14)
    ROUND2(b,c,d,a, Z20,0x17,20)
    ROUND2(a,b,c,d, Z25,0x18, 5)
    ROUND2(d,a,b,c, Z30,0x19, 9)
    ROUND2(c,d,a,b, Z19,0x1a,14)
    ROUND2(b,c,d,a, Z24,0x1b,20)
    ROUND2(a,b,c,d, Z29,0x1c, 5)
    ROUND2(d,a,b,c, Z18,0x1d, 9)
    ROUND2(c,d,a,b, Z23,0x1e,14)
    ROUND2(b,c,d,a, Z28,0x1f,20)

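    // Round 3: word order 5, 8, 11, 14, ... ((3*i + 5) mod 16);
    // tmp is seeded with c as ROUND3 expects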
    VMOVAPD c, tmp

    ROUND3(a,b,c,d, Z21,0x20, 4)
    ROUND3(d,a,b,c, Z24,0x21,11)
    ROUND3(c,d,a,b, Z27,0x22,16)
    ROUND3(b,c,d,a, Z30,0x23,23)
    ROUND3(a,b,c,d, Z17,0x24, 4)
    ROUND3(d,a,b,c, Z20,0x25,11)
    ROUND3(c,d,a,b, Z23,0x26,16)
    ROUND3(b,c,d,a, Z26,0x27,23)
    ROUND3(a,b,c,d, Z29,0x28, 4)
    ROUND3(d,a,b,c, Z16,0x29,11)
    ROUND3(c,d,a,b, Z19,0x2a,16)
    ROUND3(b,c,d,a, Z22,0x2b,23)
    ROUND3(a,b,c,d, Z25,0x2c, 4)
    ROUND3(d,a,b,c, Z28,0x2d,11)
    ROUND3(c,d,a,b, Z31,0x2e,16)
    ROUND3(b,c,d,a, Z18,0x2f,23)

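    // Round 4: word order 0, 7, 14, 5, ... ((7*i) mod 16);
    // tmp is seeded with NOT d as ROUND4 expects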
    VPXORQ d, ones, tmp

    ROUND4(a,b,c,d, Z16,0x30, 6)
    ROUND4(d,a,b,c, Z23,0x31,10)
    ROUND4(c,d,a,b, Z30,0x32,15)
    ROUND4(b,c,d,a, Z21,0x33,21)
    ROUND4(a,b,c,d, Z28,0x34, 6)
    ROUND4(d,a,b,c, Z19,0x35,10)
    ROUND4(c,d,a,b, Z26,0x36,15)
    ROUND4(b,c,d,a, Z17,0x37,21)
    ROUND4(a,b,c,d, Z24,0x38, 6)
    ROUND4(d,a,b,c, Z31,0x39,10)
    ROUND4(c,d,a,b, Z22,0x3a,15)
    ROUND4(b,c,d,a, Z29,0x3b,21)
    ROUND4(a,b,c,d, Z20,0x3c, 6)
    ROUND4(d,a,b,c, Z27,0x3d,10)
    ROUND4(c,d,a,b, Z18,0x3e,15)
    ROUND4(b,c,d,a, Z25,0x3f,21)

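    // add the state saved at the top of the loop back into the digests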
    VPADDD sa, a, a
    VPADDD sb, b, b
    VPADDD sc, c, c
    VPADDD sd, d, d

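    // advance every lane by one 64-byte block and loop until n bytes are done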
    LEAQ 64(base), base
    SUBQ $64, count
    JNE  loop

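    // write the updated 16-way digest back; VZEROUPPER avoids AVX/SSE
    // transition penalties in the caller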
    VMOVUPD a, (dig)
    VMOVUPD b, 0x40(dig)
    VMOVUPD c, 0x80(dig)
    VMOVUPD d, 0xc0(dig)

    VZEROUPPER
    RET