Diffstat (limited to 'vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.asm'):
 vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.asm | 686 ++++++++++
 1 file changed, 686 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.asm b/vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.asm
new file mode 100644
index 0000000..c959b1a
--- /dev/null
+++ b/vendor/github.com/minio/sha256-simd/sha256blockAvx512_amd64.asm
@@ -0,0 +1,686 @@

// 16x Parallel implementation of SHA256 for AVX512

//
// Minio Cloud Storage, (C) 2017 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//
// This code is based on the Intel Multi-Buffer Crypto for IPSec library
// and more specifically the following implementation:
// https://github.com/intel/intel-ipsec-mb/blob/master/avx512/sha256_x16_avx512.asm
//
// For Go, it has been converted into Plan 9 assembly with the help of
// github.com/minio/asm2plan9s to assemble the AVX512 instructions
//

// Copyright (c) 2017, Intel Corporation
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright notice,
//       this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of Intel Corporation nor the names of its contributors
//       may be used to endorse or promote products derived from this software
//       without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#define SHA256_DIGEST_ROW_SIZE 64
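// Each digest word (A..H) occupies one 64-byte row of the interleaved digest
// matrix: 16 lanes x 4 bytes. SHA256_DIGEST_ROW_SIZE above is the stride
// between successive rows, as used in the [STATE + n*SHA256_DIGEST_ROW_SIZE]
// loads and stores below.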

// arg1
#define STATE rdi
#define STATE_P9 DI
// arg2
#define INP_SIZE rsi
#define INP_SIZE_P9 SI

#define IDX rcx
#define TBL rdx
#define TBL_P9 DX

#define INPUT rax
#define INPUT_P9 AX

#define inp0 r9
#define SCRATCH_P9 R12
#define SCRATCH r12
#define maskp r13
#define MASKP_P9 R13
#define mask r14
#define MASK_P9 R14

#define A zmm0
#define B zmm1
#define C zmm2
#define D zmm3
#define E zmm4
#define F zmm5
#define G zmm6
#define H zmm7
#define T1 zmm8
#define TMP0 zmm9
#define TMP1 zmm10
#define TMP2 zmm11
#define TMP3 zmm12
#define TMP4 zmm13
#define TMP5 zmm14
#define TMP6 zmm15

#define W0 zmm16
#define W1 zmm17
#define W2 zmm18
#define W3 zmm19
#define W4 zmm20
#define W5 zmm21
#define W6 zmm22
#define W7 zmm23
#define W8 zmm24
#define W9 zmm25
#define W10 zmm26
#define W11 zmm27
#define W12 zmm28
#define W13 zmm29
#define W14 zmm30
#define W15 zmm31


#define TRANSPOSE16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _r10, _r11, _r12, _r13, _r14, _r15, _t0, _t1) \
    \
    \ // input r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
    \ //       r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
    \ //       r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
    \ //       r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
    \ //       r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
    \ //       r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
    \ //       r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
    \ //       r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
    \ //       r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
    \ //       r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
    \ //       r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
    \ //       r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
    \ //       r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
    \ //       r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
    \ //       r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
    \ //       r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
    \
    \ // output r0 = { p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
    \ //        r1 = { p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
    \ //        r2 = { p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
    \ //        r3 = { p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
    \ //        r4 = { p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
    \ //        r5 = { p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
    \ //        r6 = { p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
    \ //        r7 = { p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
    \ //        r8 = { p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
    \ //        r9 = { p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
    \ //        r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
    \ //        r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
    \ //        r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
    \ //        r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
    \ //        r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
    \ //        r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
    \
    \ // process top half
    vshufps _t0, _r0, _r1, 0x44 \ // t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
    vshufps _r0, _r0, _r1, 0xEE \ // r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
    vshufps _t1, _r2, _r3, 0x44 \ // t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
    vshufps _r2, _r2, _r3, 0xEE \ // r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
    \
    vshufps _r3, _t0, _t1, 0xDD \ // r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
    vshufps _r1, _r0, _r2, 0x88 \ // r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
    vshufps _r0, _r0, _r2, 0xDD \ // r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
    vshufps _t0, _t0, _t1, 0x88 \ // t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
    \
    \ // use r2 in place of t0
    vshufps _r2, _r4, _r5, 0x44 \ // r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
    vshufps _r4, _r4, _r5, 0xEE \ // r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
    vshufps _t1, _r6, _r7, 0x44 \ // t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
    vshufps _r6, _r6, _r7, 0xEE \ // r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
    \
    vshufps _r7, _r2, _t1, 0xDD \ // r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
    vshufps _r5, _r4, _r6, 0x88 \ // r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
    vshufps _r4, _r4, _r6, 0xDD \ // r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
    vshufps _r2, _r2, _t1, 0x88 \ // r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
    \
    \ // use r6 in place of t0
    vshufps _r6, _r8, _r9, 0x44 \ // r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
    vshufps _r8, _r8, _r9, 0xEE \ // r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
    vshufps _t1, _r10, _r11, 0x44 \ // t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
    vshufps _r10, _r10, _r11, 0xEE \ // r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
    \
    vshufps _r11, _r6, _t1, 0xDD \ // r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
    vshufps _r9, _r8, _r10, 0x88 \ // r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
    vshufps _r8, _r8, _r10, 0xDD \ // r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
    vshufps _r6, _r6, _t1, 0x88 \ // r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
    \
    \ // use r10 in place of t0
    vshufps _r10, _r12, _r13, 0x44 \ // r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
    vshufps _r12, _r12, _r13, 0xEE \ // r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
    vshufps _t1, _r14, _r15, 0x44 \ // t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
    vshufps _r14, _r14, _r15, 0xEE \ // r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
    \
    vshufps _r15, _r10, _t1, 0xDD \ // r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
    vshufps _r13, _r12, _r14, 0x88 \ // r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
    vshufps _r12, _r12, _r14, 0xDD \ // r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
    vshufps _r10, _r10, _t1, 0x88 \ // r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
    \
    \ // At this point, the registers that contain interesting data are:
    \ // t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
    \ // Can use t1 and r14 as scratch registers
    LEAQ PSHUFFLE_TRANSPOSE16_MASK1<>(SB), BX \
    LEAQ PSHUFFLE_TRANSPOSE16_MASK2<>(SB), R8 \
    \
    vmovdqu32 _r14, [rbx] \
    vpermi2q _r14, _t0, _r2 \ // r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
    vmovdqu32 _t1, [r8] \
    vpermi2q _t1, _t0, _r2 \ // t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
    \
    vmovdqu32 _r2, [rbx] \
    vpermi2q _r2, _r3, _r7 \ // r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
    vmovdqu32 _t0, [r8] \
    vpermi2q _t0, _r3, _r7 \ // t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
    \
    vmovdqu32 _r3, [rbx] \
    vpermi2q _r3, _r1, _r5 \ // r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
    vmovdqu32 _r7, [r8] \
    vpermi2q _r7, _r1, _r5 \ // r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
    \
    vmovdqu32 _r1, [rbx] \
    vpermi2q _r1, _r0, _r4 \ // r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
    vmovdqu32 _r5, [r8] \
    vpermi2q _r5, _r0, _r4 \ // r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
    \
    vmovdqu32 _r0, [rbx] \
    vpermi2q _r0, _r6, _r10 \ // r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
    vmovdqu32 _r4, [r8] \
    vpermi2q _r4, _r6, _r10 \ // r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
    \
    vmovdqu32 _r6, [rbx] \
    vpermi2q _r6, _r11, _r15 \ // r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
    vmovdqu32 _r10, [r8] \
    vpermi2q _r10, _r11, _r15 \ // r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
    \
    vmovdqu32 _r11, [rbx] \
    vpermi2q _r11, _r9, _r13 \ // r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
    vmovdqu32 _r15, [r8] \
    vpermi2q _r15, _r9, _r13 \ // r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
    \
    vmovdqu32 _r9, [rbx] \
    vpermi2q _r9, _r8, _r12 \ // r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
    vmovdqu32 _r13, [r8] \
    vpermi2q _r13, _r8, _r12 \ // r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
    \
    \ // At this point r8 and r12 can be used as scratch registers
    vshuff64x2 _r8, _r14, _r0, 0xEE \ // r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
    vshuff64x2 _r0, _r14, _r0, 0x44 \ // r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
    \
    vshuff64x2 _r12, _t1, _r4, 0xEE \ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
    vshuff64x2 _r4, _t1, _r4, 0x44 \ // r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
    \
    vshuff64x2 _r14, _r7, _r15, 0xEE \ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
    vshuff64x2 _t1, _r7, _r15, 0x44 \ // t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
    \
    vshuff64x2 _r15, _r5, _r13, 0xEE \ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
    vshuff64x2 _r7, _r5, _r13, 0x44 \ // r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
    \
    vshuff64x2 _r13, _t0, _r10, 0xEE \ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
    vshuff64x2 _r5, _t0, _r10, 0x44 \ // r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
    \
    vshuff64x2 _r10, _r3, _r11, 0xEE \ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
    vshuff64x2 _t0, _r3, _r11, 0x44 \ // t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
    \
    vshuff64x2 _r11, _r1, _r9, 0xEE \ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
    vshuff64x2 _r3, _r1, _r9, 0x44 \ // r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
    \
    vshuff64x2 _r9, _r2, _r6, 0xEE \ // r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
    vshuff64x2 _r1, _r2, _r6, 0x44 \ // r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
    \
    vmovdqu32 _r2, _t0 \ // r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
    vmovdqu32 _r6, _t1 \ // r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}

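// For reference: vshufps dst, src1, src2, imm selects, within each 128-bit
// lane, two dwords from src1 (imm bits 3:0) into the low half of dst and two
// dwords from src2 (imm bits 7:4) into the high half. Hence 0x44 picks
// elements {1,0} of both sources, 0xEE picks {3,2}, 0x88 picks {2,0} and
// 0xDD picks {3,1}, which is what interleaves the dword columns in the first
// stage above. vpermi2q with the MASK1/MASK2 index tables and vshuff64x2
// then shuffle the 128-bit quarters across the full 512-bit registers.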

// CH(E, F, G) = (E&F) ^ (~E&G)
// MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
// SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
// SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
// sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
// sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
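//
// The vpternlogd immediates used below encode these functions as 8-bit truth
// tables, indexed by the bit triple (first, second, third operand):
//   CH   = E ? F : G          -> 11001010b = 0xCA
//   MAJ  = at least two set   -> 11101000b = 0xE8
//   XOR3 = x ^ y ^ z (parity) -> 10010110b = 0x96
// e.g. for 0xCA, index 0b110 (E=1, F=1, G=0) selects F = 1, so bit 6 is set.
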
// Main processing loop per round
#define PROCESS_LOOP(_WT, _ROUND, _A, _B, _C, _D, _E, _F, _G, _H) \
    \ // T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
    \ // T2 = SIGMA0(A) + MAJ(A, B, C)
    \ // H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
    \
    \ // H becomes T2, then add T1 for A
    \ // D becomes D + T1 for E
    \
    vpaddd T1, _H, TMP3 \ // T1 = H + Kt
    vmovdqu32 TMP0, _E \
    vprord TMP1, _E, 6 \ // ROR_6(E)
    vprord TMP2, _E, 11 \ // ROR_11(E)
    vprord TMP3, _E, 25 \ // ROR_25(E)
    vpternlogd TMP0, _F, _G, 0xCA \ // TMP0 = CH(E,F,G)
    vpaddd T1, T1, _WT \ // T1 = T1 + Wt
    vpternlogd TMP1, TMP2, TMP3, 0x96 \ // TMP1 = SIGMA1(E)
    vpaddd T1, T1, TMP0 \ // T1 = T1 + CH(E,F,G)
    vpaddd T1, T1, TMP1 \ // T1 = T1 + SIGMA1(E)
    vpaddd _D, _D, T1 \ // D = D + T1
    \
    vprord _H, _A, 2 \ // ROR_2(A)
    vprord TMP2, _A, 13 \ // ROR_13(A)
    vprord TMP3, _A, 22 \ // ROR_22(A)
    vmovdqu32 TMP0, _A \
    vpternlogd TMP0, _B, _C, 0xE8 \ // TMP0 = MAJ(A,B,C)
    vpternlogd _H, TMP2, TMP3, 0x96 \ // H(T2) = SIGMA0(A)
    vpaddd _H, _H, TMP0 \ // H(T2) = SIGMA0(A) + MAJ(A,B,C)
    vpaddd _H, _H, T1 \ // H(A) = H(T2) + T1
    \
    vmovdqu32 TMP3, [TBL + ((_ROUND+1)*64)] \ // Next Kt


#define MSG_SCHED_ROUND_16_63(_WT, _WTp1, _WTp9, _WTp14) \
    vprord TMP4, _WTp14, 17 \ // ROR_17(Wt-2)
    vprord TMP5, _WTp14, 19 \ // ROR_19(Wt-2)
    vpsrld TMP6, _WTp14, 10 \ // SHR_10(Wt-2)
    vpternlogd TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma1(Wt-2)
    \
    vpaddd _WT, _WT, TMP4 \ // Wt = Wt-16 + sigma1(Wt-2)
    vpaddd _WT, _WT, _WTp9 \ // Wt = Wt-16 + sigma1(Wt-2) + Wt-7
    \
    vprord TMP4, _WTp1, 7 \ // ROR_7(Wt-15)
    vprord TMP5, _WTp1, 18 \ // ROR_18(Wt-15)
    vpsrld TMP6, _WTp1, 3 \ // SHR_3(Wt-15)
    vpternlogd TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma0(Wt-15)
    \
    vpaddd _WT, _WT, TMP4 \ // Wt = Wt-16 + sigma1(Wt-2) +
    \ //      Wt-7 + sigma0(Wt-15)


// Note this is reading in a block of data for one lane
// When all 16 lanes have been read, the data must be transposed to build the msg schedule
#define MSG_SCHED_ROUND_00_15(_WT, OFFSET, LABEL) \
    TESTQ $(1<<OFFSET), MASK_P9 \
    JE LABEL \
    MOVQ OFFSET*24(INPUT_P9), R9 \
    vmovups _WT, [inp0+IDX] \
LABEL: \

#define MASKED_LOAD(_WT, OFFSET, LABEL) \
    TESTQ $(1<<OFFSET), MASK_P9 \
    JE LABEL \
    MOVQ OFFSET*24(INPUT_P9), R9 \
    vmovups _WT, [inp0+IDX] \
LABEL: \

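// Note on the masked loads above: bit OFFSET of the scheduler mask selects
// whether lane OFFSET has an active input, and the OFFSET*24 stride matches
// the 24-byte layout of a Go slice header (pointer, len, cap), so the MOVQ
// picks up the data pointer of input slice OFFSET. This is an inference from
// the FP offsets used below, not something stated in this file.
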
TEXT ·sha256_x16_avx512(SB), 7, $0
    MOVQ digests+0(FP), STATE_P9
    MOVQ scratch+8(FP), SCRATCH_P9
    MOVQ mask_len+32(FP), INP_SIZE_P9 // number of blocks to process
    MOVQ mask+24(FP), MASKP_P9
    MOVQ (MASKP_P9), MASK_P9
    kmovq k1, mask
    LEAQ inputs+48(FP), INPUT_P9

    // Initialize digests
    vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE]

    MOVQ table+16(FP), TBL_P9

    xor IDX, IDX

    // Read in first block of input data
    MASKED_LOAD( W0, 0, skipInput0)
    MASKED_LOAD( W1, 1, skipInput1)
    MASKED_LOAD( W2, 2, skipInput2)
    MASKED_LOAD( W3, 3, skipInput3)
    MASKED_LOAD( W4, 4, skipInput4)
    MASKED_LOAD( W5, 5, skipInput5)
    MASKED_LOAD( W6, 6, skipInput6)
    MASKED_LOAD( W7, 7, skipInput7)
    MASKED_LOAD( W8, 8, skipInput8)
    MASKED_LOAD( W9, 9, skipInput9)
    MASKED_LOAD(W10, 10, skipInput10)
    MASKED_LOAD(W11, 11, skipInput11)
    MASKED_LOAD(W12, 12, skipInput12)
    MASKED_LOAD(W13, 13, skipInput13)
    MASKED_LOAD(W14, 14, skipInput14)
    MASKED_LOAD(W15, 15, skipInput15)

lloop:
    LEAQ PSHUFFLE_BYTE_FLIP_MASK<>(SB), TBL_P9
    vmovdqu32 TMP2, [TBL]

    // Get first K from table
    MOVQ table+16(FP), TBL_P9
    vmovdqu32 TMP3, [TBL]

    // Save digests for later addition
    vmovdqu32 [SCRATCH + 64*0], A
    vmovdqu32 [SCRATCH + 64*1], B
    vmovdqu32 [SCRATCH + 64*2], C
    vmovdqu32 [SCRATCH + 64*3], D
    vmovdqu32 [SCRATCH + 64*4], E
    vmovdqu32 [SCRATCH + 64*5], F
    vmovdqu32 [SCRATCH + 64*6], G
    vmovdqu32 [SCRATCH + 64*7], H

    add IDX, 64

    // Transpose input data
    TRANSPOSE16(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1)

    vpshufb W0, W0, TMP2
    vpshufb W1, W1, TMP2
    vpshufb W2, W2, TMP2
    vpshufb W3, W3, TMP2
    vpshufb W4, W4, TMP2
    vpshufb W5, W5, TMP2
    vpshufb W6, W6, TMP2
    vpshufb W7, W7, TMP2
    vpshufb W8, W8, TMP2
    vpshufb W9, W9, TMP2
    vpshufb W10, W10, TMP2
    vpshufb W11, W11, TMP2
    vpshufb W12, W12, TMP2
    vpshufb W13, W13, TMP2
    vpshufb W14, W14, TMP2
    vpshufb W15, W15, TMP2

    // MSG Schedule for W0-W15 is now complete in registers
    // Process first 48 rounds
    // Calculate next Wt+16 after processing is complete and Wt is unneeded

    PROCESS_LOOP( W0,  0, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0, W1, W9, W14)
    PROCESS_LOOP( W1,  1, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1, W2, W10, W15)
    PROCESS_LOOP( W2,  2, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2, W3, W11, W0)
    PROCESS_LOOP( W3,  3, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3, W4, W12, W1)
    PROCESS_LOOP( W4,  4, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4, W5, W13, W2)
    PROCESS_LOOP( W5,  5, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5, W6, W14, W3)
    PROCESS_LOOP( W6,  6, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6, W7, W15, W4)
    PROCESS_LOOP( W7,  7, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7, W8, W0, W5)
    PROCESS_LOOP( W8,  8, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8, W9, W1, W6)
    PROCESS_LOOP( W9,  9, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10, W2, W7)
    PROCESS_LOOP(W10, 10, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11, W3, W8)
    PROCESS_LOOP(W11, 11, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12, W4, W9)
    PROCESS_LOOP(W12, 12, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13, W5, W10)
    PROCESS_LOOP(W13, 13, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14, W6, W11)
    PROCESS_LOOP(W14, 14, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15, W7, W12)
    PROCESS_LOOP(W15, 15, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15, W0, W8, W13)
    PROCESS_LOOP( W0, 16, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0, W1, W9, W14)
    PROCESS_LOOP( W1, 17, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1, W2, W10, W15)
    PROCESS_LOOP( W2, 18, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2, W3, W11, W0)
    PROCESS_LOOP( W3, 19, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3, W4, W12, W1)
    PROCESS_LOOP( W4, 20, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4, W5, W13, W2)
    PROCESS_LOOP( W5, 21, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5, W6, W14, W3)
    PROCESS_LOOP( W6, 22, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6, W7, W15, W4)
    PROCESS_LOOP( W7, 23, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7, W8, W0, W5)
    PROCESS_LOOP( W8, 24, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8, W9, W1, W6)
    PROCESS_LOOP( W9, 25, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10, W2, W7)
    PROCESS_LOOP(W10, 26, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11, W3, W8)
    PROCESS_LOOP(W11, 27, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12, W4, W9)
    PROCESS_LOOP(W12, 28, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13, W5, W10)
    PROCESS_LOOP(W13, 29, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14, W6, W11)
    PROCESS_LOOP(W14, 30, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15, W7, W12)
    PROCESS_LOOP(W15, 31, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15, W0, W8, W13)
    PROCESS_LOOP( W0, 32, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0, W1, W9, W14)
    PROCESS_LOOP( W1, 33, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1, W2, W10, W15)
    PROCESS_LOOP( W2, 34, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2, W3, W11, W0)
    PROCESS_LOOP( W3, 35, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3, W4, W12, W1)
    PROCESS_LOOP( W4, 36, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4, W5, W13, W2)
    PROCESS_LOOP( W5, 37, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5, W6, W14, W3)
    PROCESS_LOOP( W6, 38, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6, W7, W15, W4)
    PROCESS_LOOP( W7, 39, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7, W8, W0, W5)
    PROCESS_LOOP( W8, 40, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8, W9, W1, W6)
    PROCESS_LOOP( W9, 41, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10, W2, W7)
    PROCESS_LOOP(W10, 42, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11, W3, W8)
    PROCESS_LOOP(W11, 43, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12, W4, W9)
    PROCESS_LOOP(W12, 44, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13, W5, W10)
    PROCESS_LOOP(W13, 45, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14, W6, W11)
    PROCESS_LOOP(W14, 46, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15, W7, W12)
    PROCESS_LOOP(W15, 47, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15, W0, W8, W13)

    // Check if this is the last block
    sub INP_SIZE, 1
    JE lastLoop

    // Load next mask for inputs
    ADDQ $8, MASKP_P9
    MOVQ (MASKP_P9), MASK_P9

    // Process last 16 rounds
    // Read in next block msg data for use in first 16 words of msg sched

    PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_00_15( W0, 0, skipNext0)
    PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_00_15( W1, 1, skipNext1)
    PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_00_15( W2, 2, skipNext2)
    PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_00_15( W3, 3, skipNext3)
    PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_00_15( W4, 4, skipNext4)
    PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_00_15( W5, 5, skipNext5)
    PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_00_15( W6, 6, skipNext6)
    PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_00_15( W7, 7, skipNext7)
    PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_00_15( W8, 8, skipNext8)
    PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_00_15( W9, 9, skipNext9)
    PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_00_15(W10, 10, skipNext10)
    PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_00_15(W11, 11, skipNext11)
    PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_00_15(W12, 12, skipNext12)
    PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_00_15(W13, 13, skipNext13)
    PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_00_15(W14, 14, skipNext14)
    PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_00_15(W15, 15, skipNext15)

    // Add old digest
    vmovdqu32 TMP2, A
    vmovdqu32 A, [SCRATCH + 64*0]
    vpaddd A{k1}, A, TMP2
    vmovdqu32 TMP2, B
    vmovdqu32 B, [SCRATCH + 64*1]
    vpaddd B{k1}, B, TMP2
    vmovdqu32 TMP2, C
    vmovdqu32 C, [SCRATCH + 64*2]
    vpaddd C{k1}, C, TMP2
    vmovdqu32 TMP2, D
    vmovdqu32 D, [SCRATCH + 64*3]
    vpaddd D{k1}, D, TMP2
    vmovdqu32 TMP2, E
    vmovdqu32 E, [SCRATCH + 64*4]
    vpaddd E{k1}, E, TMP2
    vmovdqu32 TMP2, F
    vmovdqu32 F, [SCRATCH + 64*5]
    vpaddd F{k1}, F, TMP2
    vmovdqu32 TMP2, G
    vmovdqu32 G, [SCRATCH + 64*6]
    vpaddd G{k1}, G, TMP2
    vmovdqu32 TMP2, H
    vmovdqu32 H, [SCRATCH + 64*7]
    vpaddd H{k1}, H, TMP2

    kmovq k1, mask
    JMP lloop

lastLoop:
    // Process last 16 rounds
    PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
    PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
    PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
    PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
    PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
    PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
    PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
    PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
    PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
    PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
    PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
    PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
    PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
    PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
    PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
    PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)

    // Add old digest
    vmovdqu32 TMP2, A
    vmovdqu32 A, [SCRATCH + 64*0]
    vpaddd A{k1}, A, TMP2
    vmovdqu32 TMP2, B
    vmovdqu32 B, [SCRATCH + 64*1]
    vpaddd B{k1}, B, TMP2
    vmovdqu32 TMP2, C
    vmovdqu32 C, [SCRATCH + 64*2]
    vpaddd C{k1}, C, TMP2
    vmovdqu32 TMP2, D
    vmovdqu32 D, [SCRATCH + 64*3]
    vpaddd D{k1}, D, TMP2
    vmovdqu32 TMP2, E
    vmovdqu32 E, [SCRATCH + 64*4]
    vpaddd E{k1}, E, TMP2
    vmovdqu32 TMP2, F
    vmovdqu32 F, [SCRATCH + 64*5]
    vpaddd F{k1}, F, TMP2
    vmovdqu32 TMP2, G
    vmovdqu32 G, [SCRATCH + 64*6]
    vpaddd G{k1}, G, TMP2
    vmovdqu32 TMP2, H
    vmovdqu32 H, [SCRATCH + 64*7]
    vpaddd H{k1}, H, TMP2

    // Write out digest
    vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A
    vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B
    vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C
    vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D
    vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E
    vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F
    vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G
    vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H

    VZEROUPPER
    RET

//
// Tables
//

DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x000(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x008(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x010(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x018(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x020(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x028(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x030(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x038(SB)/8, $0x0c0d0e0f08090a0b
GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), 8, $64
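
// PSHUFFLE_BYTE_FLIP_MASK reverses the bytes within each 32-bit word
// (shuffle indices 3,2,1,0 per dword), converting the little-endian loads
// into the big-endian word order SHA-256 requires; it is loaded into TMP2
// at lloop and applied with the vpshufb sequence after the transpose.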

DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x000(SB)/8, $0x0000000000000000
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x008(SB)/8, $0x0000000000000001
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x010(SB)/8, $0x0000000000000008
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x018(SB)/8, $0x0000000000000009
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x020(SB)/8, $0x0000000000000004
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x028(SB)/8, $0x0000000000000005
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x030(SB)/8, $0x000000000000000C
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x038(SB)/8, $0x000000000000000D
GLOBL PSHUFFLE_TRANSPOSE16_MASK1<>(SB), 8, $64

DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x000(SB)/8, $0x0000000000000002
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x008(SB)/8, $0x0000000000000003
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x010(SB)/8, $0x000000000000000A
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x018(SB)/8, $0x000000000000000B
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x020(SB)/8, $0x0000000000000006
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x028(SB)/8, $0x0000000000000007
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x030(SB)/8, $0x000000000000000E
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x038(SB)/8, $0x000000000000000F
GLOBL PSHUFFLE_TRANSPOSE16_MASK2<>(SB), 8, $64
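
// The FP offsets in the prologue above (digests+0, scratch+8, table+16,
// mask+24 with mask_len+32, inputs+48) are consistent with a Go declaration
// along these lines (a reconstruction from those offsets, not part of this
// file):
//
//     //go:noescape
//     func sha256_x16_avx512(digests *[512]byte, scratch *[512]byte,
//         table *[512]uint64, mask []uint64, inputs [16][]byte)
//
// i.e. 8 rows x 64 bytes of digest state, a same-sized scratch area, a round
// table with each Kt broadcast across 64 bytes, a per-block lane mask, and
// one input slice (24-byte header) per lane.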