1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
|
//+build !noasm,!appengine,gc
// SHA intrinsic version of SHA256
// Kristofer Peterson, (C) 2018.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "textflag.h"
// K holds the 64 SHA-256 round constants as consecutive little-endian 32-bit
// words (256 bytes in total). The entries are grouped in fours because
// blockIntelSha fetches them 16 bytes at a time, one group per four rounds.

// K[0..3]
DATA K<>+0(SB)/4, $0x428a2f98
DATA K<>+4(SB)/4, $0x71374491
DATA K<>+8(SB)/4, $0xb5c0fbcf
DATA K<>+12(SB)/4, $0xe9b5dba5
// K[4..7]
DATA K<>+16(SB)/4, $0x3956c25b
DATA K<>+20(SB)/4, $0x59f111f1
DATA K<>+24(SB)/4, $0x923f82a4
DATA K<>+28(SB)/4, $0xab1c5ed5
// K[8..11]
DATA K<>+32(SB)/4, $0xd807aa98
DATA K<>+36(SB)/4, $0x12835b01
DATA K<>+40(SB)/4, $0x243185be
DATA K<>+44(SB)/4, $0x550c7dc3
// K[12..15]
DATA K<>+48(SB)/4, $0x72be5d74
DATA K<>+52(SB)/4, $0x80deb1fe
DATA K<>+56(SB)/4, $0x9bdc06a7
DATA K<>+60(SB)/4, $0xc19bf174
// K[16..19]
DATA K<>+64(SB)/4, $0xe49b69c1
DATA K<>+68(SB)/4, $0xefbe4786
DATA K<>+72(SB)/4, $0x0fc19dc6
DATA K<>+76(SB)/4, $0x240ca1cc
// K[20..23]
DATA K<>+80(SB)/4, $0x2de92c6f
DATA K<>+84(SB)/4, $0x4a7484aa
DATA K<>+88(SB)/4, $0x5cb0a9dc
DATA K<>+92(SB)/4, $0x76f988da
// K[24..27]
DATA K<>+96(SB)/4, $0x983e5152
DATA K<>+100(SB)/4, $0xa831c66d
DATA K<>+104(SB)/4, $0xb00327c8
DATA K<>+108(SB)/4, $0xbf597fc7
// K[28..31]
DATA K<>+112(SB)/4, $0xc6e00bf3
DATA K<>+116(SB)/4, $0xd5a79147
DATA K<>+120(SB)/4, $0x06ca6351
DATA K<>+124(SB)/4, $0x14292967
// K[32..35]
DATA K<>+128(SB)/4, $0x27b70a85
DATA K<>+132(SB)/4, $0x2e1b2138
DATA K<>+136(SB)/4, $0x4d2c6dfc
DATA K<>+140(SB)/4, $0x53380d13
// K[36..39]
DATA K<>+144(SB)/4, $0x650a7354
DATA K<>+148(SB)/4, $0x766a0abb
DATA K<>+152(SB)/4, $0x81c2c92e
DATA K<>+156(SB)/4, $0x92722c85
// K[40..43]
DATA K<>+160(SB)/4, $0xa2bfe8a1
DATA K<>+164(SB)/4, $0xa81a664b
DATA K<>+168(SB)/4, $0xc24b8b70
DATA K<>+172(SB)/4, $0xc76c51a3
// K[44..47]
DATA K<>+176(SB)/4, $0xd192e819
DATA K<>+180(SB)/4, $0xd6990624
DATA K<>+184(SB)/4, $0xf40e3585
DATA K<>+188(SB)/4, $0x106aa070
// K[48..51]
DATA K<>+192(SB)/4, $0x19a4c116
DATA K<>+196(SB)/4, $0x1e376c08
DATA K<>+200(SB)/4, $0x2748774c
DATA K<>+204(SB)/4, $0x34b0bcb5
// K[52..55]
DATA K<>+208(SB)/4, $0x391c0cb3
DATA K<>+212(SB)/4, $0x4ed8aa4a
DATA K<>+216(SB)/4, $0x5b9cca4f
DATA K<>+220(SB)/4, $0x682e6ff3
// K[56..59]
DATA K<>+224(SB)/4, $0x748f82ee
DATA K<>+228(SB)/4, $0x78a5636f
DATA K<>+232(SB)/4, $0x84c87814
DATA K<>+236(SB)/4, $0x8cc70208
// K[60..63]
DATA K<>+240(SB)/4, $0x90befffa
DATA K<>+244(SB)/4, $0xa4506ceb
DATA K<>+248(SB)/4, $0xbef9a3f7
DATA K<>+252(SB)/4, $0xc67178f2

GLOBL K<>(SB), RODATA|NOPTR, $256
// SHUF_MASK is a 16-byte PSHUFB control word. In memory it reads
// 03 02 01 00 | 07 06 05 04 | 0b 0a 09 08 | 0f 0e 0d 0c, i.e. it reverses the
// byte order inside each 32-bit lane. blockIntelSha applies it to every
// 16-byte quarter of an input block right after loading it.
DATA SHUF_MASK<>+0(SB)/4, $0x00010203
DATA SHUF_MASK<>+4(SB)/4, $0x04050607
DATA SHUF_MASK<>+8(SB)/4, $0x08090a0b
DATA SHUF_MASK<>+12(SB)/4, $0x0c0d0e0f
GLOBL SHUF_MASK<>(SB), RODATA|NOPTR, $16
// blockIntelSha folds every complete 64-byte block of message into the
// SHA-256 state at h, using the x86 SHA extensions (SHA256RNDS2, SHA256MSG1,
// SHA256MSG2) together with PSHUFB/PALIGNR and PBLENDW. The SHA, PALIGNR and
// PBLENDW instructions are emitted as raw LONG/WORD encodings; the comment
// next to each encoding names the instruction it stands for.
//
// Arguments (32 bytes on the caller's frame, no results):
//   h+0(FP)             pointer to the eight 32-bit state words A..H
//   message_base+8(FP)  pointer to the input bytes
//   message_len+16(FP)  input length in bytes; a trailing partial block is
//                       not processed
//
// NOTE(review): the matching Go declaration is not part of this file; it is
// presumably func blockIntelSha(h *[8]uint32, message []uint8) - confirm.
// NOTE(review): nothing here checks that the CPU supports these
// instructions; presumably the caller only dispatches here after feature
// detection - confirm.
//
// Inputs shorter than one block return immediately and leave h untouched.
// Without that guard, message_base + message_len - 64 wraps around for a nil
// slice (base 0, length 0), the unsigned loop test then succeeds and the
// first block load faults.
//
// Register usage
//   BX      base address of the round constant table K (constant)
//   DX      h (constant)
//   SI      address of the block being processed
//   DI      message_base + message_len - 64: the last address at which a
//           whole block still fits (constant)
//   X0      round constants plus message words for the current four rounds
//           (implicit third operand of SHA256RNDS2)
//   X1      scratch
//   X2      working hash state, ABEF
//   X3      working hash state, CDGH
//   X4      first 16 bytes of the block, later reused for schedule words
//   X5      second 16 bytes of the block, later reused for schedule words
//   X6      third 16 bytes of the block, later reused for schedule words
//   X7      fourth 16 bytes of the block, later reused for schedule words
//   X12     saved hash state, ABEF
//   X13     saved hash state, CDGH
//   X15     data shuffle mask (constant)

// ROUNDpqr performs four rounds. On entry X0 must already hold the four round
// constants for those rounds. The macro
//   - adds the message words in Xq to X0 and runs 2 + 2 rounds
//     (PSHUFD $0x4e swaps the 64-bit halves of X0 between the two pairs),
//   - completes the schedule words in Xr (PALIGNR + PADDL + SHA256MSG2),
//   - starts the next schedule update of Xp (SHA256MSG1).
// Clobbers X0 and X1.
#define ROUND456 \
    PADDL  X5, X0                    \
    LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
    MOVO   X5, X1                    \
    LONG   $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1, XMM4, 4
    PADDL  X1, X6                    \
    LONG   $0xf5cd380f               \ // SHA256MSG2 XMM6, XMM5
    PSHUFD $0x4e, X0, X0             \
    LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
    LONG   $0xe5cc380f               // SHA256MSG1 XMM4, XMM5

#define ROUND567 \
    PADDL  X6, X0                    \
    LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
    MOVO   X6, X1                    \
    LONG   $0x0f3a0f66; WORD $0x04cd \ // PALIGNR XMM1, XMM5, 4
    PADDL  X1, X7                    \
    LONG   $0xfecd380f               \ // SHA256MSG2 XMM7, XMM6
    PSHUFD $0x4e, X0, X0             \
    LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
    LONG   $0xeecc380f               // SHA256MSG1 XMM5, XMM6

#define ROUND674 \
    PADDL  X7, X0                    \
    LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
    MOVO   X7, X1                    \
    LONG   $0x0f3a0f66; WORD $0x04ce \ // PALIGNR XMM1, XMM6, 4
    PADDL  X1, X4                    \
    LONG   $0xe7cd380f               \ // SHA256MSG2 XMM4, XMM7
    PSHUFD $0x4e, X0, X0             \
    LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
    LONG   $0xf7cc380f               // SHA256MSG1 XMM6, XMM7

#define ROUND745 \
    PADDL  X4, X0                    \
    LONG   $0xdacb380f               \ // SHA256RNDS2 XMM3, XMM2
    MOVO   X4, X1                    \
    LONG   $0x0f3a0f66; WORD $0x04cf \ // PALIGNR XMM1, XMM7, 4
    PADDL  X1, X5                    \
    LONG   $0xeccd380f               \ // SHA256MSG2 XMM5, XMM4
    PSHUFD $0x4e, X0, X0             \
    LONG   $0xd3cb380f               \ // SHA256RNDS2 XMM2, XMM3
    LONG   $0xfccc380f               // SHA256MSG1 XMM7, XMM4

TEXT ·blockIntelSha(SB), NOSPLIT, $0-32
    MOVQ h+0(FP), DX
    MOVQ message_base+8(FP), SI
    MOVQ message_len+16(FP), DI

    // Fewer than 64 bytes: there is no whole block to hash, and computing
    // base + len - 64 below could wrap around (see header).
    CMPQ DI, $64
    JB   DONE

    // DI = address of the last position where a whole block still fits.
    LEAQ -64(SI)(DI*1), DI

    // Load A..H and regroup them into the ABEF / CDGH layout kept in X2 / X3.
    MOVOU     (DX), X2
    MOVOU     16(DX), X1
    MOVO      X2, X3
    PUNPCKLLQ X1, X2
    PUNPCKHLQ X1, X3
    PSHUFD    $0x27, X2, X2
    PSHUFD    $0x27, X3, X3

    MOVO SHUF_MASK<>(SB), X15
    LEAQ K<>(SB), BX

    JMP TEST

LOOP:
    // Keep the incoming state so it can be added back after round 63.
    MOVO X2, X12
    MOVO X3, X13

    // load block and shuffle
    MOVOU  (SI), X4
    MOVOU  16(SI), X5
    MOVOU  32(SI), X6
    MOVOU  48(SI), X7
    PSHUFB X15, X4
    PSHUFB X15, X5
    PSHUFB X15, X6
    PSHUFB X15, X7

    // rounds 0-3
    MOVO   (BX), X0
    PADDL  X4, X0
    LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
    PSHUFD $0x4e, X0, X0
    LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3

    // rounds 4-7
    MOVO   1*16(BX), X0
    PADDL  X5, X0
    LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
    PSHUFD $0x4e, X0, X0
    LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3
    LONG   $0xe5cc380f               // SHA256MSG1 XMM4, XMM5

    // rounds 8-11
    MOVO   2*16(BX), X0
    PADDL  X6, X0
    LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
    PSHUFD $0x4e, X0, X0
    LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3
    LONG   $0xeecc380f               // SHA256MSG1 XMM5, XMM6

    MOVO 3*16(BX), X0; ROUND674      // rounds 12-15
    MOVO 4*16(BX), X0; ROUND745      // rounds 16-19
    MOVO 5*16(BX), X0; ROUND456      // rounds 20-23
    MOVO 6*16(BX), X0; ROUND567      // rounds 24-27
    MOVO 7*16(BX), X0; ROUND674      // rounds 28-31
    MOVO 8*16(BX), X0; ROUND745      // rounds 32-35
    MOVO 9*16(BX), X0; ROUND456      // rounds 36-39
    MOVO 10*16(BX), X0; ROUND567     // rounds 40-43
    MOVO 11*16(BX), X0; ROUND674     // rounds 44-47
    MOVO 12*16(BX), X0; ROUND745     // rounds 48-51

    // rounds 52-55 (same as ROUND456 without the trailing SHA256MSG1)
    MOVO   13*16(BX), X0
    PADDL  X5, X0
    LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
    MOVO   X5, X1
    LONG   $0x0f3a0f66; WORD $0x04cc // PALIGNR XMM1, XMM4, 4
    PADDL  X1, X6
    LONG   $0xf5cd380f               // SHA256MSG2 XMM6, XMM5
    PSHUFD $0x4e, X0, X0
    LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3

    // rounds 56-59 (same as ROUND567 without the trailing SHA256MSG1)
    MOVO   14*16(BX), X0
    PADDL  X6, X0
    LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
    MOVO   X6, X1
    LONG   $0x0f3a0f66; WORD $0x04cd // PALIGNR XMM1, XMM5, 4
    PADDL  X1, X7
    LONG   $0xfecd380f               // SHA256MSG2 XMM7, XMM6
    PSHUFD $0x4e, X0, X0
    LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3

    // rounds 60-63
    MOVO   15*16(BX), X0
    PADDL  X7, X0
    LONG   $0xdacb380f               // SHA256RNDS2 XMM3, XMM2
    PSHUFD $0x4e, X0, X0
    LONG   $0xd3cb380f               // SHA256RNDS2 XMM2, XMM3

    // Add the state saved at the top of the iteration, then advance.
    PADDL X12, X2
    PADDL X13, X3
    ADDQ  $64, SI

TEST:
    // Unsigned compare: keep going while a whole block still fits at SI.
    CMPQ SI, DI
    JBE  LOOP

    // Convert ABEF / CDGH back to the A..H order used in memory and store.
    PSHUFD $0x4e, X3, X0
    LONG   $0x0e3a0f66; WORD $0xf0c2 // PBLENDW XMM0, XMM2, 0xf0
    PSHUFD $0x4e, X2, X1
    LONG   $0x0e3a0f66; WORD $0x0fcb // PBLENDW XMM1, XMM3, 0x0f
    PSHUFD $0x1b, X0, X0
    PSHUFD $0x1b, X1, X1
    MOVOU  X0, (DX)
    MOVOU  X1, 16(DX)

DONE:
    RET
|