author    Rutger Broekhoff    2023-12-29 21:31:53 +0100
committer Rutger Broekhoff    2023-12-29 21:31:53 +0100
commit    404aeae4545d2426c089a5f8d5e82dae56f5212b (patch)
tree      2d84e00af272b39fc04f3795ae06bc48970e57b5 /vendor/github.com/minio/md5-simd/block_amd64.go
parent    209d8b0187ed025dec9ac149ebcced3462877bff (diff)
Make Nix builds work
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block_amd64.go')
-rw-r--r--  vendor/github.com/minio/md5-simd/block_amd64.go  210
1 file changed, 210 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/minio/md5-simd/block_amd64.go b/vendor/github.com/minio/md5-simd/block_amd64.go
new file mode 100644
index 0000000..16edda2
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block_amd64.go
@@ -0,0 +1,210 @@
//+build !noasm,!appengine,gc

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

package md5simd

import (
	"fmt"
	"math"
	"unsafe"

	"github.com/klauspost/cpuid/v2"
)

var hasAVX512 bool

func init() {
	// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
	hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
}

//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)

//go:noescape
func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)

// 8-way 4x uint32 digests in 4 ymm registers
// (ymm0, ymm1, ymm2, ymm3)
type digest8 struct {
	v0, v1, v2, v3 [8]uint32
}

// Stack cache for 8 lanes of one md5.BlockSize (64-byte) block each,
// 512 bytes in total. Must be 32-byte aligned, so allocate 512+32 bytes
// and align upwards at runtime.
type cache8 [512 + 32]byte
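
// aligned32 is an illustrative, hypothetical helper (not part of the
// upstream file) restating the "over-allocate and round up" trick the
// comment above describes; in the code below the raw &cache[0] is handed
// to block8, which presumably does the actual rounding itself.
func aligned32(c *cache8) *byte {
	p := uintptr(unsafe.Pointer(&c[0]))
	p = (p + 31) &^ 31 // round up to the next 32-byte boundary
	return (*byte)(unsafe.Pointer(p))
}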

// MD5 magic numbers for one lane of hashing; inflated
// 8x below at init time.
var md5consts = [64]uint32{
	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
	0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
	0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
	0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
	0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
	0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
	0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
	0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
	0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
}

// inflate the consts 8-way for 8x md5 (256 bit ymm registers)
var avx256md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 8*len(c))
	for i := range c {
		for j := 0; j < 8; j++ {
			inf[(i*8)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])
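
// constForLane is an illustrative, hypothetical helper (not part of the
// upstream file): after inflation, round constant i is broadcast across 8
// consecutive lanes, so lane j of round i lives at avx256md5consts[i*8+j]
// (and at avx512md5consts[i*16+j] in the 16-way table below).
func constForLane(round, lane int) uint32 {
	return avx256md5consts[round*8+lane] // equals md5consts[round] for every lane
}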

// 16-way 4x uint32 digests in 4 zmm registers
type digest16 struct {
	v0, v1, v2, v3 [16]uint32
}

// inflate the consts 16-way for 16x md5 (512 bit zmm registers)
var avx512md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 16*len(c))
	for i := range c {
		for j := 0; j < 16; j++ {
			inf[(i*16)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])

// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
	if hasAVX512 {
		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
		return
	}

	// Preparing data using copy is slower since copies aren't inlined.

	// Calculate on this goroutine
	if half {
		for i := range s.i8[0][:] {
			s.i8[0][i] = input[i]
		}
		for i := range s.d8a.v0[:] {
			s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
		}
		blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
		for i := range s.d8a.v0[:] {
			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
		}
		return
	}

	for i := range s.i8[0][:] {
		s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
	}

	for i := range s.d8a.v0[:] {
		j := (i + 8) & 15
		s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
		s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
	}

	// Benchmarks appear to be slightly faster when spinning up 2 goroutines instead
	// of using the current goroutine for one of the blocks.
	s.wg.Add(2)
	go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
	go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
	s.wg.Wait()
	for i := range s.d8a.v0[:] {
		d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
	}
	for i := range s.d8b.v0[:] {
		j := (i + 8) & 15
		d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
	}
}
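
// splitLanes is an illustrative, hypothetical helper (not part of the
// upstream file) restating the lane mapping blockMd5_x16 uses on the AVX2
// fallback path: full lanes 0-7 become the first 8-wide half and lanes 8-15
// the second; merging results back uses the rotation j := (i + 8) & 15.
func splitLanes(input [16][]byte) (a, b [8][]byte) {
	for i := 0; i < 8; i++ {
		a[i] = input[i]   // first half: lanes 0-7
		b[i] = input[8+i] // second half: lanes 8-15
	}
	return a, b
}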

// Interface function to AVX512 assembly code
func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0]))))
	ptrs := [16]int32{}

	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
			}

			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create copy of initial states to receive intermediate updates

	rounds := generateMaskAndRounds16(input, maskRounds)

	for r := 0; r < rounds; r++ {
		m := maskRounds[r]

		block16(&sdup.v0[0], uintptr(baseMin), &ptrs[0], m.mask, int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}
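
// laneOffset is an illustrative, hypothetical helper (not part of the
// upstream file) isolating the addressing scheme used above: every lane
// buffer must live inside the shared base slice, so each lane is handed to
// the assembly as a 32-bit offset from &base[0] rather than a full pointer.
func laneOffset(base, lane []byte) int32 {
	off := uint64(uintptr(unsafe.Pointer(&lane[0]))) -
		uint64(uintptr(unsafe.Pointer(&base[0])))
	if off > math.MaxUint32 {
		panic("lane buffer does not live inside the base allocation")
	}
	return int32(off)
}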

// Interface function to AVX2 assembly code
func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) - 4
	ptrs := [8]int32{}

	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
			}

			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create copy of initial states to receive intermediate updates

	rounds := generateMaskAndRounds8(input, maskRounds)

	for r := 0; r < rounds; r++ {
		m := maskRounds[r]
		var cache cache8 // stack storage for block8 tmp state
		block8(&sdup.v0[0], uintptr(baseMin), &ptrs[0], &cache[0], int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
			if m.mask&(1<<j) != 0 {         // update digest if still masked as active
				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}
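
// Editorial note (not part of the upstream file): both functions above drive
// the assembly in "mask rounds". Each maskRounds entry, produced by
// generateMaskAndRounds8/generateMaskAndRounds16 elsewhere in this package,
// pairs a bitmask of still-active lanes (m.mask) with a number of 64-byte
// blocks (m.rounds) that all of those lanes can still hash, so the SIMD
// kernel runs once per entry rather than once per block. For example, lane
// inputs of 128, 128 and 64 bytes would yield two entries:
//
//	{mask: 0b111, rounds: 1} // all three lanes hash one block
//	{mask: 0b011, rounds: 1} // lane 2 exhausted; lanes 0-1 hash one more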