| author | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
|---|---|---|
| committer | Rutger Broekhoff | 2023-12-29 21:31:53 +0100 |
| commit | 404aeae4545d2426c089a5f8d5e82dae56f5212b (patch) | |
| tree | 2d84e00af272b39fc04f3795ae06bc48970e57b5 /vendor/github.com/minio/md5-simd/block_amd64.go | |
| parent | 209d8b0187ed025dec9ac149ebcced3462877bff (diff) | |
Make Nix builds work
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block_amd64.go')
| -rw-r--r-- | vendor/github.com/minio/md5-simd/block_amd64.go | 210 |
1 file changed, 210 insertions, 0 deletions
diff --git a/vendor/github.com/minio/md5-simd/block_amd64.go b/vendor/github.com/minio/md5-simd/block_amd64.go
new file mode 100644
index 0000000..16edda2
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block_amd64.go
@@ -0,0 +1,210 @@
```go
//+build !noasm,!appengine,gc

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

package md5simd

import (
	"fmt"
	"math"
	"unsafe"

	"github.com/klauspost/cpuid/v2"
)

var hasAVX512 bool

func init() {
	// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
	hasAVX512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
}

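// block8 is implemented in assembly (AVX2, 8 lanes). The calling convention
// here is inferred from the blockMd5_avx2 caller below, not from a documented
// ABI: state points at the four contiguous [8]uint32 digest vectors, bufs
// holds per-lane byte offsets relative to base, cache is 32-byte-aligned
// scratch, and n is 64 bytes times the number of rounds to run.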
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)

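// block16 is the 16-lane AVX512 variant. Per the blockMd5_avx512 caller
// below, mask flags the lanes that remain active during this set of rounds.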
//go:noescape
func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int)

// 8-way 4x uint32 digests in 4 ymm registers
// (ymm0, ymm1, ymm2, ymm3)
type digest8 struct {
	v0, v1, v2, v3 [8]uint32
}

// Stack cache for 8x64 byte md5.BlockSize bytes.
// Must be 32-byte aligned, so allocate 512+32 and
// align upwards at runtime.
type cache8 [512 + 32]byte
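
// Aligning upwards at runtime would look roughly like this (hypothetical
// sketch, with c a cache8 value):
//
//	p := uintptr(unsafe.Pointer(&c[0]))
//	p = (p + 31) &^ 31 // round up to the next 32-byte boundary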

// MD5 magic numbers for one lane of hashing; inflated
// 8x below at init time.
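// These are the standard T-table constants from RFC 1321,
// T[i] = floor(2^32 * |sin(i+1)|) for zero-based i.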
var md5consts = [64]uint32{
	0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
	0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
	0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
	0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
	0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
	0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
	0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
	0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
	0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
	0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
	0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
	0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
	0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
	0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
	0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
	0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
}

// inflate the consts 8-way for 8x md5 (256 bit ymm registers)
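// Repeating each constant 8 times lets a round load its constant as one full
// 256-bit vector with the same value in every lane (presumably saving a
// per-round broadcast in the assembly; the 16-way table below is analogous).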
var avx256md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 8*len(c))
	for i := range c {
		for j := 0; j < 8; j++ {
			inf[(i*8)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])

// 16-way 4x uint32 digests in 4 zmm registers
type digest16 struct {
	v0, v1, v2, v3 [16]uint32
}

// inflate the consts 16-way for 16x md5 (512 bit zmm registers)
var avx512md5consts = func(c []uint32) []uint32 {
	inf := make([]uint32, 16*len(c))
	for i := range c {
		for j := 0; j < 16; j++ {
			inf[(i*16)+j] = c[i]
		}
	}
	return inf
}(md5consts[:])

// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
	if hasAVX512 {
		blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
		return
	}

	// Preparing data using copy is slower since copies aren't inlined.

	// Calculate on this goroutine
	if half {
		for i := range s.i8[0][:] {
			s.i8[0][i] = input[i]
		}
		for i := range s.d8a.v0[:] {
			s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
		}
		blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
		for i := range s.d8a.v0[:] {
			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
		}
		return
	}

	for i := range s.i8[0][:] {
		s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
	}

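	// Split the 16 lanes across two 8-wide states: d8a takes lanes 0-7,
	// d8b takes lanes 8-15 ((i+8)&15 folds the index back into range).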
	for i := range s.d8a.v0[:] {
		j := (i + 8) & 15
		s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
		s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
	}

	// Benchmarks appear to be slightly faster when spinning up 2 goroutines
	// instead of using the current goroutine for one of the blocks.
	s.wg.Add(2)
	go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
	go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
	s.wg.Wait()
	for i := range s.d8a.v0[:] {
		d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
	}
	for i := range s.d8b.v0[:] {
		j := (i + 8) & 15
		d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
	}
}

// Interface function to AVX512 assembly code
func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0]))))
	ptrs := [16]int32{}

	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
			}

			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create copy of initial states to receive intermediate updates

	rounds := generateMaskAndRounds16(input, maskRounds)

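	// Each entry of maskRounds describes a run of 64-byte blocks plus a
	// bitmask of the lanes that still have input during that run.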
	for r := 0; r < rounds; r++ {
		m := maskRounds[r]

		block16(&sdup.v0[0], uintptr(baseMin), &ptrs[0], m.mask, int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
			if m.mask&(1<<j) != 0 { // update digest if still masked as active
				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}

// Interface function to AVX2 assembly code
func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
	baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) - 4
	ptrs := [8]int32{}

	for i := range ptrs {
		if len(input[i]) > 0 {
			if len(input[i]) > internalBlockSize {
				panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
			}

			off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
			if off > math.MaxUint32 {
				panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
			}
			ptrs[i] = int32(off)
		}
	}

	sdup := *s // create copy of initial states to receive intermediate updates

	rounds := generateMaskAndRounds8(input, maskRounds)

	for r := 0; r < rounds; r++ {
		m := maskRounds[r]
		var cache cache8 // stack storage for block8 tmp state
		block8(&sdup.v0[0], uintptr(baseMin), &ptrs[0], &cache[0], int(64*m.rounds))

		for j := 0; j < len(ptrs); j++ {
			ptrs[j] += int32(64 * m.rounds) // update pointers for next round
			if m.mask&(1<<j) != 0 { // update digest if still masked as active
				(*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
			}
		}
	}
}
```
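
generateMaskAndRounds8 and generateMaskAndRounds16 live elsewhere in this
package, so only the shape of their output is visible here. As a rough,
self-contained illustration of the idea (a sketch under assumed semantics, not
the library's actual implementation; toyMaskRounds and toyGenerate are
hypothetical names), the following toy groups lanes of differing lengths into
runs that share a block count and an active-lane mask, mirroring how the loops
above consume m.mask and m.rounds:

```go
package main

import (
	"fmt"
	"sort"
)

// toyMaskRounds mimics the maskRounds entries consumed above: bit j of mask
// marks lane j as active, and rounds is the number of 64-byte blocks every
// active lane processes during this run.
type toyMaskRounds struct {
	mask   uint64
	rounds uint64
}

// toyGenerate walks the distinct per-lane block counts in increasing order
// and emits one (mask, rounds) entry per step.
func toyGenerate(lengths []int) []toyMaskRounds {
	blocks := make([]int, len(lengths))
	distinct := map[int]bool{}
	for i, n := range lengths {
		blocks[i] = n / 64 // whole 64-byte blocks in lane i
		if blocks[i] > 0 {
			distinct[blocks[i]] = true
		}
	}
	steps := make([]int, 0, len(distinct))
	for b := range distinct {
		steps = append(steps, b)
	}
	sort.Ints(steps)

	var out []toyMaskRounds
	done := 0
	for _, s := range steps {
		var mask uint64
		for j, b := range blocks {
			if b >= s {
				mask |= 1 << j // lane j still has data through this step
			}
		}
		out = append(out, toyMaskRounds{mask: mask, rounds: uint64(s - done)})
		done = s
	}
	return out
}

func main() {
	// Lanes of 2, 2 and 5 blocks: the first run covers 2 blocks on all three
	// lanes (mask 0b111), the second runs 3 more on lane 2 alone (mask 0b100).
	fmt.Println(toyGenerate([]int{128, 128, 320})) // [{7 2} {4 3}]
}
```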