aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/minio/md5-simd/md5-server_amd64.go
blob: 94f741c545350bb5ea197ee9263436be0bd7d461 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
//+build !noasm,!appengine,gc

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

package md5simd

import (
	"encoding/binary"
	"fmt"
	"runtime"
	"sync"

	"github.com/klauspost/cpuid/v2"
)

// MD5 initialization constants
const (
	// Lanes is the number of concurrently calculated hashes.
	Lanes = 16

	// Standard MD5 initialization vector (words A..D of RFC 1321),
	// stored/loaded as little-endian uint32 throughout this file.
	init0 = 0x67452301
	init1 = 0xefcdab89
	init2 = 0x98badcfe
	init3 = 0x10325476

	// Use scalar routine when below this many lanes
	useScalarBelow = 3
)

// md5ServerUID - Does not start at 0 but next multiple of 16 so as to be able to
// differentiate with default initialisation value of 0
const md5ServerUID = Lanes

// buffersPerLane is the number of preallocated input buffers per lane,
// sliced out of md5Server.allBufs and recycled via the buffers channel.
const buffersPerLane = 3

// Message to send across input channel
type blockInput struct {
	uid   uint64         // unique id of the sending client
	msg   []byte         // input data; the server returns it to the buffer pool after hashing
	sumCh chan sumResult // when non-nil, msg is the final block and the digest is sent back here
	reset bool           // when true, discard any interim digest state stored for this uid
}

// sumResult carries a finished MD5 digest back to the requesting client.
type sumResult struct {
	digest [Size]byte
}

// lanesInfo holds the block queued for each lane in one processing cycle.
type lanesInfo [Lanes]blockInput

// md5Server - Type to implement parallel handling of MD5 invocations.
// All mutable fields are owned by the single process() goroutine;
// clients interact only through the cycle/newInput channels.
type md5Server struct {
	uidCounter   uint64                // last uid handed out to a client
	cycle        chan uint64           // client with uid has update.
	newInput     chan newClient        // Add new client.
	digests      map[uint64][Size]byte // Map of uids to (interim) digest results
	maskRounds16 [16]maskRounds        // Pre-allocated static array for max 16 rounds
	maskRounds8a [8]maskRounds         // Pre-allocated static array for max 8 rounds (1st AVX2 core)
	maskRounds8b [8]maskRounds         // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
	allBufs      []byte                // Preallocated buffer.
	buffers      chan []byte           // Preallocated buffers, sliced from allBufs.

	i8       [2][8][]byte   // avx2 temporary vars
	d8a, d8b digest8        // interim state for the two 8-lane AVX2 cores
	wg       sync.WaitGroup // waits for the scalar worker goroutines in blocks()
}

// NewServer - Create new object for parallel processing handling.
// Returns a scalar fallback server when the CPU does not support AVX2.
func NewServer() Server {
	if !cpuid.CPU.Supports(cpuid.AVX2) {
		return &fallbackServer{}
	}

	srv := &md5Server{
		digests:    make(map[uint64][Size]byte),
		newInput:   make(chan newClient, Lanes),
		cycle:      make(chan uint64, Lanes*10),
		uidCounter: md5ServerUID - 1,
		allBufs:    make([]byte, 32+buffersPerLane*Lanes*internalBlockSize),
		buffers:    make(chan []byte, buffersPerLane*Lanes),
	}

	// Hand out buffersPerLane buffers per lane, each a capacity-capped
	// slice of the single backing array (the first 32 bytes are skipped,
	// presumably for alignment — behavior kept as-is).
	for i := 0; i < buffersPerLane*Lanes; i++ {
		start := 32 + i*internalBlockSize
		end := start + internalBlockSize
		srv.buffers <- srv.allBufs[start:end:end]
	}

	// Single goroutine owns all server state and reads client input.
	go srv.process(srv.newInput)
	return srv
}

// newClient is the registration message a client sends to the processing
// goroutine: its uid plus the channel it will send blockInput messages on.
type newClient struct {
	uid   uint64
	input chan blockInput
}

// process - Sole handler for reading from the input channel.
// This single goroutine owns all mutable server state (the digests map
// and the lanes), so no locking is needed anywhere in the server.
func (s *md5Server) process(newClients chan newClient) {
	// To fill up as many lanes as possible:
	//
	// 1. Wait for a cycle id.
	// 2. If not already in a lane, add, otherwise leave on channel
	// 3. Start timer
	// 4. Check if lanes is full, if so, goto 10 (process).
	// 5. If timeout, goto 10.
	// 6. Wait for new id (goto 2)  or timeout (goto 10).
	// 10. Process.
	// 11. Check all input if there is already input, if so add to lanes.
	// 12. Goto 1

	// lanes contains the lanes.
	var lanes lanesInfo
	// lanesFilled contains the number of filled lanes for current cycle.
	var lanesFilled int
	// clients contains active clients
	var clients = make(map[uint64]chan blockInput, Lanes)

	// addToLane pulls at most one block of hash input from client uid into
	// a free lane. Reset and sum requests found on the channel are handled
	// inline and do not occupy a lane.
	addToLane := func(uid uint64) {
		cl, ok := clients[uid]
		if !ok {
			// Unknown client. Maybe it was already removed.
			return
		}
		// Check if we already have it.
		for _, lane := range lanes[:lanesFilled] {
			if lane.uid == uid {
				return
			}
		}
		// Continue until we get a block or there is nothing on channel
		for {
			select {
			case block, ok := <-cl:
				if !ok {
					// Client disconnected. Delete by the uid we were called
					// with: 'block' is the zero value after a receive from a
					// closed channel, so block.uid would be 0 and the stale
					// entry would never be removed (uids start at Lanes).
					delete(clients, uid)
					return
				}
				if block.uid != uid {
					panic(fmt.Errorf("uid mismatch, %d (block) != %d (client)", block.uid, uid))
				}
				// If reset message, reset and we're done
				if block.reset {
					delete(s.digests, uid)
					continue
				}

				// If requesting sum, we will need to maintain state.
				if block.sumCh != nil {
					var dig digest
					d, ok := s.digests[uid]
					if ok {
						dig.s[0] = binary.LittleEndian.Uint32(d[0:4])
						dig.s[1] = binary.LittleEndian.Uint32(d[4:8])
						dig.s[2] = binary.LittleEndian.Uint32(d[8:12])
						dig.s[3] = binary.LittleEndian.Uint32(d[12:16])
					} else {
						dig.s[0], dig.s[1], dig.s[2], dig.s[3] = init0, init1, init2, init3
					}

					sum := sumResult{}
					// Add end block to current digest.
					blockScalar(&dig.s, block.msg)

					binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0])
					binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1])
					binary.LittleEndian.PutUint32(sum.digest[8:], dig.s[2])
					binary.LittleEndian.PutUint32(sum.digest[12:], dig.s[3])
					block.sumCh <- sum
					// Return the input buffer to the pool.
					if block.msg != nil {
						s.buffers <- block.msg
					}
					continue
				}
				if len(block.msg) == 0 {
					continue
				}
				lanes[lanesFilled] = block
				lanesFilled++
				return
			default:
				// Nothing pending on this client's channel.
				return
			}
		}
	}
	// addNewClient registers a freshly connected client.
	addNewClient := func(cl newClient) {
		if _, ok := clients[cl.uid]; ok {
			panic("internal error: duplicate client registration")
		}
		clients[cl.uid] = cl.input
	}

	// allLanesFilled reports whether no more lanes can usefully be filled
	// this cycle: either every lane is taken or every client has a lane.
	allLanesFilled := func() bool {
		return lanesFilled == Lanes || lanesFilled >= len(clients)
	}

	for {
		// Step 1: block until at least one lane has input.
		for lanesFilled == 0 {
			select {
			case cl, ok := <-newClients:
				if !ok {
					// Server was closed.
					return
				}
				addNewClient(cl)
				// Check if it already sent a payload.
				addToLane(cl.uid)
				continue
			case uid := <-s.cycle:
				addToLane(uid)
			}
		}

		// Opportunistically drain queued cycle ids and registrations
		// without blocking, to batch as many lanes as possible.
	fillLanes:
		for !allLanesFilled() {
			select {
			case cl, ok := <-newClients:
				if !ok {
					return
				}
				addNewClient(cl)

			case uid := <-s.cycle:
				addToLane(uid)
			default:
				// Nothing more queued...
				break fillLanes
			}
		}

		// If we did not fill all lanes, check if there is more waiting
		if !allLanesFilled() {
			runtime.Gosched()
			for uid := range clients {
				addToLane(uid)
				if allLanesFilled() {
					break
				}
			}
		}
		// Process the lanes we could collect
		s.blocks(lanes[:lanesFilled])

		// Clear lanes...
		lanesFilled = 0
		// Add all current queued
		for uid := range clients {
			addToLane(uid)
			if allLanesFilled() {
				break
			}
		}
	}
}

// Close shuts the server down by closing the client-registration
// channel, which makes the background process goroutine return.
// Subsequent calls are no-ops.
func (s *md5Server) Close() {
	if s.newInput == nil {
		return
	}
	close(s.newInput)
	s.newInput = nil
}

// Invoke assembly and send results back.
//
// blocks hashes one queued message per lane: interim state is loaded
// from s.digests, advanced, and stored back, and every non-nil input
// buffer is returned to the buffer pool. Fewer than useScalarBelow
// lanes are handled by the scalar routine instead of the SIMD path.
func (s *md5Server) blocks(lanes []blockInput) {
	if len(lanes) < useScalarBelow {
		// Use scalar routine when below this many lanes
		switch len(lanes) {
		case 0:
		case 1:
			// One lane: hash inline, no goroutine needed.
			lane := lanes[0]
			var d digest
			a, ok := s.digests[lane.uid]
			if ok {
				// Resume from previously stored interim state.
				d.s[0] = binary.LittleEndian.Uint32(a[0:4])
				d.s[1] = binary.LittleEndian.Uint32(a[4:8])
				d.s[2] = binary.LittleEndian.Uint32(a[8:12])
				d.s[3] = binary.LittleEndian.Uint32(a[12:16])
			} else {
				// Fresh stream: standard MD5 initialization vector.
				d.s[0] = init0
				d.s[1] = init1
				d.s[2] = init2
				d.s[3] = init3
			}
			if len(lane.msg) > 0 {
				// Update...
				blockScalar(&d.s, lane.msg)
			}
			dig := [Size]byte{}
			binary.LittleEndian.PutUint32(dig[0:], d.s[0])
			binary.LittleEndian.PutUint32(dig[4:], d.s[1])
			binary.LittleEndian.PutUint32(dig[8:], d.s[2])
			binary.LittleEndian.PutUint32(dig[12:], d.s[3])
			s.digests[lane.uid] = dig

			if lane.msg != nil {
				s.buffers <- lane.msg
			}
			lanes[0] = blockInput{}

		default:
			// Two or more (but < useScalarBelow) lanes: one goroutine each.
			// Safe without locks: the goroutines only read s.digests and
			// write disjoint results[i] slots; s.digests is written only
			// after wg.Wait below.
			s.wg.Add(len(lanes))
			var results [useScalarBelow]digest
			for i := range lanes {
				lane := lanes[i] // per-iteration copy captured by the closure
				go func(i int) {
					var d digest
					defer s.wg.Done()
					a, ok := s.digests[lane.uid]
					if ok {
						d.s[0] = binary.LittleEndian.Uint32(a[0:4])
						d.s[1] = binary.LittleEndian.Uint32(a[4:8])
						d.s[2] = binary.LittleEndian.Uint32(a[8:12])
						d.s[3] = binary.LittleEndian.Uint32(a[12:16])
					} else {
						d.s[0] = init0
						d.s[1] = init1
						d.s[2] = init2
						d.s[3] = init3
					}
					if len(lane.msg) == 0 {
						results[i] = d
						return
					}
					// Update...
					blockScalar(&d.s, lane.msg)
					results[i] = d
				}(i)
			}
			s.wg.Wait()
			// Store updated interim states and recycle the input buffers.
			for i, lane := range lanes {
				dig := [Size]byte{}
				binary.LittleEndian.PutUint32(dig[0:], results[i].s[0])
				binary.LittleEndian.PutUint32(dig[4:], results[i].s[1])
				binary.LittleEndian.PutUint32(dig[8:], results[i].s[2])
				binary.LittleEndian.PutUint32(dig[12:], results[i].s[3])
				s.digests[lane.uid] = dig

				if lane.msg != nil {
					s.buffers <- lane.msg
				}
				lanes[i] = blockInput{}
			}
		}
		return
	}

	// SIMD path: gather per-lane inputs for the 16-lane routine.
	inputs := [16][]byte{}
	for i := range lanes {
		inputs[i] = lanes[i].msg
	}

	// Collect active digests...
	state := s.getDigests(lanes)
	// Process all lanes...
	s.blockMd5_x16(&state, inputs, len(lanes) <= 8)

	// Store updated interim states and recycle the input buffers.
	for i, lane := range lanes {
		uid := lane.uid
		dig := [Size]byte{}
		binary.LittleEndian.PutUint32(dig[0:], state.v0[i])
		binary.LittleEndian.PutUint32(dig[4:], state.v1[i])
		binary.LittleEndian.PutUint32(dig[8:], state.v2[i])
		binary.LittleEndian.PutUint32(dig[12:], state.v3[i])

		s.digests[uid] = dig
		if lane.msg != nil {
			s.buffers <- lane.msg
		}
		lanes[i] = blockInput{}
	}
}

// getDigests loads the interim MD5 state for every lane into a digest16.
// Lanes whose uid has no stored state start from the standard MD5
// initialization vector.
func (s *md5Server) getDigests(lanes []blockInput) (d digest16) {
	for i := range lanes {
		saved, found := s.digests[lanes[i].uid]
		if !found {
			d.v0[i], d.v1[i], d.v2[i], d.v3[i] = init0, init1, init2, init3
			continue
		}
		d.v0[i] = binary.LittleEndian.Uint32(saved[0:4])
		d.v1[i] = binary.LittleEndian.Uint32(saved[4:8])
		d.v2[i] = binary.LittleEndian.Uint32(saved[8:12])
		d.v3[i] = binary.LittleEndian.Uint32(saved[12:16])
	}
	return d
}