path: root/vendor/github.com/minio/md5-simd/block8_amd64.s
Diffstat (limited to 'vendor/github.com/minio/md5-simd/block8_amd64.s')
-rw-r--r--  vendor/github.com/minio/md5-simd/block8_amd64.s | 281
1 file changed, 281 insertions, 0 deletions
diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s
new file mode 100644
index 0000000..f57db17
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block8_amd64.s
@@ -0,0 +1,281 @@
//+build !noasm,!appengine,gc

// Copyright (c) 2018 Igneous Systems
// MIT License
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// This is the AVX2 implementation of the MD5 block function (8-way parallel)

// block8(state *uint64, base uintptr, bufs *int32, cache *byte, n int)
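//
// Each ymm register holds one MD5 state word for all 8 lanes: the digest at
// `state` is stored as four 32-byte vectors (a, b, c, d), `bufs` holds 8
// 32-bit byte offsets relative to `base` (one per lane), and `cache` is
// scratch space for the 16 gathered message words of the current block.
// `n` is the number of bytes to hash per lane and is assumed to be a multiple
// of 64, since the loop below consumes exactly 64 bytes per iteration.
// ·avx256md5consts holds the 64 round constants, each broadcast across a
// 32-byte vector (hence the 32*const indexing in the round macros).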
TEXT ·block8(SB), 4, $0-40
    MOVQ state+0(FP), BX
    MOVQ base+8(FP), SI
    MOVQ bufs+16(FP), AX
    MOVQ cache+24(FP), CX
    MOVQ n+32(FP), DX
    MOVQ ·avx256md5consts+0(SB), DI

    // Align cache (which is stack allocated by the compiler)
    // to a 256 bit boundary (ymm register alignment)
    // The cache8 type is deliberately oversized to permit this.
    ADDQ $31, CX
    ANDB $-32, CL

#define a Y0
#define b Y1
#define c Y2
#define d Y3

#define sa Y4
#define sb Y5
#define sc Y6
#define sd Y7

#define tmp Y8
#define tmp2 Y9

#define mask Y10
#define off Y11

#define ones Y12

#define rtmp1 Y13
#define rtmp2 Y14

#define mem Y15

#define dig BX
#define cache CX
#define count DX
#define base SI
#define consts DI

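// prepmask builds the gather mask: after zeroing, VPCMPGTD sets a lane to all
// ones when its buffer offset is greater than zero, so only those lanes take
// part in the gathers below; lanes with a non-positive offset (presumably the
// unused ones) are left untouched by VPGATHERDD.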
#define prepmask \
    VPXOR mask, mask, mask \
    VPCMPGTD mask, off, mask

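// prep(index) gathers message word `index` for all 8 lanes from
// base + off[lane] + index*4. VPGATHERDD zeroes its mask register on
// completion, so the mask is copied into rtmp2 before every gather.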
#define prep(index) \
    VMOVAPD mask, rtmp2 \
    VPGATHERDD rtmp2, index*4(base)(off*1), mem

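// load/store spill the gathered words to the cache so that each of the 16
// message words of a block is gathered only once and can be re-read cheaply
// in rounds 2-4; VMOVAPD needs 32-byte alignment, hence the aligned cache above.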
#define load(index) \
    VMOVAPD index*32(cache), mem

#define store(index) \
    VMOVAPD mem, index*32(cache)

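// roll(shift, a) rotates each 32-bit lane of a left by shift bits; AVX2 has no
// packed rotate instruction, so it is composed from two shifts and an OR.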
#define roll(shift, a) \
    VPSLLD $shift, a, rtmp1 \
    VPSRLD $32-shift, a, a \
    VPOR rtmp1, a, a

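// ROUND1 performs one step of MD5 round 1: F(b,c,d) = d ^ (b & (c ^ d)), which
// equals the textbook (b & c) | (~b & d). tmp enters holding d and leaves
// holding c (the next step's d). The gather of the next message word (prep) is
// interleaved with the arithmetic, presumably to hide the gather latency.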
#define ROUND1(a, b, c, d, index, const, shift) \
    VPXOR c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    prep(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

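// ROUND1load is the same round-1 step, but pulls the next message word from
// the cache (load) instead of gathering it (prep): by the last round-1 step
// all 16 words are cached, and word 1 is needed again as the first word of round 2.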
#define ROUND1load(a, b, c, d, index, const, shift) \
    VXORPD c, tmp, tmp \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp, tmp \
    VPXOR d, tmp, tmp \
    load(index) \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD c, tmp \
    VPADDD b, a, a

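// ROUND2 performs one step of MD5 round 2: G(b,c,d) = (b & d) | (c & ~d).
// tmp and tmp2 both enter holding d and both leave holding c (the next step's
// d); index selects the cached message word for the following step.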
#define ROUND2(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPAND b, tmp2, tmp2 \
    VANDNPD c, tmp, tmp \
    load(index) \
    VPOR tmp, tmp2, tmp2 \
    VMOVAPD c, tmp \
    VPADDD tmp2, a, a \
    VMOVAPD c, tmp2 \
    roll(shift,a) \
    VPADDD b, a, a

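// ROUND3 performs one step of MD5 round 3: H(b,c,d) = b ^ c ^ d.
// tmp enters holding c and leaves holding b (the next step's c).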
#define ROUND3(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    load(index) \
    VPXOR d, tmp, tmp \
    VPXOR b, tmp, tmp \
    VPADDD tmp, a, a \
    roll(shift,a) \
    VMOVAPD b, tmp \
    VPADDD b, a, a

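// ROUND4 performs one step of MD5 round 4: I(b,c,d) = c ^ (b | ~d).
// tmp enters holding ~d (formed by XORing with the all-ones register) and
// leaves holding ~c, which is ~d for the next step.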
#define ROUND4(a, b, c, d, index, const, shift) \
    VPADDD 32*const(consts), a, a \
    VPADDD mem, a, a \
    VPOR b, tmp, tmp \
    VPXOR c, tmp, tmp \
    VPADDD tmp, a, a \
    load(index) \
    roll(shift,a) \
    VPXOR c, ones, tmp \
    VPADDD b, a, a

    // load digest into state registers
    VMOVUPD (dig), a
    VMOVUPD 32(dig), b
    VMOVUPD 64(dig), c
    VMOVUPD 96(dig), d

    // load source buffer offsets
    VMOVUPD (AX), off

    prepmask
    VPCMPEQD ones, ones, ones

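    // ones is all 1s in every lane; round 4 XORs against it to form a bitwise NOT.
    // Each loop iteration hashes one 64-byte block for all 8 lanes: the state is
    // saved in sa-sd, the 64 steps are run, and the saved state is added back at the end.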
loop:
    VMOVAPD a, sa
    VMOVAPD b, sb
    VMOVAPD c, sc
    VMOVAPD d, sd

    prep(0)
    VMOVAPD d, tmp
    store(0)

    ROUND1(a,b,c,d, 1,0x00, 7)
    store(1)
    ROUND1(d,a,b,c, 2,0x01,12)
    store(2)
    ROUND1(c,d,a,b, 3,0x02,17)
    store(3)
    ROUND1(b,c,d,a, 4,0x03,22)
    store(4)
    ROUND1(a,b,c,d, 5,0x04, 7)
    store(5)
    ROUND1(d,a,b,c, 6,0x05,12)
    store(6)
    ROUND1(c,d,a,b, 7,0x06,17)
    store(7)
    ROUND1(b,c,d,a, 8,0x07,22)
    store(8)
    ROUND1(a,b,c,d, 9,0x08, 7)
    store(9)
    ROUND1(d,a,b,c,10,0x09,12)
    store(10)
    ROUND1(c,d,a,b,11,0x0a,17)
    store(11)
    ROUND1(b,c,d,a,12,0x0b,22)
    store(12)
    ROUND1(a,b,c,d,13,0x0c, 7)
    store(13)
    ROUND1(d,a,b,c,14,0x0d,12)
    store(14)
    ROUND1(c,d,a,b,15,0x0e,17)
    store(15)
    ROUND1load(b,c,d,a, 1,0x0f,22)

    VMOVAPD d, tmp
    VMOVAPD d, tmp2

    ROUND2(a,b,c,d, 6,0x10, 5)
    ROUND2(d,a,b,c,11,0x11, 9)
    ROUND2(c,d,a,b, 0,0x12,14)
    ROUND2(b,c,d,a, 5,0x13,20)
    ROUND2(a,b,c,d,10,0x14, 5)
    ROUND2(d,a,b,c,15,0x15, 9)
    ROUND2(c,d,a,b, 4,0x16,14)
    ROUND2(b,c,d,a, 9,0x17,20)
    ROUND2(a,b,c,d,14,0x18, 5)
    ROUND2(d,a,b,c, 3,0x19, 9)
    ROUND2(c,d,a,b, 8,0x1a,14)
    ROUND2(b,c,d,a,13,0x1b,20)
    ROUND2(a,b,c,d, 2,0x1c, 5)
    ROUND2(d,a,b,c, 7,0x1d, 9)
    ROUND2(c,d,a,b,12,0x1e,14)
    ROUND2(b,c,d,a, 0,0x1f,20)

    load(5)
    VMOVAPD c, tmp

    ROUND3(a,b,c,d, 8,0x20, 4)
    ROUND3(d,a,b,c,11,0x21,11)
    ROUND3(c,d,a,b,14,0x22,16)
    ROUND3(b,c,d,a, 1,0x23,23)
    ROUND3(a,b,c,d, 4,0x24, 4)
    ROUND3(d,a,b,c, 7,0x25,11)
    ROUND3(c,d,a,b,10,0x26,16)
    ROUND3(b,c,d,a,13,0x27,23)
    ROUND3(a,b,c,d, 0,0x28, 4)
    ROUND3(d,a,b,c, 3,0x29,11)
    ROUND3(c,d,a,b, 6,0x2a,16)
    ROUND3(b,c,d,a, 9,0x2b,23)
    ROUND3(a,b,c,d,12,0x2c, 4)
    ROUND3(d,a,b,c,15,0x2d,11)
    ROUND3(c,d,a,b, 2,0x2e,16)
    ROUND3(b,c,d,a, 0,0x2f,23)

    load(0)
    VPXOR d, ones, tmp

    ROUND4(a,b,c,d, 7,0x30, 6)
    ROUND4(d,a,b,c,14,0x31,10)
    ROUND4(c,d,a,b, 5,0x32,15)
    ROUND4(b,c,d,a,12,0x33,21)
    ROUND4(a,b,c,d, 3,0x34, 6)
    ROUND4(d,a,b,c,10,0x35,10)
    ROUND4(c,d,a,b, 1,0x36,15)
    ROUND4(b,c,d,a, 8,0x37,21)
    ROUND4(a,b,c,d,15,0x38, 6)
    ROUND4(d,a,b,c, 6,0x39,10)
    ROUND4(c,d,a,b,13,0x3a,15)
    ROUND4(b,c,d,a, 4,0x3b,21)
    ROUND4(a,b,c,d,11,0x3c, 6)
    ROUND4(d,a,b,c, 2,0x3d,10)
    ROUND4(c,d,a,b, 9,0x3e,15)
    ROUND4(b,c,d,a, 0,0x3f,21)

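    // add the saved input state back into the output state
    // (the feed-forward at the end of every MD5 block)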
    VPADDD sa, a, a
    VPADDD sb, b, b
    VPADDD sc, c, c
    VPADDD sd, d, d

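    // advance every lane to its next 64-byte block: base moves forward by 64
    // while the per-lane offsets stay fixed; count holds the remaining bytes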
    LEAQ 64(base), base
    SUBQ $64, count
    JNE  loop

    VMOVUPD a, (dig)
    VMOVUPD b, 32(dig)
    VMOVUPD c, 64(dig)
    VMOVUPD d, 96(dig)

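    // clear the upper ymm halves to avoid AVX-SSE transition penalties in
    // subsequent SSE code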
    VZEROUPPER
    RET