13 files changed, 2616 insertions, 0 deletions
diff --git a/vendor/github.com/minio/md5-simd/LICENSE b/vendor/github.com/minio/md5-simd/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/vendor/github.com/minio/md5-simd/LICENSE.Golang b/vendor/github.com/minio/md5-simd/LICENSE.Golang
new file mode 100644
index 0000000..6a66aea
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/LICENSE.Golang
@@ -0,0 +1,27 @@
+Copyright (c) 2009 The Go Authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/minio/md5-simd/README.md b/vendor/github.com/minio/md5-simd/README.md
new file mode 100644
index 0000000..fa6fce1
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/README.md
@@ -0,0 +1,198 @@
+# md5-simd
+This is a SIMD accelerated MD5 package, allowing up to either 8 (AVX2) or 16 (AVX512) independent MD5 sums to be calculated on a single CPU core.
+It was originally based on the [md5vec](https://github.com/igneous-systems/md5vec) repository by Igneous Systems, but has been made more flexible by, among other things, supporting different message sizes per lane and adding AVX512 support.
+`md5-simd` integrates a mechanism similar to the one described in [minio/sha256-simd](https://github.com/minio/sha256-simd#support-for-avx512) that makes it easy for clients to take advantage of the parallel nature of the MD5 calculation. This will result in reduced overall CPU load. 
+It is important to understand that `md5-simd` **does not speed up** a single threaded MD5 hash sum. 
+Rather it allows multiple __independent__  MD5 sums to be computed in parallel on the same CPU core, 
+thereby making more efficient usage of the computing resources.
+## Usage
+[![Documentation](https://godoc.org/github.com/minio/md5-simd?status.svg)](https://pkg.go.dev/github.com/minio/md5-simd?tab=doc)
+In order to use `md5-simd`, you must first create a `Server` which can be 
+used to instantiate one or more objects for MD5 hashing. 
+These objects conform to the regular [`hash.Hash`](https://pkg.go.dev/hash?tab=doc#Hash) interface 
+and as such the normal Write/Reset/Sum functionality works as expected. 
+As an example: 
+```
+    // Create server
+    server := md5simd.NewServer()
+    defer server.Close()
+    // Create hashing object (conforming to hash.Hash)
+    md5Hash := server.NewHash()
+    defer md5Hash.Close()
+    // Write one (or more) blocks
+    md5Hash.Write(block)
+    
+    // Return digest
+    digest := md5Hash.Sum([]byte{})
+```
+To maintain performance, both a [Server](https://pkg.go.dev/github.com/minio/md5-simd?tab=doc#Server) 
+and individual [Hasher](https://pkg.go.dev/github.com/minio/md5-simd?tab=doc#Hasher) should 
+be closed using the `Close()` function when no longer needed.
+A Hasher can efficiently be re-used by using [`Reset()`](https://pkg.go.dev/hash?tab=doc#Hash) functionality.
+In case your system does not support the instructions required it will fall back to using `crypto/md5` for hashing.
+## Limitations
+As explained above `md5-simd` does not speed up an individual MD5 hash sum computation,
+unless some hierarchical tree construct is used but this will result in different outcomes.
+Running a single hash on a server results in approximately half the throughput.
+Instead, it allows running multiple MD5 calculations in parallel on a single CPU core. 
+This can be beneficial in e.g. multi-threaded server applications where many go-routines 
+are dealing with many requests and multiple MD5 calculations can be packed/scheduled for parallel execution on a single core.
+This will result in a lower overall CPU usage as compared to using the standard `crypto/md5`
+functionality where each MD5 hash computation will consume a single thread (core).
+It is best to test and measure the overall CPU usage in a representative usage scenario in your application
+to get an overall understanding of the benefits of `md5-simd` as compared to `crypto/md5`, ideally under heavy CPU load.
+Also note that `md5-simd` is best meant to work with large objects, 
+so if your application only hashes small objects of a few kilobytes 
+you may be better off using `crypto/md5`.
+## Performance
+For the best performance writes should be a multiple of 64 bytes, ideally a multiple of 32KB.
+To help with that a [`buffered := bufio.NewWriterSize(hasher, 32<<10)`](https://golang.org/pkg/bufio/#NewWriterSize) 
+can be inserted if you are unsure of the sizes of the writes. 
+Remember to [flush](https://golang.org/pkg/bufio/#Writer.Flush) `buffered` before reading the hash. 
+A single 'server' can process 16 streams concurrently with 1 core (AVX-512) or 2 cores (AVX2). 
+In situations where it is likely that more than 16 streams are fully loaded it may be beneficial
+to use multiple servers.
+The following chart compares the multi-core performance between `crypto/md5` vs the AVX2 vs the AVX512 code:
+![md5-performance-overview](chart/Multi-core-MD5-Aggregated-Hashing-Performance.png)
+Compared to `crypto/md5`, the AVX2 version is up to 4x faster:
+```
+$ benchcmp crypto-md5.txt avx2.txt 
+benchmark                     old MB/s     new MB/s     speedup
+BenchmarkParallel/32KB-4      2229.22      7370.50      3.31x
+BenchmarkParallel/64KB-4      2233.61      8248.46      3.69x
+BenchmarkParallel/128KB-4     2235.43      8660.74      3.87x
+BenchmarkParallel/256KB-4     2236.39      8863.87      3.96x
+BenchmarkParallel/512KB-4     2238.05      8985.39      4.01x
+BenchmarkParallel/1MB-4       2233.56      9042.62      4.05x
+BenchmarkParallel/2MB-4       2224.11      9014.46      4.05x
+BenchmarkParallel/4MB-4       2199.78      8993.61      4.09x
+BenchmarkParallel/8MB-4       2182.48      8748.22      4.01x
+```
+Compared to `crypto/md5`, the AVX512 is up to 8x faster (for larger block sizes):
+```
+$ benchcmp crypto-md5.txt avx512.txt
+benchmark                     old MB/s     new MB/s     speedup
+BenchmarkParallel/32KB-4      2229.22      11605.78     5.21x
+BenchmarkParallel/64KB-4      2233.61      14329.65     6.42x
+BenchmarkParallel/128KB-4     2235.43      16166.39     7.23x
+BenchmarkParallel/256KB-4     2236.39      15570.09     6.96x
+BenchmarkParallel/512KB-4     2238.05      16705.83     7.46x
+BenchmarkParallel/1MB-4       2233.56      16941.95     7.59x
+BenchmarkParallel/2MB-4       2224.11      17136.01     7.70x
+BenchmarkParallel/4MB-4       2199.78      17218.61     7.83x
+BenchmarkParallel/8MB-4       2182.48      17252.88     7.91x
+```
+These measurements were performed on an AWS EC2 instance of type `c5.xlarge` equipped with a Xeon Platinum 8124M CPU at 3.0 GHz.
+If only one or two inputs are available the scalar calculation method will be used for the 
+optimal speed in these cases.
+## Operation
+To make operation as easy as possible there is a “Server” coordinating everything. The server keeps track of individual hash states and updates them as new data comes in. This can be visualized as follows:
+![server-architecture](chart/server-architecture.png)
+The data is sent to the server from each hash input in blocks of up to 32KB per round. In our testing we found this to be the block size that yielded the best results.
+Whenever there is data available the server will collect data for up to 16 hashes and process all 16 lanes in parallel. This means that if 16 hashes have data available all the lanes will be filled. However since that may not be the case, the server will fill fewer lanes and do a round anyway. Lanes can also be partially filled if less than 32KB of data is written.
+![server-lanes-example](chart/server-lanes-example.png)
+In this example 4 lanes are fully filled and 2 lanes are partially filled. In this case the black areas will simply be masked out from the results and ignored. This is also why calculating a single hash on a server will not result in any speedup and hash writes should be a multiple of 32KB for the best performance.
+For AVX512 all 16 calculations will be done on a single core, on AVX2 on 2 cores if there is data for more than 8 lanes.
+So for optimal usage there should be data available for all 16 hashes. It may be perfectly reasonable to use more than 16 concurrent hashes.
+## Design & Tech
+md5-simd has both an AVX2 (8-lane parallel) and an AVX512 (16-lane parallel) version of the algorithm to accelerate the computation, with the following function definitions:
+```
+//go:noescape
+func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
+//go:noescape
+func block16(state *uint32, ptrs *int64, mask uint64, n int)
+```
+The AVX2 version is based on the [md5vec](https://github.com/igneous-systems/md5vec) repository and is essentially unchanged except for minor (cosmetic) changes.
+The AVX512 version is derived from the AVX2 version but adds some further optimizations and simplifications.
+### Caching in upper ZMM registers
+The AVX2 version passes in a `cache8` block of memory (about 0.5 KB) for temporary storage of intermediate results during `ROUND1` which are subsequently used during `ROUND2` through to `ROUND4`.
+Since AVX512 has double the amount of registers (32 ZMM registers as compared to 16 YMM registers), it is possible to use the upper 16 ZMM registers for keeping the intermediate states on the CPU. As such, there is no need to pass in a corresponding `cache16` into the AVX512 block function.
+### Direct loading using 64-bit pointers
+The AVX2 version uses the `VPGATHERDD` instruction (for YMM) to do a parallel load of 8 lanes using (8 independent) 32-bit offsets. Since there is no control over how the 8 slices that are passed into the (Golang) `blockMd5` function are laid out in memory, it is not possible to derive a "base" address and corresponding offsets (all within 32-bits) for all 8 slices.
+As such the AVX2 version uses an interim buffer to collect the byte slices to be hashed from all 8 input slices and passes this buffer along with (fixed) 32-bit offsets into the assembly code.
+For the AVX512 version this interim buffer is not needed since the AVX512 code uses a pair of `VPGATHERQD` instructions to directly dereference 64-bit pointers (from a base register address that is initialized to zero).
+Note that two load (gather) instructions are needed because the AVX512 version processes 16-lanes in parallel, requiring 16 times 64-bit = 1024 bits in total to be loaded. A simple `VALIGND` and `VPORD` are subsequently used to merge the lower and upper halves together into a single ZMM register (that contains 16 lanes of 32-bit DWORDS).
+### Masking support
+Due to the fact that pointers are passed directly from the Golang slices, we need to protect against NULL pointers. 
+For this a 16-bit mask is passed in the AVX512 assembly code which is used during the `VPGATHERQD` instructions to mask out lanes that could otherwise result in segment violations.
+### Minor optimizations
+The `roll` macro (three instructions on AVX2) is no longer needed for AVX512 and is replaced by a single `VPROLD` instruction.
+Also several logical operations from the various ROUNDS of the AVX2 version could be combined into a single instruction using ternary logic (with the `VPTERNLOGD` instruction), resulting in a further simplification and speed-up.
+## Low level block function performance
+The benchmark below shows the (single thread) maximum performance of the `block()` function for AVX2 (having 8 lanes) and AVX512 (having 16 lanes). Also the baseline single-core performance from the standard `crypto/md5` package is shown for comparison.
+```
+BenchmarkCryptoMd5-4                     687.66 MB/s           0 B/op          0 allocs/op
+BenchmarkBlock8-4                       4144.80 MB/s           0 B/op          0 allocs/op
+BenchmarkBlock16-4                      8228.88 MB/s           0 B/op          0 allocs/op
+```
+## License
+`md5-simd` is released under the Apache License v2.0. You can find the complete text in the file LICENSE.
+## Contributing
+Contributions are welcome, please send PRs for any enhancements.
+\ No newline at end of file
diff --git a/vendor/github.com/minio/md5-simd/block16_amd64.s b/vendor/github.com/minio/md5-simd/block16_amd64.s
new file mode 100644
index 0000000..be0a43a
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block16_amd64.s
@@ -0,0 +1,228 @@
+// Copyright (c) 2020 MinIO Inc. All rights reserved.
+// Use of this source code is governed by a license that can be
+// found in the LICENSE file.
+//+build !noasm,!appengine,gc
+// This is the AVX512 implementation of the MD5 block function (16-way parallel)
+// prep(index): gather message word `index` for every enabled lane into mem.
+// Each lane i whose mask bit is set loads the dword at base + ptrs[i] + index*4.
+// kmask is copied into ktmp first because the gather instruction clears its
+// mask operand as lanes complete, and kmask has to survive for later gathers.
+#define prep(index) \
+        KMOVQ      kmask, ktmp                      \
+        VPGATHERDD index*4(base)(ptrs*1), ktmp, mem
+// ROUND1: one MD5 round-1 step, interleaved with the gather of the next word.
+//   a = b + rol(a + F(b,c,d) + mem + K, shift),  F = d ^ (b & (c ^ d))
+// K is the 64-byte row at offset 64*const from consts (presumably the round
+// constant replicated across the 16 lanes; the table is defined elsewhere).
+// mem holds the word fetched by the previous prep and is consumed before
+// prep(index) overwrites it, so `index` names the word for the FOLLOWING step.
+// tmp protocol: tmp must equal d on entry; on exit tmp equals c, which is the
+// `d` argument of the next step because callers rotate the register arguments.
+// VPTERNLOGD $0x6C computes d ^ (b & tmp) with tmp = c ^ d.
+#define ROUND1(a, b, c, d, index, const, shift) \
+        VPXORQ     c, tmp, tmp            \
+        VPADDD     64*const(consts), a, a \
+        VPADDD     mem, a, a              \
+        VPTERNLOGD $0x6C, b, d, tmp       \
+        prep(index)                       \
+        VPADDD     tmp, a, a              \
+        VPROLD     $shift, a, a           \
+        VMOVAPD    c, tmp                 \
+        VPADDD     b, a, a
+// ROUND1noload: same computation as ROUND1 but without the prep() gather.
+//   a = b + rol(a + F(b,c,d) + mem + K, shift),  F = d ^ (b & (c ^ d))
+// Used for the final round-1 step, where there is no further message word
+// to fetch. tmp protocol is unchanged: equals d on entry, c on exit.
+#define ROUND1noload(a, b, c, d, const, shift) \
+        VPXORQ     c, tmp, tmp            \
+        VPADDD     64*const(consts), a, a \
+        VPADDD     mem, a, a              \
+        VPTERNLOGD $0x6C, b, d, tmp       \
+        VPADDD     tmp, a, a              \
+        VPROLD     $shift, a, a           \
+        VMOVAPD    c, tmp                 \
+        VPADDD     b, a, a
+// ROUND2: one MD5 round-2 step; the message word comes from register zreg.
+//   a = b + rol(a + G(b,c,d) + zreg + K, shift),  G = (b & d) | (c & ~d)
+// tmp/tmp2 protocol: both must equal d on entry. VANDNPD turns tmp into
+// c & ~d, then VPTERNLOGD $0xEC computes tmp | (tmp2 & b) into tmp2.
+// Both are reloaded with c before the macro ends, since c is the `d`
+// argument of the next (rotated) step.
+#define ROUND2(a, b, c, d, zreg, const, shift) \
+        VPADDD     64*const(consts), a, a \
+        VPADDD     zreg, a, a             \
+        VANDNPD    c, tmp, tmp            \
+        VPTERNLOGD $0xEC, b, tmp, tmp2    \
+        VMOVAPD    c, tmp                 \
+        VPADDD     tmp2, a, a             \
+        VMOVAPD    c, tmp2                \
+        VPROLD     $shift, a, a           \
+        VPADDD     b, a, a
+// ROUND3: one MD5 round-3 step; the message word comes from register zreg.
+//   a = b + rol(a + H(b,c,d) + zreg + K, shift),  H = b ^ c ^ d
+// tmp protocol: tmp must equal c on entry (VPTERNLOGD $0x96 is a three-way
+// XOR of b, d and tmp). On exit tmp equals b, which is the `c` argument of
+// the next (rotated) step.
+#define ROUND3(a, b, c, d, zreg, const, shift) \
+        VPADDD     64*const(consts), a, a \
+        VPADDD     zreg, a, a             \
+        VPTERNLOGD $0x96, b, d, tmp       \
+        VPADDD     tmp, a, a              \
+        VPROLD     $shift, a, a           \
+        VMOVAPD    b, tmp                 \
+        VPADDD     b, a, a
+// ROUND4: one MD5 round-4 step; the message word comes from register zreg.
+//   a = b + rol(a + I(b,c,d) + zreg + K, shift),  I = c ^ (b | ~d)
+// tmp protocol: tmp must equal ~d on entry (VPTERNLOGD $0x36 computes
+// c ^ (b | tmp)). On exit tmp equals ~c (c XOR the all-ones register),
+// which is ~d for the next (rotated) step.
+#define ROUND4(a, b, c, d, zreg, const, shift) \
+        VPADDD     64*const(consts), a, a \
+        VPADDD     zreg, a, a             \
+        VPTERNLOGD $0x36, b, c, tmp       \
+        VPADDD     tmp, a, a              \
+        VPROLD     $shift, a, a           \
+        VPXORQ     c, ones, tmp           \
+        VPADDD     b, a, a
+// block16 runs the MD5 block function on 16 independent lanes at once (one
+// 32-bit element per lane in each ZMM register).
+//
+// Frame: $0-40, i.e. no local stack space and 40 bytes of arguments. The
+// literal flag 4 is the value of NOSPLIT in Go's textflag.h.
+//
+// Arguments:
+//   state+0(FP)  pointer to the digest state: four consecutive 64-byte
+//                vectors (a, b, c, d), read on entry and written back on exit
+//   base+8(FP)   base address that every lane offset is added to
+//   ptrs+16(FP)  pointer to 64 bytes of per-lane dword offsets from base
+//   mask+24(FP)  lane mask; only lanes whose bit is set are gathered
+//   n+32(FP)     bytes to process per lane. The loop subtracts 64 per pass and
+//                exits only when the counter is exactly zero, so n has to be a
+//                non-zero multiple of 64.
+TEXT ·block16(SB), 4, $0-40
+        MOVQ  state+0(FP), BX
+        MOVQ  base+8(FP), SI
+        MOVQ  ptrs+16(FP), AX
+        KMOVQ mask+24(FP), K1
+        MOVQ  n+32(FP), DX
+        // DI = first 8 bytes stored at the Go symbol avx512md5consts; it is used
+        // below as the address of the constant table (presumably the data pointer
+        // of a slice - the symbol is defined outside this file).
+        MOVQ  ·avx512md5consts+0(SB), DI
+// Register aliases. These must stay below the argument loads above: `ptrs`
+// and `base` are also argument names in the FP references, and defining the
+// macros earlier would expand them inside `ptrs+16(FP)` / `base+8(FP)`.
+// a..d: working state, sa..sd: copy of the state at the start of a block.
+#define a Z0
+#define b Z1
+#define c Z2
+#define d Z3
+#define sa Z4
+#define sb Z5
+#define sc Z6
+#define sd Z7
+#define tmp       Z8
+#define tmp2      Z9
+#define ptrs     Z10
+#define ones     Z12
+#define mem      Z15
+#define kmask  K1
+#define ktmp   K3
+// ----------------------------------------------------------
+// Registers Z16 through to Z31 cache the 16 gathered message
+// words of the current block (Z16 = word 0 ... Z31 = word 15)
+// so that rounds 2 to 4 need no further memory loads
+// ----------------------------------------------------------
+#define dig    BX
+#define count  DX
+#define base   SI
+#define consts DI
+        // load digest into state registers
+        VMOVUPD (dig), a
+        VMOVUPD 0x40(dig), b
+        VMOVUPD 0x80(dig), c
+        VMOVUPD 0xc0(dig), d
+        // load source pointers (the per-lane dword offsets used by the gathers)
+        VMOVUPD 0x00(AX), ptrs
+        // ones = all bits set in every lane; ROUND4 XORs with it to form NOT.
+        // AX is free to reuse here because the offsets are already in ptrs.
+        MOVQ         $-1, AX
+        VPBROADCASTQ AX, ones
+loop:
+        // Keep the incoming state so it can be added back after the 64 steps.
+        VMOVAPD a, sa
+        VMOVAPD b, sb
+        VMOVAPD c, sc
+        VMOVAPD d, sd
+        // Round 1. Gather word 0, then prime tmp = d as ROUND1 expects. Before
+        // each step the word currently in mem is copied to its cache register,
+        // because the step's embedded prep() overwrites mem with the next word.
+        prep(0)
+        VMOVAPD d, tmp
+        VMOVAPD mem, Z16
+        ROUND1(a,b,c,d, 1,0x00, 7)
+        VMOVAPD mem, Z17
+        ROUND1(d,a,b,c, 2,0x01,12)
+        VMOVAPD mem, Z18
+        ROUND1(c,d,a,b, 3,0x02,17)
+        VMOVAPD mem, Z19
+        ROUND1(b,c,d,a, 4,0x03,22)
+        VMOVAPD mem, Z20
+        ROUND1(a,b,c,d, 5,0x04, 7)
+        VMOVAPD mem, Z21
+        ROUND1(d,a,b,c, 6,0x05,12)
+        VMOVAPD mem, Z22
+        ROUND1(c,d,a,b, 7,0x06,17)
+        VMOVAPD mem, Z23
+        ROUND1(b,c,d,a, 8,0x07,22)
+        VMOVAPD mem, Z24
+        ROUND1(a,b,c,d, 9,0x08, 7)
+        VMOVAPD mem, Z25
+        ROUND1(d,a,b,c,10,0x09,12)
+        VMOVAPD mem, Z26
+        ROUND1(c,d,a,b,11,0x0a,17)
+        VMOVAPD mem, Z27
+        ROUND1(b,c,d,a,12,0x0b,22)
+        VMOVAPD mem, Z28
+        ROUND1(a,b,c,d,13,0x0c, 7)
+        VMOVAPD mem, Z29
+        ROUND1(d,a,b,c,14,0x0d,12)
+        VMOVAPD mem, Z30
+        ROUND1(c,d,a,b,15,0x0e,17)
+        VMOVAPD mem, Z31
+        ROUND1noload(b,c,d,a, 0x0f,22)
+        // Round 2. ROUND2 expects tmp == tmp2 == d on entry.
+        VMOVAPD d, tmp
+        VMOVAPD d, tmp2
+        ROUND2(a,b,c,d, Z17,0x10, 5)
+        ROUND2(d,a,b,c, Z22,0x11, 9)
+        ROUND2(c,d,a,b, Z27,0x12,14)
+        ROUND2(b,c,d,a, Z16,0x13,20)
+        ROUND2(a,b,c,d, Z21,0x14, 5)
+        ROUND2(d,a,b,c, Z26,0x15, 9)
+        ROUND2(c,d,a,b, Z31,0x16,14)
+        ROUND2(b,c,d,a, Z20,0x17,20)
+        ROUND2(a,b,c,d, Z25,0x18, 5)
+        ROUND2(d,a,b,c, Z30,0x19, 9)
+        ROUND2(c,d,a,b, Z19,0x1a,14)
+        ROUND2(b,c,d,a, Z24,0x1b,20)
+        ROUND2(a,b,c,d, Z29,0x1c, 5)
+        ROUND2(d,a,b,c, Z18,0x1d, 9)
+        ROUND2(c,d,a,b, Z23,0x1e,14)
+        ROUND2(b,c,d,a, Z28,0x1f,20)
+        // Round 3. ROUND3 expects tmp == c on entry.
+        VMOVAPD c, tmp
+        ROUND3(a,b,c,d, Z21,0x20, 4)
+        ROUND3(d,a,b,c, Z24,0x21,11)
+        ROUND3(c,d,a,b, Z27,0x22,16)
+        ROUND3(b,c,d,a, Z30,0x23,23)
+        ROUND3(a,b,c,d, Z17,0x24, 4)
+        ROUND3(d,a,b,c, Z20,0x25,11)
+        ROUND3(c,d,a,b, Z23,0x26,16)
+        ROUND3(b,c,d,a, Z26,0x27,23)
+        ROUND3(a,b,c,d, Z29,0x28, 4)
+        ROUND3(d,a,b,c, Z16,0x29,11)
+        ROUND3(c,d,a,b, Z19,0x2a,16)
+        ROUND3(b,c,d,a, Z22,0x2b,23)
+        ROUND3(a,b,c,d, Z25,0x2c, 4)
+        ROUND3(d,a,b,c, Z28,0x2d,11)
+        ROUND3(c,d,a,b, Z31,0x2e,16)
+        ROUND3(b,c,d,a, Z18,0x2f,23)
+        // Round 4. ROUND4 expects tmp == ~d on entry (d XOR all-ones).
+        VPXORQ d, ones, tmp
+        ROUND4(a,b,c,d, Z16,0x30, 6)
+        ROUND4(d,a,b,c, Z23,0x31,10)
+        ROUND4(c,d,a,b, Z30,0x32,15)
+        ROUND4(b,c,d,a, Z21,0x33,21)
+        ROUND4(a,b,c,d, Z28,0x34, 6)
+        ROUND4(d,a,b,c, Z19,0x35,10)
+        ROUND4(c,d,a,b, Z26,0x36,15)
+        ROUND4(b,c,d,a, Z17,0x37,21)
+        ROUND4(a,b,c,d, Z24,0x38, 6)
+        ROUND4(d,a,b,c, Z31,0x39,10)
+        ROUND4(c,d,a,b, Z22,0x3a,15)
+        ROUND4(b,c,d,a, Z29,0x3b,21)
+        ROUND4(a,b,c,d, Z20,0x3c, 6)
+        ROUND4(d,a,b,c, Z27,0x3d,10)
+        ROUND4(c,d,a,b, Z18,0x3e,15)
+        ROUND4(b,c,d,a, Z25,0x3f,21)
+        // Add the state saved at the top of the loop back into the result.
+        VPADDD sa, a, a
+        VPADDD sb, b, b
+        VPADDD sc, c, c
+        VPADDD sd, d, d
+        // Advance every lane by one 64-byte block (16 words of 4 bytes) and
+        // repeat until count reaches exactly zero.
+        LEAQ 64(base), base
+        SUBQ $64, count
+        JNE  loop
+        // Write the updated digest state back to the caller's buffer.
+        VMOVUPD a, (dig)
+        VMOVUPD b, 0x40(dig)
+        VMOVUPD c, 0x80(dig)
+        VMOVUPD d, 0xc0(dig)
+        // Clear the upper vector state before returning to non-AVX code.
+        VZEROUPPER
+        RET
diff --git a/vendor/github.com/minio/md5-simd/block8_amd64.s b/vendor/github.com/minio/md5-simd/block8_amd64.s
new file mode 100644
index 0000000..f57db17
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block8_amd64.s
@@ -0,0 +1,281 @@
+//+build !noasm,!appengine,gc
+// Copyright (c) 2018 Igneous Systems
+//   MIT License
+//
+//   Permission is hereby granted, free of charge, to any person obtaining a copy
+//   of this software and associated documentation files (the "Software"), to deal
+//   in the Software without restriction, including without limitation the rights
+//   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+//   copies of the Software, and to permit persons to whom the Software is
+//   furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all
+//   copies or substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+//   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+//   SOFTWARE.
+// Copyright (c) 2020 MinIO Inc. All rights reserved.
+//   Use of this source code is governed by a license that can be
+//   found in the LICENSE file.
+// This is the AVX2 implementation of the MD5 block function (8-way parallel)
+// block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
+TEXT ·block8(SB), 4, $0-40
+        MOVQ state+0(FP), BX             // dig: digest state, four 32-byte rows
+        MOVQ base+8(FP), SI              // base: address the lane offsets are relative to
+        MOVQ bufs+16(FP), AX             // eight int32 source offsets, one per lane
+        MOVQ cache+24(FP), CX            // cache: scratch for the 16 gathered message words
+        MOVQ n+32(FP), DX                // count: bytes to hash per lane, 64 are consumed per pass
+        MOVQ ·avx256md5consts+0(SB), DI  // consts: data pointer of the avx256md5consts slice
+        // Align cache (which is stack allocated by the compiler)
+        // up to a 256 bit boundary (ymm register alignment).
+        // The cache8 type is deliberately oversized to permit this.
+        ADDQ $31, CX
+        ANDB $-32, CL                    // clear the low 5 bits: CX is rounded up to a multiple of 32
+#define a Y0
+#define b Y1
+#define c Y2
+#define d Y3
+#define sa Y4
+#define sb Y5
+#define sc Y6
+#define sd Y7
+#define tmp  Y8
+#define tmp2 Y9
+#define mask Y10
+#define off  Y11
+#define ones Y12
+#define rtmp1  Y13
+#define rtmp2  Y14
+#define mem   Y15
+#define dig    BX
+#define cache  CX
+#define count  DX
+#define base   SI
+#define consts DI
+#define prepmask \
+        VPXOR    mask, mask, mask \
+        VPCMPGTD mask, off, mask
+#define prep(index) \
+        VMOVAPD    mask, rtmp2                      \
+        VPGATHERDD rtmp2, index*4(base)(off*1), mem
+#define load(index) \
+        VMOVAPD index*32(cache), mem
+#define store(index) \
+        VMOVAPD mem, index*32(cache)
+#define roll(shift, a) \
+        VPSLLD $shift, a, rtmp1 \
+        VPSRLD $32-shift, a, a  \
+        VPOR   rtmp1, a, a
+#define ROUND1(a, b, c, d, index, const, shift) \
+        VPXOR   c, tmp, tmp            \
+        VPADDD  32*const(consts), a, a \
+        VPADDD  mem, a, a              \
+        VPAND   b, tmp, tmp            \
+        VPXOR   d, tmp, tmp            \
+        prep(index)                    \
+        VPADDD  tmp, a, a              \
+        roll(shift,a)                  \
+        VMOVAPD c, tmp                 \
+        VPADDD  b, a, a
+#define ROUND1load(a, b, c, d, index, const, shift) \
+        VPXOR   c, tmp, tmp            \
+        VPADDD  32*const(consts), a, a \
+        VPADDD  mem, a, a              \
+        VPAND   b, tmp, tmp            \
+        VPXOR   d, tmp, tmp            \
+        load(index)                    \
+        VPADDD  tmp, a, a              \
+        roll(shift,a)                  \
+        VMOVAPD c, tmp                 \
+        VPADDD  b, a, a
+#define ROUND2(a, b, c, d, index, const, shift) \
+        VPADDD  32*const(consts), a, a \
+        VPADDD  mem, a, a              \
+        VPAND   b, tmp2, tmp2          \
+        VANDNPD c, tmp, tmp            \
+        load(index)                    \
+        VPOR    tmp, tmp2, tmp2        \
+        VMOVAPD c, tmp                 \
+        VPADDD  tmp2, a, a             \
+        VMOVAPD c, tmp2                \
+        roll(shift,a)                  \
+        VPADDD  b, a, a
+#define ROUND3(a, b, c, d, index, const, shift) \
+        VPADDD  32*const(consts), a, a \
+        VPADDD  mem, a, a              \
+        load(index)                    \
+        VPXOR   d, tmp, tmp            \
+        VPXOR   b, tmp, tmp            \
+        VPADDD  tmp, a, a              \
+        roll(shift,a)                  \
+        VMOVAPD b, tmp                 \
+        VPADDD  b, a, a
+#define ROUND4(a, b, c, d, index, const, shift) \
+        VPADDD 32*const(consts), a, a \
+        VPADDD mem, a, a              \
+        VPOR   b, tmp, tmp            \
+        VPXOR  c, tmp, tmp            \
+        VPADDD tmp, a, a              \
+        load(index)                   \
+        roll(shift,a)                 \
+        VPXOR  c, ones, tmp           \
+        VPADDD b, a, a
+        // load the four digest rows into the state registers a, b, c, d
+        VMOVUPD (dig), a
+        VMOVUPD 32(dig), b
+        VMOVUPD 64(dig), c
+        VMOVUPD 96(dig), d
+        // load source buffer offsets (eight 32-bit values)
+        VMOVUPD (AX), off
+        prepmask                         // mask = lanes whose offset is > 0 (signed compare)
+        VPCMPEQD ones, ones, ones        // every bit set; ROUND4 XORs with it to get a bitwise NOT
+loop:
+        VMOVAPD a, sa
+        VMOVAPD b, sb
+        VMOVAPD c, sc
+        VMOVAPD d, sd
+        prep(0)                          // gather message word 0 of every active lane into mem
+        VMOVAPD d, tmp
+        store(0)                         // keep the word in the cache for the later rounds
+        ROUND1(a,b,c,d, 1,0x00, 7)
+        store(1)
+        ROUND1(d,a,b,c, 2,0x01,12)
+        store(2)
+        ROUND1(c,d,a,b, 3,0x02,17)
+        store(3)
+        ROUND1(b,c,d,a, 4,0x03,22)
+        store(4)
+        ROUND1(a,b,c,d, 5,0x04, 7)
+        store(5)
+        ROUND1(d,a,b,c, 6,0x05,12)
+        store(6)
+        ROUND1(c,d,a,b, 7,0x06,17)
+        store(7)
+        ROUND1(b,c,d,a, 8,0x07,22)
+        store(8)
+        ROUND1(a,b,c,d, 9,0x08, 7)
+        store(9)
+        ROUND1(d,a,b,c,10,0x09,12)
+        store(10)
+        ROUND1(c,d,a,b,11,0x0a,17)
+        store(11)
+        ROUND1(b,c,d,a,12,0x0b,22)
+        store(12)
+        ROUND1(a,b,c,d,13,0x0c, 7)
+        store(13)
+        ROUND1(d,a,b,c,14,0x0d,12)
+        store(14)
+        ROUND1(c,d,a,b,15,0x0e,17)
+        store(15)
+        ROUND1load(b,c,d,a, 1,0x0f,22)   // no gather left: reloads word 1 from the cache for ROUND2
+        VMOVAPD d, tmp
+        VMOVAPD d, tmp2
+        ROUND2(a,b,c,d, 6,0x10, 5)
+        ROUND2(d,a,b,c,11,0x11, 9)
+        ROUND2(c,d,a,b, 0,0x12,14)
+        ROUND2(b,c,d,a, 5,0x13,20)
+        ROUND2(a,b,c,d,10,0x14, 5)
+        ROUND2(d,a,b,c,15,0x15, 9)
+        ROUND2(c,d,a,b, 4,0x16,14)
+        ROUND2(b,c,d,a, 9,0x17,20)
+        ROUND2(a,b,c,d,14,0x18, 5)
+        ROUND2(d,a,b,c, 3,0x19, 9)
+        ROUND2(c,d,a,b, 8,0x1a,14)
+        ROUND2(b,c,d,a,13,0x1b,20)
+        ROUND2(a,b,c,d, 2,0x1c, 5)
+        ROUND2(d,a,b,c, 7,0x1d, 9)
+        ROUND2(c,d,a,b,12,0x1e,14)
+        ROUND2(b,c,d,a, 0,0x1f,20)
+        load(5)                          // message word 5 feeds the first ROUND3 step
+        VMOVAPD c, tmp
+        ROUND3(a,b,c,d, 8,0x20, 4)
+        ROUND3(d,a,b,c,11,0x21,11)
+        ROUND3(c,d,a,b,14,0x22,16)
+        ROUND3(b,c,d,a, 1,0x23,23)
+        ROUND3(a,b,c,d, 4,0x24, 4)
+        ROUND3(d,a,b,c, 7,0x25,11)
+        ROUND3(c,d,a,b,10,0x26,16)
+        ROUND3(b,c,d,a,13,0x27,23)
+        ROUND3(a,b,c,d, 0,0x28, 4)
+        ROUND3(d,a,b,c, 3,0x29,11)
+        ROUND3(c,d,a,b, 6,0x2a,16)
+        ROUND3(b,c,d,a, 9,0x2b,23)
+        ROUND3(a,b,c,d,12,0x2c, 4)
+        ROUND3(d,a,b,c,15,0x2d,11)
+        ROUND3(c,d,a,b, 2,0x2e,16)
+        ROUND3(b,c,d,a, 0,0x2f,23)
+        load(0)                          // message word 0 feeds the first ROUND4 step
+        VPXOR d, ones, tmp
+        ROUND4(a,b,c,d, 7,0x30, 6)
+        ROUND4(d,a,b,c,14,0x31,10)
+        ROUND4(c,d,a,b, 5,0x32,15)
+        ROUND4(b,c,d,a,12,0x33,21)
+        ROUND4(a,b,c,d, 3,0x34, 6)
+        ROUND4(d,a,b,c,10,0x35,10)
+        ROUND4(c,d,a,b, 1,0x36,15)
+        ROUND4(b,c,d,a, 8,0x37,21)
+        ROUND4(a,b,c,d,15,0x38, 6)
+        ROUND4(d,a,b,c, 6,0x39,10)
+        ROUND4(c,d,a,b,13,0x3a,15)
+        ROUND4(b,c,d,a, 4,0x3b,21)
+        ROUND4(a,b,c,d,11,0x3c, 6)
+        ROUND4(d,a,b,c, 2,0x3d,10)
+        ROUND4(c,d,a,b, 9,0x3e,15)
+        ROUND4(b,c,d,a, 0,0x3f,21)
+        VPADDD sa, a, a                  // add the state saved at the top of the loop
+        VPADDD sb, b, b
+        VPADDD sc, c, c
+        VPADDD sd, d, d
+        LEAQ 64(base), base              // advance every lane by one 64-byte block
+        SUBQ $64, count
+        JNE  loop                        // repeat until count reaches exactly zero
+        VMOVUPD a, (dig)
+        VMOVUPD b, 32(dig)
+        VMOVUPD c, 64(dig)
+        VMOVUPD d, 96(dig)
+        VZEROUPPER
+        RET
diff --git a/vendor/github.com/minio/md5-simd/block_amd64.go b/vendor/github.com/minio/md5-simd/block_amd64.go
new file mode 100644
index 0000000..16edda2
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/block_amd64.go
@@ -0,0 +1,210 @@
+//+build !noasm,!appengine,gc
+// Copyright (c) 2020 MinIO Inc. All rights reserved.
+// Use of this source code is governed by a license that can be
+// found in the LICENSE file.
+package md5simd
+import (
+        "fmt"
+        "math"
+        "unsafe"
+        "github.com/klauspost/cpuid/v2"
+)
+// hasAVX512 selects the 16-lane AVX512 code path; it needs AVX512F and AVX512DQ.
+// VANDNPD requires AVX512DQ. Technically it could be VPTERNLOGQ which is AVX512F.
+var hasAVX512 = func() bool {
+        return cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ)
+}()
+//go:noescape
+func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int) // AVX2, 8 lanes; body in block8_amd64.s
+//go:noescape
+func block16(state *uint32, base uintptr, ptrs *int32, mask uint64, n int) // 16 lanes; body is in assembly
+// digest8 holds 8 MD5 states transposed: vN[i] is state word N of lane i.
+// block8 loads v0..v3 into (ymm0, ymm1, ymm2, ymm3), so the field order is fixed.
+type digest8 struct {
+        v0, v1, v2, v3 [8]uint32
+}
+// cache8 is the stack scratch area of block8: 16 message words x 8 lanes x 4 bytes.
+// It must be 32-byte aligned, so 512+32 bytes are allocated and the
+// assembly aligns the pointer upwards at runtime.
+type cache8 [512 + 32]byte
+// MD5 magic numbers for one lane of hashing, one per step; inflated
+// 8x and 16x below (avx256md5consts, avx512md5consts) at init time.
+var md5consts = [64]uint32{
+        0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, // steps 0x00-0x0f (ROUND1)
+        0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
+        0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+        0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
+        0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, // steps 0x10-0x1f (ROUND2)
+        0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+        0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
+        0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
+        0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, // steps 0x20-0x2f (ROUND3)
+        0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
+        0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
+        0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+        0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, // steps 0x30-0x3f (ROUND4)
+        0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
+        0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+        0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
+}
+// avx256md5consts is md5consts inflated 8-way for 8x md5: each constant is
+// repeated 8 times in a row so that it fills one 256 bit ymm register.
+var avx256md5consts = func(c []uint32) []uint32 {
+        wide := make([]uint32, 0, 8*len(c))
+        for _, k := range c {
+                // one copy per lane
+                wide = append(wide, k, k, k, k, k, k, k, k)
+        }
+        return wide
+}(md5consts[:])
+// digest16 holds 16 MD5 states as 4x [16]uint32, one 512 bit zmm register per field.
+type digest16 struct {
+        v0, v1, v2, v3 [16]uint32
+}
+// avx512md5consts is md5consts inflated 16-way for 16x md5: each constant is
+// repeated 16 times in a row so that it fills one 512 bit zmm register.
+var avx512md5consts = func(c []uint32) []uint32 {
+        wide := make([]uint32, 0, 16*len(c))
+        for _, k := range c {
+                // one copy per lane
+                wide = append(wide, k, k, k, k, k, k, k, k, k, k, k, k, k, k, k, k)
+        }
+        return wide
+}(md5consts[:])
+// blockMd5_x16 is the interface function to the assembly code: it hashes up to
+// 16 lanes of input and updates the matching lanes of d in place.
+func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
+        if hasAVX512 {
+                blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
+                return
+        }
+        // No AVX512: run the AVX2 code on groups of 8 lanes instead.
+        // Preparing data using copy is slower since copies aren't inlined,
+        // hence the element-wise loops.
+        lo, hi := &s.d8a, &s.d8b
+        if half {
+                // Only lanes 0-7 are handled. Calculate on this goroutine.
+                for n := 0; n < 8; n++ {
+                        s.i8[0][n] = input[n]
+                        lo.v0[n], lo.v1[n], lo.v2[n], lo.v3[n] = d.v0[n], d.v1[n], d.v2[n], d.v3[n]
+                }
+                blockMd5_avx2(lo, s.i8[0], s.allBufs, &s.maskRounds8a)
+                for n := 0; n < 8; n++ {
+                        d.v0[n], d.v1[n], d.v2[n], d.v3[n] = lo.v0[n], lo.v1[n], lo.v2[n], lo.v3[n]
+                }
+                return
+        }
+        // Lanes 0-7 go to d8a, lanes 8-15 to d8b.
+        for n := 0; n < 8; n++ {
+                m := n + 8
+                s.i8[0][n], s.i8[1][n] = input[n], input[m]
+                lo.v0[n], lo.v1[n], lo.v2[n], lo.v3[n] = d.v0[n], d.v1[n], d.v2[n], d.v3[n]
+                hi.v0[n], hi.v1[n], hi.v2[n], hi.v3[n] = d.v0[m], d.v1[m], d.v2[m], d.v3[m]
+        }
+        // Benchmarks appears to be slightly faster when spinning up 2 goroutines instead
+        // of using the current for one of the blocks.
+        s.wg.Add(2)
+        go func() { blockMd5_avx2(lo, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
+        go func() { blockMd5_avx2(hi, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
+        s.wg.Wait()
+        // Merge both halves back into d.
+        for n := 0; n < 8; n++ {
+                m := n + 8
+                d.v0[n], d.v1[n], d.v2[n], d.v3[n] = lo.v0[n], lo.v1[n], lo.v2[n], lo.v3[n]
+                d.v0[m], d.v1[m], d.v2[m], d.v3[m] = hi.v0[n], hi.v1[n], hi.v2[n], hi.v3[n]
+        }
+}
+// blockMd5_avx512 hashes up to 16 lanes of input (all located inside base) via block16 and updates s.
+func blockMd5_avx512(s *digest16, input [16][]byte, base []byte, maskRounds *[16]maskRounds) {
+        baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) // lane offsets are relative to base[0]
+        ptrs := [16]int32{}
+        for i := range ptrs {
+                if len(input[i]) > 0 {
+                        if len(input[i]) > internalBlockSize {
+                                panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
+                        }
+                        off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
+                        if off > math.MaxInt32 { // ptrs holds int32: anything larger would wrap negative below
+                                panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
+                        }
+                        ptrs[i] = int32(off)
+                }
+        }
+        sdup := *s // create copy of initial states to receive intermediate updates
+        rounds := generateMaskAndRounds16(input, maskRounds)
+        for r := 0; r < rounds; r++ {
+                m := maskRounds[r]
+                block16(&sdup.v0[0], uintptr(baseMin), &ptrs[0], m.mask, int(64*m.rounds))
+                for j := 0; j < len(ptrs); j++ {
+                        ptrs[j] += int32(64 * m.rounds) // update pointers for next round
+                        if m.mask&(1<<j) != 0 {         // update digest if still masked as active
+                                (*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
+                        }
+                }
+        }
+}
+// blockMd5_avx2 hashes up to 8 lanes of input (all located inside base) via block8 and updates s.
+func blockMd5_avx2(s *digest8, input [8][]byte, base []byte, maskRounds *[8]maskRounds) {
+        baseMin := uint64(uintptr(unsafe.Pointer(&(base[0])))) - 4 // 4 early: block8 only gathers lanes with offset > 0
+        ptrs := [8]int32{}
+        for i := range ptrs {
+                if len(input[i]) > 0 {
+                        if len(input[i]) > internalBlockSize {
+                                panic(fmt.Sprintf("Sanity check fails for lane %d: maximum input length cannot exceed internalBlockSize", i))
+                        }
+                        off := uint64(uintptr(unsafe.Pointer(&(input[i][0])))) - baseMin
+                        if off > math.MaxInt32 { // block8 treats offsets as signed: a wrapped value would disable the lane
+                                panic(fmt.Sprintf("invalid buffer sent with offset %x", off))
+                        }
+                        ptrs[i] = int32(off)
+                }
+        }
+        sdup := *s // create copy of initial states to receive intermediate updates
+        rounds := generateMaskAndRounds8(input, maskRounds)
+        for r := 0; r < rounds; r++ {
+                m := maskRounds[r]
+                var cache cache8 // stack storage for block8 tmp state
+                block8(&sdup.v0[0], uintptr(baseMin), &ptrs[0], &cache[0], int(64*m.rounds))
+                for j := 0; j < len(ptrs); j++ {
+                        ptrs[j] += int32(64 * m.rounds) // update pointers for next round
+                        if m.mask&(1<<j) != 0 {         // update digest if still masked as active
+                                (*s).v0[j], (*s).v1[j], (*s).v2[j], (*s).v3[j] = sdup.v0[j], sdup.v1[j], sdup.v2[j], sdup.v3[j]
+                        }
+                }
+        }
+}
diff --git a/vendor/github.com/minio/md5-simd/md5-digest_amd64.go b/vendor/github.com/minio/md5-simd/md5-digest_amd64.go
new file mode 100644
index 0000000..5ea23a4
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5-digest_amd64.go
@@ -0,0 +1,188 @@
+//+build !noasm,!appengine,gc
+// Copyright (c) 2020 MinIO Inc. All rights reserved.
+// Use of this source code is governed by a license that can be
+// found in the LICENSE file.
+package md5simd
+import (
+        "encoding/binary"
+        "errors"
+        "fmt"
+        "sync"
+        "sync/atomic"
+)
+// md5Digest - Type for computing MD5 using either AVX2 or AVX512
+type md5Digest struct {
+        uid         uint64          // identifies this client towards the server
+        blocksCh    chan blockInput // blocks queued for the server; nil once closed
+        cycleServer chan uint64     // receives uid to tell the server there is input to pick up
+        x           [BlockSize]byte // pending input that does not fill a block yet
+        nx          int             // number of valid bytes in x
+        len         uint64          // bytes written since creation or the last Reset
+        buffers     <-chan []byte   // the server's pool of preallocated buffers
+}
+// NewHash - initialize instance for Md5 implementation.
+func (s *md5Server) NewHash() Hasher {
+        h := &md5Digest{
+                uid:         atomic.AddUint64(&s.uidCounter, 1),
+                buffers:     s.buffers,
+                blocksCh:    make(chan blockInput, buffersPerLane),
+                cycleServer: s.cycle,
+        }
+        // Register the receiving end of the block channel under the same uid.
+        s.newInput <- newClient{
+                uid:   h.uid,
+                input: h.blocksCh,
+        }
+        return h
+}
+// Size - Return size of checksum in bytes
+func (d *md5Digest) Size() int { return Size }
+// BlockSize - Return blocksize of checksum (pointer receiver, so the digest is not copied)
+func (d *md5Digest) BlockSize() int { return BlockSize }
+// Reset drops buffered input and queues a reset message for the server.
+func (d *md5Digest) Reset() {
+        if d.blocksCh == nil {
+                panic("reset after close")
+        }
+        d.nx, d.len = 0, 0
+        d.sendBlock(blockInput{uid: d.uid, reset: true}, false)
+}
+// Write adds p to the digest and returns len(p).
+// The only error is reported when the digest has already been closed.
+func (d *md5Digest) Write(p []byte) (nn int, err error) {
+        if d.blocksCh == nil {
+                return 0, errors.New("md5Digest closed")
+        }
+        // break input into chunks of maximum internalBlockSize size
+        for {
+                chunk := p
+                if len(chunk) > internalBlockSize {
+                        chunk = chunk[:internalBlockSize]
+                }
+                written, werr := d.write(chunk)
+                if werr != nil {
+                        return nn, werr
+                }
+                nn += written
+                p = p[len(chunk):]
+                if len(p) == 0 {
+                        return nn, nil
+                }
+        }
+}
+// write handles one chunk: whole blocks are copied into a pooled buffer and sent
+// to the server, while a trailing partial block is kept in d.x for the next call.
+func (d *md5Digest) write(p []byte) (nn int, err error) {
+        nn = len(p)
+        d.len += uint64(nn)
+        if d.nx > 0 {
+                // Top up the partial block left over from an earlier call.
+                used := copy(d.x[d.nx:], p)
+                d.nx += used
+                p = p[used:]
+                if d.nx == BlockSize {
+                        // Send a copy: the block travels async over the channel while d.x is reused below.
+                        blk := (<-d.buffers)[:BlockSize]
+                        copy(blk, d.x[:])
+                        d.sendBlock(blockInput{uid: d.uid, msg: blk}, len(p) < BlockSize)
+                        d.nx = 0
+                }
+        }
+        if len(p) >= BlockSize {
+                full := len(p) &^ (BlockSize - 1)
+                blk := (<-d.buffers)[:full]
+                copy(blk, p)
+                d.sendBlock(blockInput{uid: d.uid, msg: blk}, len(p)-full < BlockSize)
+                p = p[full:]
+        }
+        if len(p) > 0 {
+                d.nx = copy(d.x[:], p)
+        }
+        return nn, nil
+}
+func (d *md5Digest) Close() {
+        if ch := d.blocksCh; ch != nil {
+                d.blocksCh = nil // from now on Write fails and Sum/Reset panic
+                close(ch)        // the server treats the closed channel as a disconnect
+        }
+}
+// sumChPool recycles the one-slot channels on which Sum receives its result.
+var sumChPool = sync.Pool{
+        New: func() interface{} {
+                return make(chan sumResult, 1)
+        },
+}
+// Sum - Return MD5 sum in bytes
+func (d *md5Digest) Sum(in []byte) (result []byte) {
+        if d.blocksCh == nil {
+                panic("sum after close")
+        }
+        // Build the final block(s) in a pooled buffer, starting with the pending bytes.
+        trail := append((<-d.buffers)[:0], d.x[:d.nx]...)
+        // Padding.  Add a 1 bit and 0 bits until 56 bytes mod 64.
+        var pad [64]byte
+        pad[0] = 0x80
+        padLen := 64 + 56 - d.len%64
+        if d.len%64 < 56 {
+                padLen = 56 - d.len%64
+        }
+        trail = append(trail, pad[:padLen]...)
+        // Length in bits.
+        var bits [8]byte
+        binary.LittleEndian.PutUint64(bits[:], d.len<<3)
+        trail = append(trail, bits[:]...)
+        if len(trail)%BlockSize != 0 {
+                panic(fmt.Errorf("internal error: sum block was not aligned. len=%d, nx=%d", len(trail), d.nx))
+        }
+        // The server answers on sumCh once the trailing block(s) are hashed.
+        sumCh := sumChPool.Get().(chan sumResult)
+        d.sendBlock(blockInput{uid: d.uid, msg: trail, sumCh: sumCh}, true)
+        sum := <-sumCh
+        sumChPool.Put(sumCh)
+        return append(in, sum.digest[:]...)
+}
+// sendBlock will send a block for processing.
+// If cycle is true the block is queued and the server is always told to
+// cycle. Otherwise the server is only told to cycle when the block channel
+// is full, right before blocking on the send.
+func (d *md5Digest) sendBlock(bi blockInput, cycle bool) {
+        if cycle {
+                // Queue first, then announce that this uid has input waiting.
+                d.blocksCh <- bi
+                d.cycleServer <- d.uid
+                return
+        }
+        select {
+        case d.blocksCh <- bi:
+                // Queued without blocking; the server is not notified.
+        default:
+                // Only block on cycle if we filled the buffer
+                d.cycleServer <- d.uid
+                d.blocksCh <- bi
+        }
+}
diff --git a/vendor/github.com/minio/md5-simd/md5-server_amd64.go b/vendor/github.com/minio/md5-simd/md5-server_amd64.go
new file mode 100644
index 0000000..94f741c
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5-server_amd64.go
@@ -0,0 +1,397 @@
+//+build !noasm,!appengine,gc
+// Copyright (c) 2020 MinIO Inc. All rights reserved.
+// Use of this source code is governed by a license that can be
+// found in the LICENSE file.
+package md5simd
+import (
+        "encoding/binary"
+        "fmt"
+        "runtime"
+        "sync"
+        "github.com/klauspost/cpuid/v2"
+)
+// MD5 initialization constants
+const (
+        // Lanes is the number of concurrently calculated hashes.
+        Lanes = 16
+        init0 = 0x67452301 // initial value of state word 0
+        init1 = 0xefcdab89 // initial value of state word 1
+        init2 = 0x98badcfe // initial value of state word 2
+        init3 = 0x10325476 // initial value of state word 3
+        // Use scalar routine when below this many lanes
+        useScalarBelow = 3
+)
+// md5ServerUID - Does not start at 0 but at the next multiple of 16 so as to be able to
+// differentiate it from the default initialisation value of 0
+const md5ServerUID = Lanes
+const buffersPerLane = 3 // buffers preallocated per lane; also the capacity of each client's block channel
+// Message to send across input channel
+type blockInput struct {
+        uid   uint64         // client the message belongs to
+        msg   []byte         // data to hash
+        sumCh chan sumResult // when non-nil, the final digest is sent here
+        reset bool           // when set, the server drops the digest state kept for uid
+}
+type sumResult struct {
+        digest [Size]byte // final state, written as four little-endian uint32 words
+}
+type lanesInfo [Lanes]blockInput // one pending block per lane
+// md5Server - Type to implement parallel handling of MD5 invocations
+type md5Server struct {
+        uidCounter   uint64                // last uid handed out; incremented atomically by NewHash
+        cycle        chan uint64           // client with uid has update.
+        newInput     chan newClient        // Add new client.
+        digests      map[uint64][Size]byte // Map of uids to (interim) digest results
+        maskRounds16 [16]maskRounds        // Pre-allocated static array for max 16 rounds
+        maskRounds8a [8]maskRounds         // Pre-allocated static array for max 8 rounds (1st AVX2 core)
+        maskRounds8b [8]maskRounds         // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
+        allBufs      []byte                // Preallocated buffer.
+        buffers      chan []byte           // Preallocated buffers, sliced from allBufs.
+        i8       [2][8][]byte   // avx2 temporary vars
+        d8a, d8b digest8        // digests of lanes 0-7 and 8-15 on the AVX2 path
+        wg       sync.WaitGroup // waits for the two AVX2 goroutines in blockMd5_x16
+}
+// NewServer - Create new object for parallel processing handling
+func NewServer() Server {
+        if !cpuid.CPU.Supports(cpuid.AVX2) {
+                return &fallbackServer{}
+        }
+        srv := &md5Server{
+                digests:    make(map[uint64][Size]byte),
+                newInput:   make(chan newClient, Lanes),
+                cycle:      make(chan uint64, Lanes*10),
+                uidCounter: md5ServerUID - 1,
+                allBufs:    make([]byte, 32+buffersPerLane*Lanes*internalBlockSize),
+                buffers:    make(chan []byte, buffersPerLane*Lanes),
+        }
+        // Fill buffers: cut allBufs into equal pieces, leaving its first 32 bytes unused.
+        for i := 0; i < buffersPerLane*Lanes; i++ {
+                lo := 32 + i*internalBlockSize
+                srv.buffers <- srv.allBufs[lo : lo+internalBlockSize : lo+internalBlockSize]
+        }
+        go srv.process(srv.newInput) // single goroutine reading from the input channel
+        return srv
+}
+type newClient struct {
+        uid   uint64          // id the client uses in its blockInput messages
+        input chan blockInput // channel the server reads the client's blocks from
+}
+// process - Sole handler for reading from the input channel.
+// It runs until newClients is closed, collecting at most one pending block per
+// client into lanes and handing each batch to s.blocks.
+func (s *md5Server) process(newClients chan newClient) {
+        // To fill up as many lanes as possible:
+        //
+        // 1. Wait for a cycle id.
+        // 2. If not already in a lane, add, otherwise leave on channel
+        // 3. Start timer
+        // 4. Check if lanes is full, if so, goto 10 (process).
+        // 5. If timeout, goto 10.
+        // 6. Wait for new id (goto 2)  or timeout (goto 10).
+        // 10. Process.
+        // 11. Check all input if there is already input, if so add to lanes.
+        // 12. Goto 1
+        // lanes contains the lanes.
+        var lanes lanesInfo
+        // lanesFilled contains the number of filled lanes for current cycle.
+        var lanesFilled int
+        // clients contains active clients
+        var clients = make(map[uint64]chan blockInput, Lanes)
+        addToLane := func(uid uint64) {
+                cl, ok := clients[uid]
+                if !ok {
+                        // Unknown client. Maybe it was already removed.
+                        return
+                }
+                // Check if we already have it.
+                for _, lane := range lanes[:lanesFilled] {
+                        if lane.uid == uid {
+                                return
+                        }
+                }
+                // Continue until we get a block or there is nothing on channel
+                for {
+                        select {
+                        case block, ok := <-cl:
+                                if !ok {
+                                        // Client disconnected. A receive from a closed channel
+                                        // yields the zero blockInput, so block.uid is 0 here:
+                                        // use uid to drop the client and its stored digest.
+                                        delete(clients, uid)
+                                        delete(s.digests, uid)
+                                        return
+                                }
+                                if block.uid != uid {
+                                        panic(fmt.Errorf("uid mismatch, %d (block) != %d (client)", block.uid, uid))
+                                }
+                                // If reset message, drop the stored digest and keep reading
+                                if block.reset {
+                                        delete(s.digests, uid)
+                                        continue
+                                }
+                                // If requesting sum, we will need to maintain state.
+                                // The sum is finished here with the scalar routine; no lane is used.
+                                if block.sumCh != nil {
+                                        var dig digest
+                                        d, ok := s.digests[uid]
+                                        if ok {
+                                                dig.s[0] = binary.LittleEndian.Uint32(d[0:4])
+                                                dig.s[1] = binary.LittleEndian.Uint32(d[4:8])
+                                                dig.s[2] = binary.LittleEndian.Uint32(d[8:12])
+                                                dig.s[3] = binary.LittleEndian.Uint32(d[12:16])
+                                        } else {
+                                                dig.s[0], dig.s[1], dig.s[2], dig.s[3] = init0, init1, init2, init3
+                                        }
+                                        sum := sumResult{}
+                                        // Add end block to current digest.
+                                        blockScalar(&dig.s, block.msg)
+                                        binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0])
+                                        binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1])
+                                        binary.LittleEndian.PutUint32(sum.digest[8:], dig.s[2])
+                                        binary.LittleEndian.PutUint32(sum.digest[12:], dig.s[3])
+                                        block.sumCh <- sum
+                                        // Hand the buffer back to the pool.
+                                        if block.msg != nil {
+                                                s.buffers <- block.msg
+                                        }
+                                        continue
+                                }
+                                if len(block.msg) == 0 {
+                                        continue
+                                }
+                                lanes[lanesFilled] = block
+                                lanesFilled++
+                                return
+                        default:
+                                return
+                        }
+                }
+        }
+        addNewClient := func(cl newClient) {
+                if _, ok := clients[cl.uid]; ok {
+                        panic("internal error: duplicate client registration")
+                }
+                clients[cl.uid] = cl.input
+        }
+        allLanesFilled := func() bool {
+                return lanesFilled == Lanes || lanesFilled >= len(clients)
+        }
+        for {
+                // Step 1.
+                for lanesFilled == 0 {
+                        select {
+                        case cl, ok := <-newClients:
+                                if !ok {
+                                        return
+                                }
+                                addNewClient(cl)
+                                // Check if it already sent a payload.
+                                addToLane(cl.uid)
+                                continue
+                        case uid := <-s.cycle:
+                                addToLane(uid)
+                        }
+                }
+                // Keep adding lanes for as long as something is queued (no timer is used).
+        fillLanes:
+                for !allLanesFilled() {
+                        select {
+                        case cl, ok := <-newClients:
+                                if !ok {
+                                        return
+                                }
+                                addNewClient(cl)
+                        case uid := <-s.cycle:
+                                addToLane(uid)
+                        default:
+                                // Nothing more queued...
+                                break fillLanes
+                        }
+                }
+                // If we did not fill all lanes, check if there is more waiting
+                if !allLanesFilled() {
+                        runtime.Gosched()
+                        for uid := range clients {
+                                addToLane(uid)
+                                if allLanesFilled() {
+                                        break
+                                }
+                        }
+                }
+                // Process the lanes we could collect
+                s.blocks(lanes[:lanesFilled])
+                // Clear lanes...
+                lanesFilled = 0
+                // Add all current queued
+                for uid := range clients {
+                        addToLane(uid)
+                        if allLanesFilled() {
+                                break
+                        }
+                }
+        }
+}
+// Close closes the newInput channel if it is still set and then clears the
+// field, so a repeated call does nothing.
+func (s *md5Server) Close() {
+        if s.newInput == nil {
+                return
+        }
+        close(s.newInput)
+        s.newInput = nil
+}
+// blocks hashes the pending message of every lane and sends the results back:
+// the updated state is stored in s.digests under the lane's uid, a non-nil
+// message buffer is handed to s.buffers, and the lane slot is zeroed.
+// Fewer than useScalarBelow lanes are hashed with blockScalar (one goroutine
+// per lane when there is more than one); otherwise blockMd5_x16 is invoked.
+func (s *md5Server) blocks(lanes []blockInput) {
+        // restore yields the state saved for a lane's uid, or the init0..init3
+        // starting values when s.digests holds no entry for it.
+        restore := func(in blockInput) (d digest) {
+                saved, found := s.digests[in.uid]
+                if !found {
+                        d.s[0], d.s[1], d.s[2], d.s[3] = init0, init1, init2, init3
+                        return d
+                }
+                for w := range d.s {
+                        d.s[w] = binary.LittleEndian.Uint32(saved[4*w:])
+                }
+                return d
+        }
+        // publish stores the final state words for lanes[i], hands back the
+        // lane's buffer (if any) and clears the slot.
+        publish := func(i int, st [4]uint32) {
+                in := lanes[i]
+                var out [Size]byte
+                for w, v := range st {
+                        binary.LittleEndian.PutUint32(out[4*w:], v)
+                }
+                s.digests[in.uid] = out
+                if in.msg != nil {
+                        s.buffers <- in.msg
+                }
+                lanes[i] = blockInput{}
+        }
+        if len(lanes) < useScalarBelow {
+                // Use scalar routine when below this many lanes
+                switch len(lanes) {
+                case 0:
+                        // Nothing to hash.
+                case 1:
+                        in := lanes[0]
+                        d := restore(in)
+                        if len(in.msg) > 0 {
+                                blockScalar(&d.s, in.msg)
+                        }
+                        publish(0, d.s)
+                default:
+                        // Hash the lanes concurrently; s.digests is only read until
+                        // every goroutine has finished, and written afterwards.
+                        var results [useScalarBelow]digest
+                        s.wg.Add(len(lanes))
+                        for i := range lanes {
+                                go func(i int, in blockInput) {
+                                        defer s.wg.Done()
+                                        d := restore(in)
+                                        if len(in.msg) > 0 {
+                                                blockScalar(&d.s, in.msg)
+                                        }
+                                        results[i] = d
+                                }(i, lanes[i])
+                        }
+                        s.wg.Wait()
+                        for i := range lanes {
+                                publish(i, results[i].s)
+                        }
+                }
+                return
+        }
+        var inputs [16][]byte
+        for i := range lanes {
+                inputs[i] = lanes[i].msg
+        }
+        // Collect active digests, then process all lanes in one call.
+        state := s.getDigests(lanes)
+        s.blockMd5_x16(&state, inputs, len(lanes) <= 8)
+        for i := range lanes {
+                publish(i, [4]uint32{state.v0[i], state.v1[i], state.v2[i], state.v3[i]})
+        }
+}
+// getDigests builds the per-lane state for the given lanes: lane i receives
+// the words stored in s.digests for its uid, or init0..init3 when no entry
+// exists yet. Slots beyond len(lanes) stay zero.
+func (s *md5Server) getDigests(lanes []blockInput) (d digest16) {
+        for i := range lanes {
+                saved, found := s.digests[lanes[i].uid]
+                if !found {
+                        d.v0[i], d.v1[i], d.v2[i], d.v3[i] = init0, init1, init2, init3
+                        continue
+                }
+                d.v0[i] = binary.LittleEndian.Uint32(saved[0:4])
+                d.v1[i] = binary.LittleEndian.Uint32(saved[4:8])
+                d.v2[i] = binary.LittleEndian.Uint32(saved[8:12])
+                d.v3[i] = binary.LittleEndian.Uint32(saved[12:16])
+        }
+        return d
+}
diff --git a/vendor/github.com/minio/md5-simd/md5-server_fallback.go b/vendor/github.com/minio/md5-simd/md5-server_fallback.go
new file mode 100644
index 0000000..7814dad
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5-server_fallback.go
@@ -0,0 +1,12 @@
+//+build !amd64 appengine !gc noasm
+// Copyright (c) 2020 MinIO Inc. All rights reserved.
+// Use of this source code is governed by a license that can be
+// found in the LICENSE file.
+package md5simd
+// NewServer returns the fallback server, which is selected by this file's
+// build constraints when the assembly implementation is not built.
+func NewServer() *fallbackServer {
+        return new(fallbackServer)
+}
diff --git a/vendor/github.com/minio/md5-simd/md5-util_amd64.go b/vendor/github.com/minio/md5-simd/md5-util_amd64.go
new file mode 100644
index 0000000..73981b0
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5-util_amd64.go
@@ -0,0 +1,85 @@
+//+build !noasm,!appengine,gc
+// Copyright (c) 2020 MinIO Inc. All rights reserved.
+// Use of this source code is governed by a license that can be
+// found in the LICENSE file.
+package md5simd
+// lane is a helper struct for sorting blocks based on length. It pairs the
+// byte length of one input with that input's index, so the index survives
+// the sort.
+type lane struct {
+        len uint // length of the input in bytes
+        pos uint // index of the input in the unsorted input array
+}
+// digest holds the four 32-bit state words of a single hash.
+type digest struct {
+        s [4]uint32
+}
+// maskRounds is a helper struct for generating the number of rounds in
+// combination with a mask for valid lanes. One entry describes a run of
+// 64-byte blocks that share the same set of lanes.
+type maskRounds struct {
+        mask   uint64 // bit i is set while input i has not been passed in length order
+        rounds uint64 // number of 64-byte blocks covered by this mask
+}
+// generateMaskAndRounds8 orders the 8 inputs by length and fills mr with one
+// entry per distinct block count: the lane mask valid for that stretch and the
+// number of additional 64-byte blocks it covers. It returns how many entries
+// of mr were written.
+func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
+        // Insertion sort, shortest input first; equal lengths keep input order.
+        var sorted [8]lane
+        for idx := range input {
+                cur := lane{uint(len(input[idx])), uint(idx)}
+                j := idx
+                for j > 0 && sorted[j-1].len > cur.len {
+                        sorted[j] = sorted[j-1]
+                        j--
+                }
+                sorted[j] = cur
+        }
+        // Walk from shortest to longest, emitting a mask whenever the block
+        // count grows and dropping each lane from the mask once it is passed.
+        mask, done := uint64(0xff), uint64(0)
+        for _, ln := range sorted {
+                if ln.len > 0 {
+                        blocks := uint64(ln.len) >> 6
+                        if blocks > done {
+                                mr[rounds] = maskRounds{mask, blocks - done}
+                                rounds++
+                        }
+                        done = blocks
+                }
+                mask &^= 1 << ln.pos
+        }
+        return rounds
+}
+// generateMaskAndRounds16 orders the 16 inputs by length and fills mr with one
+// entry per distinct block count: the lane mask valid for that stretch and the
+// number of additional 64-byte blocks it covers. It returns how many entries
+// of mr were written.
+func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) {
+        // Insertion sort, shortest input first; equal lengths keep input order.
+        var sorted [16]lane
+        for idx := range input {
+                cur := lane{uint(len(input[idx])), uint(idx)}
+                j := idx
+                for j > 0 && sorted[j-1].len > cur.len {
+                        sorted[j] = sorted[j-1]
+                        j--
+                }
+                sorted[j] = cur
+        }
+        // Walk from shortest to longest, emitting a mask whenever the block
+        // count grows and dropping each lane from the mask once it is passed.
+        mask, done := uint64(0xffff), uint64(0)
+        for _, ln := range sorted {
+                if ln.len > 0 {
+                        blocks := uint64(ln.len) >> 6
+                        if blocks > done {
+                                mr[rounds] = maskRounds{mask, blocks - done}
+                                rounds++
+                        }
+                        done = blocks
+                }
+                mask &^= 1 << ln.pos
+        }
+        return rounds
+}
diff --git a/vendor/github.com/minio/md5-simd/md5.go b/vendor/github.com/minio/md5-simd/md5.go
new file mode 100644
index 0000000..11b0cb9
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5.go
@@ -0,0 +1,63 @@
+package md5simd
+import (
+        "crypto/md5"
+        "hash"
+        "sync"
+)
+const (
+        // The blocksize of MD5 in bytes.
+        BlockSize = 64
+        // The size of an MD5 checksum in bytes.
+        Size = 16
+        // internalBlockSize is the internal block size.
+        internalBlockSize = 32 << 10
+)
+// Server is a factory for Hasher values.
+type Server interface {
+        // NewHash returns a new Hasher.
+        NewHash() Hasher
+        // Close is called when the server is no longer needed. For the
+        // fallbackServer in this file it does nothing.
+        Close()
+}
+// Hasher is a hash.Hash with an additional Close method.
+type Hasher interface {
+        hash.Hash
+        // Close releases the hasher. For md5Wrapper in this file it resets the
+        // wrapped hash, returns it to a pool and clears the reference, so the
+        // Hasher must not be used after Close.
+        Close()
+}
+// StdlibHasher returns a Hasher that uses the stdlib for hashing.
+// Used hashers are stored in a pool for fast reuse: Close resets a hasher and
+// puts it into md5Pool, and this function takes one out again with Get, which
+// only falls back to allocating a new hasher when the pool is empty.
+func StdlibHasher() Hasher {
+        // Get, not New: calling the pool's New field directly always allocates
+        // and never reuses the hashers that Close handed back.
+        return &md5Wrapper{Hash: md5Pool.Get().(hash.Hash)}
+}
+// md5Wrapper is a wrapper around the builtin hasher. It embeds hash.Hash and
+// adds the Close method required by the Hasher interface.
+type md5Wrapper struct {
+        hash.Hash
+}
+// md5Pool holds the stdlib MD5 hashers that md5Wrapper.Close has reset and
+// handed back. Its New function creates a fresh hasher with md5.New.
+var md5Pool = sync.Pool{New: func() interface{} {
+        return md5.New()
+}}
+// fallbackServer - Fallback when no assembly is available.
+// It has no state; its hashers wrap the stdlib MD5 implementation.
+type fallbackServer struct {
+}
+// NewHash returns a Hasher backed by the regular Go crypto/md5
+// implementation. The underlying hasher is taken from md5Pool, so one that an
+// earlier Close reset and returned is reused before a new one is allocated.
+func (s *fallbackServer) NewHash() Hasher {
+        // Get, not New: calling the pool's New field directly always allocates
+        // and never reuses the hashers that Close handed back.
+        return &md5Wrapper{Hash: md5Pool.Get().(hash.Hash)}
+}
+// Close does nothing; the fallback server holds no resources.
+func (s *fallbackServer) Close() {
+}
+// Close resets the wrapped hash, puts it into md5Pool and drops the reference
+// to it. The wrapper must not be used afterwards; a second Close is a no-op.
+func (m *md5Wrapper) Close() {
+        if m.Hash == nil {
+                return
+        }
+        m.Reset()
+        md5Pool.Put(m.Hash)
+        m.Hash = nil
+}
diff --git a/vendor/github.com/minio/md5-simd/md5block_amd64.go b/vendor/github.com/minio/md5-simd/md5block_amd64.go
new file mode 100644
index 0000000..4c27936
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5block_amd64.go
@@ -0,0 +1,11 @@
+// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
+// +build !appengine
+// +build !noasm
+// +build gc
+package md5simd
+// blockScalar updates the state in dig with every complete 64-byte block of
+// p. Bytes after the last complete block are ignored, and dig is left
+// untouched when p is shorter than 64 bytes. Implemented in md5block_amd64.s.
+//go:noescape
+func blockScalar(dig *[4]uint32, p []byte)
diff --git a/vendor/github.com/minio/md5-simd/md5block_amd64.s b/vendor/github.com/minio/md5-simd/md5block_amd64.s
new file mode 100644
index 0000000..fbc4a21
--- /dev/null
+++ b/vendor/github.com/minio/md5-simd/md5block_amd64.s
@@ -0,0 +1,714 @@
+// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
+// +build !appengine
+// +build !noasm
+// +build gc
+// func blockScalar(dig *[4]uint32, p []byte)
+//
+// Runs the MD5 block function over every complete 64-byte block of p and
+// writes the resulting state back to dig. If p holds no complete block the
+// function returns without reading or writing dig.
+//
+// The frame size is 8 rather than 0 because BP is used as a general-purpose
+// register below: with a non-zero frame the assembler saves BP in the prologue
+// and restores it before RET, so the caller's frame pointer is preserved.
+// This file is generated; the generator should get the same change.
+//
+// Register use:
+//   AX           end pointer: p_base + (p_len rounded down to a multiple of 64)
+//   DX           pointer to the current 64-byte block
+//   BX,BP,SI,CX  state words dig[0], dig[1], dig[2], dig[3]
+//   DI           constant 0xffffffff (XOR with it gives bitwise NOT)
+//   R8           message word for the upcoming step
+//   R9, R14      scratch for the round function
+//   R10-R13      copy of the state at the start of the current block
+TEXT ·blockScalar(SB), $8-32
+        MOVQ p_len+16(FP), AX
+        MOVQ dig+0(FP), CX
+        MOVQ p_base+8(FP), DX
+        SHRQ $0x06, AX
+        SHLQ $0x06, AX
+        LEAQ (DX)(AX*1), AX
+        CMPQ DX, AX
+        JEQ  end
+        MOVL (CX), BX
+        MOVL 4(CX), BP
+        MOVL 8(CX), SI
+        MOVL 12(CX), CX
+        MOVL $0xffffffff, DI
+loop:
+        // Load message word 0 and remember the incoming state.
+        MOVL (DX), R8
+        MOVL CX, R9
+        MOVL BX, R10
+        MOVL BP, R11
+        MOVL SI, R12
+        MOVL CX, R13
+        // ROUND1
+        // Each step forms ((c ^ d) & b) ^ d in R9 and loads the next message
+        // word into R8 while the current one is being consumed.
+        XORL SI, R9
+        ADDL $0xd76aa478, BX
+        ADDL R8, BX
+        ANDL BP, R9
+        XORL CX, R9
+        MOVL 4(DX), R8
+        ADDL R9, BX
+        ROLL $0x07, BX
+        MOVL SI, R9
+        ADDL BP, BX
+        XORL BP, R9
+        ADDL $0xe8c7b756, CX
+        ADDL R8, CX
+        ANDL BX, R9
+        XORL SI, R9
+        MOVL 8(DX), R8
+        ADDL R9, CX
+        ROLL $0x0c, CX
+        MOVL BP, R9
+        ADDL BX, CX
+        XORL BX, R9
+        ADDL $0x242070db, SI
+        ADDL R8, SI
+        ANDL CX, R9
+        XORL BP, R9
+        MOVL 12(DX), R8
+        ADDL R9, SI
+        ROLL $0x11, SI
+        MOVL BX, R9
+        ADDL CX, SI
+        XORL CX, R9
+        ADDL $0xc1bdceee, BP
+        ADDL R8, BP
+        ANDL SI, R9
+        XORL BX, R9
+        MOVL 16(DX), R8
+        ADDL R9, BP
+        ROLL $0x16, BP
+        MOVL CX, R9
+        ADDL SI, BP
+        XORL SI, R9
+        ADDL $0xf57c0faf, BX
+        ADDL R8, BX
+        ANDL BP, R9
+        XORL CX, R9
+        MOVL 20(DX), R8
+        ADDL R9, BX
+        ROLL $0x07, BX
+        MOVL SI, R9
+        ADDL BP, BX
+        XORL BP, R9
+        ADDL $0x4787c62a, CX
+        ADDL R8, CX
+        ANDL BX, R9
+        XORL SI, R9
+        MOVL 24(DX), R8
+        ADDL R9, CX
+        ROLL $0x0c, CX
+        MOVL BP, R9
+        ADDL BX, CX
+        XORL BX, R9
+        ADDL $0xa8304613, SI
+        ADDL R8, SI
+        ANDL CX, R9
+        XORL BP, R9
+        MOVL 28(DX), R8
+        ADDL R9, SI
+        ROLL $0x11, SI
+        MOVL BX, R9
+        ADDL CX, SI
+        XORL CX, R9
+        ADDL $0xfd469501, BP
+        ADDL R8, BP
+        ANDL SI, R9
+        XORL BX, R9
+        MOVL 32(DX), R8
+        ADDL R9, BP
+        ROLL $0x16, BP
+        MOVL CX, R9
+        ADDL SI, BP
+        XORL SI, R9
+        ADDL $0x698098d8, BX
+        ADDL R8, BX
+        ANDL BP, R9
+        XORL CX, R9
+        MOVL 36(DX), R8
+        ADDL R9, BX
+        ROLL $0x07, BX
+        MOVL SI, R9
+        ADDL BP, BX
+        XORL BP, R9
+        ADDL $0x8b44f7af, CX
+        ADDL R8, CX
+        ANDL BX, R9
+        XORL SI, R9
+        MOVL 40(DX), R8
+        ADDL R9, CX
+        ROLL $0x0c, CX
+        MOVL BP, R9
+        ADDL BX, CX
+        XORL BX, R9
+        ADDL $0xffff5bb1, SI
+        ADDL R8, SI
+        ANDL CX, R9
+        XORL BP, R9
+        MOVL 44(DX), R8
+        ADDL R9, SI
+        ROLL $0x11, SI
+        MOVL BX, R9
+        ADDL CX, SI
+        XORL CX, R9
+        ADDL $0x895cd7be, BP
+        ADDL R8, BP
+        ANDL SI, R9
+        XORL BX, R9
+        MOVL 48(DX), R8
+        ADDL R9, BP
+        ROLL $0x16, BP
+        MOVL CX, R9
+        ADDL SI, BP
+        XORL SI, R9
+        ADDL $0x6b901122, BX
+        ADDL R8, BX
+        ANDL BP, R9
+        XORL CX, R9
+        MOVL 52(DX), R8
+        ADDL R9, BX
+        ROLL $0x07, BX
+        MOVL SI, R9
+        ADDL BP, BX
+        XORL BP, R9
+        ADDL $0xfd987193, CX
+        ADDL R8, CX
+        ANDL BX, R9
+        XORL SI, R9
+        MOVL 56(DX), R8
+        ADDL R9, CX
+        ROLL $0x0c, CX
+        MOVL BP, R9
+        ADDL BX, CX
+        XORL BX, R9
+        ADDL $0xa679438e, SI
+        ADDL R8, SI
+        ANDL CX, R9
+        XORL BP, R9
+        MOVL 60(DX), R8
+        ADDL R9, SI
+        ROLL $0x11, SI
+        MOVL BX, R9
+        ADDL CX, SI
+        XORL CX, R9
+        ADDL $0x49b40821, BP
+        ADDL R8, BP
+        ANDL SI, R9
+        XORL BX, R9
+        MOVL 4(DX), R8
+        ADDL R9, BP
+        ROLL $0x16, BP
+        MOVL CX, R9
+        ADDL SI, BP
+        // ROUND2
+        // Each step forms (b & d) | (c & ~d): R14 holds b & d, R9 holds
+        // c & ~d (the NOT is done by XOR with DI).
+        MOVL CX, R9
+        MOVL CX, R14
+        XORL DI, R9
+        ADDL $0xf61e2562, BX
+        ADDL R8, BX
+        ANDL BP, R14
+        ANDL SI, R9
+        MOVL 24(DX), R8
+        ORL  R9, R14
+        MOVL SI, R9
+        ADDL R14, BX
+        MOVL SI, R14
+        ROLL $0x05, BX
+        ADDL BP, BX
+        XORL DI, R9
+        ADDL $0xc040b340, CX
+        ADDL R8, CX
+        ANDL BX, R14
+        ANDL BP, R9
+        MOVL 44(DX), R8
+        ORL  R9, R14
+        MOVL BP, R9
+        ADDL R14, CX
+        MOVL BP, R14
+        ROLL $0x09, CX
+        ADDL BX, CX
+        XORL DI, R9
+        ADDL $0x265e5a51, SI
+        ADDL R8, SI
+        ANDL CX, R14
+        ANDL BX, R9
+        MOVL (DX), R8
+        ORL  R9, R14
+        MOVL BX, R9
+        ADDL R14, SI
+        MOVL BX, R14
+        ROLL $0x0e, SI
+        ADDL CX, SI
+        XORL DI, R9
+        ADDL $0xe9b6c7aa, BP
+        ADDL R8, BP
+        ANDL SI, R14
+        ANDL CX, R9
+        MOVL 20(DX), R8
+        ORL  R9, R14
+        MOVL CX, R9
+        ADDL R14, BP
+        MOVL CX, R14
+        ROLL $0x14, BP
+        ADDL SI, BP
+        XORL DI, R9
+        ADDL $0xd62f105d, BX
+        ADDL R8, BX
+        ANDL BP, R14
+        ANDL SI, R9
+        MOVL 40(DX), R8
+        ORL  R9, R14
+        MOVL SI, R9
+        ADDL R14, BX
+        MOVL SI, R14
+        ROLL $0x05, BX
+        ADDL BP, BX
+        XORL DI, R9
+        ADDL $0x02441453, CX
+        ADDL R8, CX
+        ANDL BX, R14
+        ANDL BP, R9
+        MOVL 60(DX), R8
+        ORL  R9, R14
+        MOVL BP, R9
+        ADDL R14, CX
+        MOVL BP, R14
+        ROLL $0x09, CX
+        ADDL BX, CX
+        XORL DI, R9
+        ADDL $0xd8a1e681, SI
+        ADDL R8, SI
+        ANDL CX, R14
+        ANDL BX, R9
+        MOVL 16(DX), R8
+        ORL  R9, R14
+        MOVL BX, R9
+        ADDL R14, SI
+        MOVL BX, R14
+        ROLL $0x0e, SI
+        ADDL CX, SI
+        XORL DI, R9
+        ADDL $0xe7d3fbc8, BP
+        ADDL R8, BP
+        ANDL SI, R14
+        ANDL CX, R9
+        MOVL 36(DX), R8
+        ORL  R9, R14
+        MOVL CX, R9
+        ADDL R14, BP
+        MOVL CX, R14
+        ROLL $0x14, BP
+        ADDL SI, BP
+        XORL DI, R9
+        ADDL $0x21e1cde6, BX
+        ADDL R8, BX
+        ANDL BP, R14
+        ANDL SI, R9
+        MOVL 56(DX), R8
+        ORL  R9, R14
+        MOVL SI, R9
+        ADDL R14, BX
+        MOVL SI, R14
+        ROLL $0x05, BX
+        ADDL BP, BX
+        XORL DI, R9
+        ADDL $0xc33707d6, CX
+        ADDL R8, CX
+        ANDL BX, R14
+        ANDL BP, R9
+        MOVL 12(DX), R8
+        ORL  R9, R14
+        MOVL BP, R9
+        ADDL R14, CX
+        MOVL BP, R14
+        ROLL $0x09, CX
+        ADDL BX, CX
+        XORL DI, R9
+        ADDL $0xf4d50d87, SI
+        ADDL R8, SI
+        ANDL CX, R14
+        ANDL BX, R9
+        MOVL 32(DX), R8
+        ORL  R9, R14
+        MOVL BX, R9
+        ADDL R14, SI
+        MOVL BX, R14
+        ROLL $0x0e, SI
+        ADDL CX, SI
+        XORL DI, R9
+        ADDL $0x455a14ed, BP
+        ADDL R8, BP
+        ANDL SI, R14
+        ANDL CX, R9
+        MOVL 52(DX), R8
+        ORL  R9, R14
+        MOVL CX, R9
+        ADDL R14, BP
+        MOVL CX, R14
+        ROLL $0x14, BP
+        ADDL SI, BP
+        XORL DI, R9
+        ADDL $0xa9e3e905, BX
+        ADDL R8, BX
+        ANDL BP, R14
+        ANDL SI, R9
+        MOVL 8(DX), R8
+        ORL  R9, R14
+        MOVL SI, R9
+        ADDL R14, BX
+        MOVL SI, R14
+        ROLL $0x05, BX
+        ADDL BP, BX
+        XORL DI, R9
+        ADDL $0xfcefa3f8, CX
+        ADDL R8, CX
+        ANDL BX, R14
+        ANDL BP, R9
+        MOVL 28(DX), R8
+        ORL  R9, R14
+        MOVL BP, R9
+        ADDL R14, CX
+        MOVL BP, R14
+        ROLL $0x09, CX
+        ADDL BX, CX
+        XORL DI, R9
+        ADDL $0x676f02d9, SI
+        ADDL R8, SI
+        ANDL CX, R14
+        ANDL BX, R9
+        MOVL 48(DX), R8
+        ORL  R9, R14
+        MOVL BX, R9
+        ADDL R14, SI
+        MOVL BX, R14
+        ROLL $0x0e, SI
+        ADDL CX, SI
+        XORL DI, R9
+        ADDL $0x8d2a4c8a, BP
+        ADDL R8, BP
+        ANDL SI, R14
+        ANDL CX, R9
+        MOVL 20(DX), R8
+        ORL  R9, R14
+        MOVL CX, R9
+        ADDL R14, BP
+        MOVL CX, R14
+        ROLL $0x14, BP
+        ADDL SI, BP
+        // ROUND3
+        // Each step forms b ^ c ^ d in R9.
+        MOVL SI, R9
+        ADDL $0xfffa3942, BX
+        ADDL R8, BX
+        MOVL 32(DX), R8
+        XORL CX, R9
+        XORL BP, R9
+        ADDL R9, BX
+        ROLL $0x04, BX
+        MOVL BP, R9
+        ADDL BP, BX
+        ADDL $0x8771f681, CX
+        ADDL R8, CX
+        MOVL 44(DX), R8
+        XORL SI, R9
+        XORL BX, R9
+        ADDL R9, CX
+        ROLL $0x0b, CX
+        MOVL BX, R9
+        ADDL BX, CX
+        ADDL $0x6d9d6122, SI
+        ADDL R8, SI
+        MOVL 56(DX), R8
+        XORL BP, R9
+        XORL CX, R9
+        ADDL R9, SI
+        ROLL $0x10, SI
+        MOVL CX, R9
+        ADDL CX, SI
+        ADDL $0xfde5380c, BP
+        ADDL R8, BP
+        MOVL 4(DX), R8
+        XORL BX, R9
+        XORL SI, R9
+        ADDL R9, BP
+        ROLL $0x17, BP
+        MOVL SI, R9
+        ADDL SI, BP
+        ADDL $0xa4beea44, BX
+        ADDL R8, BX
+        MOVL 16(DX), R8
+        XORL CX, R9
+        XORL BP, R9
+        ADDL R9, BX
+        ROLL $0x04, BX
+        MOVL BP, R9
+        ADDL BP, BX
+        ADDL $0x4bdecfa9, CX
+        ADDL R8, CX
+        MOVL 28(DX), R8
+        XORL SI, R9
+        XORL BX, R9
+        ADDL R9, CX
+        ROLL $0x0b, CX
+        MOVL BX, R9
+        ADDL BX, CX
+        ADDL $0xf6bb4b60, SI
+        ADDL R8, SI
+        MOVL 40(DX), R8
+        XORL BP, R9
+        XORL CX, R9
+        ADDL R9, SI
+        ROLL $0x10, SI
+        MOVL CX, R9
+        ADDL CX, SI
+        ADDL $0xbebfbc70, BP
+        ADDL R8, BP
+        MOVL 52(DX), R8
+        XORL BX, R9
+        XORL SI, R9
+        ADDL R9, BP
+        ROLL $0x17, BP
+        MOVL SI, R9
+        ADDL SI, BP
+        ADDL $0x289b7ec6, BX
+        ADDL R8, BX
+        MOVL (DX), R8
+        XORL CX, R9
+        XORL BP, R9
+        ADDL R9, BX
+        ROLL $0x04, BX
+        MOVL BP, R9
+        ADDL BP, BX
+        ADDL $0xeaa127fa, CX
+        ADDL R8, CX
+        MOVL 12(DX), R8
+        XORL SI, R9
+        XORL BX, R9
+        ADDL R9, CX
+        ROLL $0x0b, CX
+        MOVL BX, R9
+        ADDL BX, CX
+        ADDL $0xd4ef3085, SI
+        ADDL R8, SI
+        MOVL 24(DX), R8
+        XORL BP, R9
+        XORL CX, R9
+        ADDL R9, SI
+        ROLL $0x10, SI
+        MOVL CX, R9
+        ADDL CX, SI
+        ADDL $0x04881d05, BP
+        ADDL R8, BP
+        MOVL 36(DX), R8
+        XORL BX, R9
+        XORL SI, R9
+        ADDL R9, BP
+        ROLL $0x17, BP
+        MOVL SI, R9
+        ADDL SI, BP
+        ADDL $0xd9d4d039, BX
+        ADDL R8, BX
+        MOVL 48(DX), R8
+        XORL CX, R9
+        XORL BP, R9
+        ADDL R9, BX
+        ROLL $0x04, BX
+        MOVL BP, R9
+        ADDL BP, BX
+        ADDL $0xe6db99e5, CX
+        ADDL R8, CX
+        MOVL 60(DX), R8
+        XORL SI, R9
+        XORL BX, R9
+        ADDL R9, CX
+        ROLL $0x0b, CX
+        MOVL BX, R9
+        ADDL BX, CX
+        ADDL $0x1fa27cf8, SI
+        ADDL R8, SI
+        MOVL 8(DX), R8
+        XORL BP, R9
+        XORL CX, R9
+        ADDL R9, SI
+        ROLL $0x10, SI
+        MOVL CX, R9
+        ADDL CX, SI
+        ADDL $0xc4ac5665, BP
+        ADDL R8, BP
+        MOVL (DX), R8
+        XORL BX, R9
+        XORL SI, R9
+        ADDL R9, BP
+        ROLL $0x17, BP
+        MOVL SI, R9
+        ADDL SI, BP
+        // ROUND4
+        // Each step forms c ^ (b | ~d) in R9; ~d is DI ^ d.
+        MOVL DI, R9
+        XORL CX, R9
+        ADDL $0xf4292244, BX
+        ADDL R8, BX
+        ORL  BP, R9
+        XORL SI, R9
+        ADDL R9, BX
+        MOVL 28(DX), R8
+        MOVL DI, R9
+        ROLL $0x06, BX
+        XORL SI, R9
+        ADDL BP, BX
+        ADDL $0x432aff97, CX
+        ADDL R8, CX
+        ORL  BX, R9
+        XORL BP, R9
+        ADDL R9, CX
+        MOVL 56(DX), R8
+        MOVL DI, R9
+        ROLL $0x0a, CX
+        XORL BP, R9
+        ADDL BX, CX
+        ADDL $0xab9423a7, SI
+        ADDL R8, SI
+        ORL  CX, R9
+        XORL BX, R9
+        ADDL R9, SI
+        MOVL 20(DX), R8
+        MOVL DI, R9
+        ROLL $0x0f, SI
+        XORL BX, R9
+        ADDL CX, SI
+        ADDL $0xfc93a039, BP
+        ADDL R8, BP
+        ORL  SI, R9
+        XORL CX, R9
+        ADDL R9, BP
+        MOVL 48(DX), R8
+        MOVL DI, R9
+        ROLL $0x15, BP
+        XORL CX, R9
+        ADDL SI, BP
+        ADDL $0x655b59c3, BX
+        ADDL R8, BX
+        ORL  BP, R9
+        XORL SI, R9
+        ADDL R9, BX
+        MOVL 12(DX), R8
+        MOVL DI, R9
+        ROLL $0x06, BX
+        XORL SI, R9
+        ADDL BP, BX
+        ADDL $0x8f0ccc92, CX
+        ADDL R8, CX
+        ORL  BX, R9
+        XORL BP, R9
+        ADDL R9, CX
+        MOVL 40(DX), R8
+        MOVL DI, R9
+        ROLL $0x0a, CX
+        XORL BP, R9
+        ADDL BX, CX
+        ADDL $0xffeff47d, SI
+        ADDL R8, SI
+        ORL  CX, R9
+        XORL BX, R9
+        ADDL R9, SI
+        MOVL 4(DX), R8
+        MOVL DI, R9
+        ROLL $0x0f, SI
+        XORL BX, R9
+        ADDL CX, SI
+        ADDL $0x85845dd1, BP
+        ADDL R8, BP
+        ORL  SI, R9
+        XORL CX, R9
+        ADDL R9, BP
+        MOVL 32(DX), R8
+        MOVL DI, R9
+        ROLL $0x15, BP
+        XORL CX, R9
+        ADDL SI, BP
+        ADDL $0x6fa87e4f, BX
+        ADDL R8, BX
+        ORL  BP, R9
+        XORL SI, R9
+        ADDL R9, BX
+        MOVL 60(DX), R8
+        MOVL DI, R9
+        ROLL $0x06, BX
+        XORL SI, R9
+        ADDL BP, BX
+        ADDL $0xfe2ce6e0, CX
+        ADDL R8, CX
+        ORL  BX, R9
+        XORL BP, R9
+        ADDL R9, CX
+        MOVL 24(DX), R8
+        MOVL DI, R9
+        ROLL $0x0a, CX
+        XORL BP, R9
+        ADDL BX, CX
+        ADDL $0xa3014314, SI
+        ADDL R8, SI
+        ORL  CX, R9
+        XORL BX, R9
+        ADDL R9, SI
+        MOVL 52(DX), R8
+        MOVL DI, R9
+        ROLL $0x0f, SI
+        XORL BX, R9
+        ADDL CX, SI
+        ADDL $0x4e0811a1, BP
+        ADDL R8, BP
+        ORL  SI, R9
+        XORL CX, R9
+        ADDL R9, BP
+        MOVL 16(DX), R8
+        MOVL DI, R9
+        ROLL $0x15, BP
+        XORL CX, R9
+        ADDL SI, BP
+        ADDL $0xf7537e82, BX
+        ADDL R8, BX
+        ORL  BP, R9
+        XORL SI, R9
+        ADDL R9, BX
+        MOVL 44(DX), R8
+        MOVL DI, R9
+        ROLL $0x06, BX
+        XORL SI, R9
+        ADDL BP, BX
+        ADDL $0xbd3af235, CX
+        ADDL R8, CX
+        ORL  BX, R9
+        XORL BP, R9
+        ADDL R9, CX
+        MOVL 8(DX), R8
+        MOVL DI, R9
+        ROLL $0x0a, CX
+        XORL BP, R9
+        ADDL BX, CX
+        ADDL $0x2ad7d2bb, SI
+        ADDL R8, SI
+        ORL  CX, R9
+        XORL BX, R9
+        ADDL R9, SI
+        MOVL 36(DX), R8
+        MOVL DI, R9
+        ROLL $0x0f, SI
+        XORL BX, R9
+        ADDL CX, SI
+        ADDL $0xeb86d391, BP
+        ADDL R8, BP
+        ORL  SI, R9
+        XORL CX, R9
+        ADDL R9, BP
+        ROLL $0x15, BP
+        ADDL SI, BP
+        // Add the state saved at the top of the block back in.
+        ADDL R10, BX
+        ADDL R11, BP
+        ADDL R12, SI
+        ADDL R13, CX
+        // Prepare next loop
+        ADDQ $0x40, DX
+        CMPQ DX, AX
+        JB   loop
+        // Write output
+        MOVQ dig+0(FP), AX
+        MOVL BX, (AX)
+        MOVL BP, 4(AX)
+        MOVL SI, 8(AX)
+        MOVL CX, 12(AX)
+end:
+        RET