aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/github.com/klauspost
diff options
context:
space:
mode:
authorLibravatar Rutger Broekhoff2023-12-29 21:31:53 +0100
committerLibravatar Rutger Broekhoff2023-12-29 21:31:53 +0100
commit404aeae4545d2426c089a5f8d5e82dae56f5212b (patch)
tree2d84e00af272b39fc04f3795ae06bc48970e57b5 /vendor/github.com/klauspost
parent209d8b0187ed025dec9ac149ebcced3462877bff (diff)
downloadgitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.tar.gz
gitolfs3-404aeae4545d2426c089a5f8d5e82dae56f5212b.zip
Make Nix builds work
Diffstat (limited to 'vendor/github.com/klauspost')
-rw-r--r--vendor/github.com/klauspost/compress/LICENSE304
-rw-r--r--vendor/github.com/klauspost/compress/s2/.gitignore15
-rw-r--r--vendor/github.com/klauspost/compress/s2/LICENSE28
-rw-r--r--vendor/github.com/klauspost/compress/s2/README.md1120
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode.go437
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_amd64.s568
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_arm64.s574
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_asm.go17
-rw-r--r--vendor/github.com/klauspost/compress/s2/decode_other.go292
-rw-r--r--vendor/github.com/klauspost/compress/s2/dict.go350
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode.go393
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_all.go1048
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_amd64.go148
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_best.go796
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_better.go1106
-rw-r--r--vendor/github.com/klauspost/compress/s2/encode_go.go729
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go228
-rw-r--r--vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s21169
-rw-r--r--vendor/github.com/klauspost/compress/s2/index.go596
-rw-r--r--vendor/github.com/klauspost/compress/s2/lz4convert.go585
-rw-r--r--vendor/github.com/klauspost/compress/s2/lz4sconvert.go467
-rw-r--r--vendor/github.com/klauspost/compress/s2/reader.go1062
-rw-r--r--vendor/github.com/klauspost/compress/s2/s2.go143
-rw-r--r--vendor/github.com/klauspost/compress/s2/writer.go1020
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/.gitignore24
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml74
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt35
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/LICENSE22
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/README.md497
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/cpuid.go1473
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/cpuid_386.s47
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s72
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s26
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/detect_arm64.go247
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/detect_ref.go15
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/detect_x86.go37
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/featureid_string.go279
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go121
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go130
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go16
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go8
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go11
-rw-r--r--vendor/github.com/klauspost/cpuid/v2/test-architectures.sh15
43 files changed, 36344 insertions, 0 deletions
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE
new file mode 100644
index 0000000..87d5574
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/LICENSE
@@ -0,0 +1,304 @@
1Copyright (c) 2012 The Go Authors. All rights reserved.
2Copyright (c) 2019 Klaus Post. All rights reserved.
3
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
7
8 * Redistributions of source code must retain the above copyright
9notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above
11copyright notice, this list of conditions and the following disclaimer
12in the documentation and/or other materials provided with the
13distribution.
14 * Neither the name of Google Inc. nor the names of its
15contributors may be used to endorse or promote products derived from
16this software without specific prior written permission.
17
18THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30------------------
31
32Files: gzhttp/*
33
34 Apache License
35 Version 2.0, January 2004
36 http://www.apache.org/licenses/
37
38 TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
39
40 1. Definitions.
41
42 "License" shall mean the terms and conditions for use, reproduction,
43 and distribution as defined by Sections 1 through 9 of this document.
44
45 "Licensor" shall mean the copyright owner or entity authorized by
46 the copyright owner that is granting the License.
47
48 "Legal Entity" shall mean the union of the acting entity and all
49 other entities that control, are controlled by, or are under common
50 control with that entity. For the purposes of this definition,
51 "control" means (i) the power, direct or indirect, to cause the
52 direction or management of such entity, whether by contract or
53 otherwise, or (ii) ownership of fifty percent (50%) or more of the
54 outstanding shares, or (iii) beneficial ownership of such entity.
55
56 "You" (or "Your") shall mean an individual or Legal Entity
57 exercising permissions granted by this License.
58
59 "Source" form shall mean the preferred form for making modifications,
60 including but not limited to software source code, documentation
61 source, and configuration files.
62
63 "Object" form shall mean any form resulting from mechanical
64 transformation or translation of a Source form, including but
65 not limited to compiled object code, generated documentation,
66 and conversions to other media types.
67
68 "Work" shall mean the work of authorship, whether in Source or
69 Object form, made available under the License, as indicated by a
70 copyright notice that is included in or attached to the work
71 (an example is provided in the Appendix below).
72
73 "Derivative Works" shall mean any work, whether in Source or Object
74 form, that is based on (or derived from) the Work and for which the
75 editorial revisions, annotations, elaborations, or other modifications
76 represent, as a whole, an original work of authorship. For the purposes
77 of this License, Derivative Works shall not include works that remain
78 separable from, or merely link (or bind by name) to the interfaces of,
79 the Work and Derivative Works thereof.
80
81 "Contribution" shall mean any work of authorship, including
82 the original version of the Work and any modifications or additions
83 to that Work or Derivative Works thereof, that is intentionally
84 submitted to Licensor for inclusion in the Work by the copyright owner
85 or by an individual or Legal Entity authorized to submit on behalf of
86 the copyright owner. For the purposes of this definition, "submitted"
87 means any form of electronic, verbal, or written communication sent
88 to the Licensor or its representatives, including but not limited to
89 communication on electronic mailing lists, source code control systems,
90 and issue tracking systems that are managed by, or on behalf of, the
91 Licensor for the purpose of discussing and improving the Work, but
92 excluding communication that is conspicuously marked or otherwise
93 designated in writing by the copyright owner as "Not a Contribution."
94
95 "Contributor" shall mean Licensor and any individual or Legal Entity
96 on behalf of whom a Contribution has been received by Licensor and
97 subsequently incorporated within the Work.
98
99 2. Grant of Copyright License. Subject to the terms and conditions of
100 this License, each Contributor hereby grants to You a perpetual,
101 worldwide, non-exclusive, no-charge, royalty-free, irrevocable
102 copyright license to reproduce, prepare Derivative Works of,
103 publicly display, publicly perform, sublicense, and distribute the
104 Work and such Derivative Works in Source or Object form.
105
106 3. Grant of Patent License. Subject to the terms and conditions of
107 this License, each Contributor hereby grants to You a perpetual,
108 worldwide, non-exclusive, no-charge, royalty-free, irrevocable
109 (except as stated in this section) patent license to make, have made,
110 use, offer to sell, sell, import, and otherwise transfer the Work,
111 where such license applies only to those patent claims licensable
112 by such Contributor that are necessarily infringed by their
113 Contribution(s) alone or by combination of their Contribution(s)
114 with the Work to which such Contribution(s) was submitted. If You
115 institute patent litigation against any entity (including a
116 cross-claim or counterclaim in a lawsuit) alleging that the Work
117 or a Contribution incorporated within the Work constitutes direct
118 or contributory patent infringement, then any patent licenses
119 granted to You under this License for that Work shall terminate
120 as of the date such litigation is filed.
121
122 4. Redistribution. You may reproduce and distribute copies of the
123 Work or Derivative Works thereof in any medium, with or without
124 modifications, and in Source or Object form, provided that You
125 meet the following conditions:
126
127 (a) You must give any other recipients of the Work or
128 Derivative Works a copy of this License; and
129
130 (b) You must cause any modified files to carry prominent notices
131 stating that You changed the files; and
132
133 (c) You must retain, in the Source form of any Derivative Works
134 that You distribute, all copyright, patent, trademark, and
135 attribution notices from the Source form of the Work,
136 excluding those notices that do not pertain to any part of
137 the Derivative Works; and
138
139 (d) If the Work includes a "NOTICE" text file as part of its
140 distribution, then any Derivative Works that You distribute must
141 include a readable copy of the attribution notices contained
142 within such NOTICE file, excluding those notices that do not
143 pertain to any part of the Derivative Works, in at least one
144 of the following places: within a NOTICE text file distributed
145 as part of the Derivative Works; within the Source form or
146 documentation, if provided along with the Derivative Works; or,
147 within a display generated by the Derivative Works, if and
148 wherever such third-party notices normally appear. The contents
149 of the NOTICE file are for informational purposes only and
150 do not modify the License. You may add Your own attribution
151 notices within Derivative Works that You distribute, alongside
152 or as an addendum to the NOTICE text from the Work, provided
153 that such additional attribution notices cannot be construed
154 as modifying the License.
155
156 You may add Your own copyright statement to Your modifications and
157 may provide additional or different license terms and conditions
158 for use, reproduction, or distribution of Your modifications, or
159 for any such Derivative Works as a whole, provided Your use,
160 reproduction, and distribution of the Work otherwise complies with
161 the conditions stated in this License.
162
163 5. Submission of Contributions. Unless You explicitly state otherwise,
164 any Contribution intentionally submitted for inclusion in the Work
165 by You to the Licensor shall be under the terms and conditions of
166 this License, without any additional terms or conditions.
167 Notwithstanding the above, nothing herein shall supersede or modify
168 the terms of any separate license agreement you may have executed
169 with Licensor regarding such Contributions.
170
171 6. Trademarks. This License does not grant permission to use the trade
172 names, trademarks, service marks, or product names of the Licensor,
173 except as required for reasonable and customary use in describing the
174 origin of the Work and reproducing the content of the NOTICE file.
175
176 7. Disclaimer of Warranty. Unless required by applicable law or
177 agreed to in writing, Licensor provides the Work (and each
178 Contributor provides its Contributions) on an "AS IS" BASIS,
179 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
180 implied, including, without limitation, any warranties or conditions
181 of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
182 PARTICULAR PURPOSE. You are solely responsible for determining the
183 appropriateness of using or redistributing the Work and assume any
184 risks associated with Your exercise of permissions under this License.
185
186 8. Limitation of Liability. In no event and under no legal theory,
187 whether in tort (including negligence), contract, or otherwise,
188 unless required by applicable law (such as deliberate and grossly
189 negligent acts) or agreed to in writing, shall any Contributor be
190 liable to You for damages, including any direct, indirect, special,
191 incidental, or consequential damages of any character arising as a
192 result of this License or out of the use or inability to use the
193 Work (including but not limited to damages for loss of goodwill,
194 work stoppage, computer failure or malfunction, or any and all
195 other commercial damages or losses), even if such Contributor
196 has been advised of the possibility of such damages.
197
198 9. Accepting Warranty or Additional Liability. While redistributing
199 the Work or Derivative Works thereof, You may choose to offer,
200 and charge a fee for, acceptance of support, warranty, indemnity,
201 or other liability obligations and/or rights consistent with this
202 License. However, in accepting such obligations, You may act only
203 on Your own behalf and on Your sole responsibility, not on behalf
204 of any other Contributor, and only if You agree to indemnify,
205 defend, and hold each Contributor harmless for any liability
206 incurred by, or claims asserted against, such Contributor by reason
207 of your accepting any such warranty or additional liability.
208
209 END OF TERMS AND CONDITIONS
210
211 APPENDIX: How to apply the Apache License to your work.
212
213 To apply the Apache License to your work, attach the following
214 boilerplate notice, with the fields enclosed by brackets "[]"
215 replaced with your own identifying information. (Don't include
216 the brackets!) The text should be enclosed in the appropriate
217 comment syntax for the file format. We also recommend that a
218 file or class name and description of purpose be included on the
219 same "printed page" as the copyright notice for easier
220 identification within third-party archives.
221
222 Copyright 2016-2017 The New York Times Company
223
224 Licensed under the Apache License, Version 2.0 (the "License");
225 you may not use this file except in compliance with the License.
226 You may obtain a copy of the License at
227
228 http://www.apache.org/licenses/LICENSE-2.0
229
230 Unless required by applicable law or agreed to in writing, software
231 distributed under the License is distributed on an "AS IS" BASIS,
232 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
233 See the License for the specific language governing permissions and
234 limitations under the License.
235
236------------------
237
238Files: s2/cmd/internal/readahead/*
239
240The MIT License (MIT)
241
242Copyright (c) 2015 Klaus Post
243
244Permission is hereby granted, free of charge, to any person obtaining a copy
245of this software and associated documentation files (the "Software"), to deal
246in the Software without restriction, including without limitation the rights
247to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
248copies of the Software, and to permit persons to whom the Software is
249furnished to do so, subject to the following conditions:
250
251The above copyright notice and this permission notice shall be included in all
252copies or substantial portions of the Software.
253
254THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
255IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
256FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
257AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
258LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
259OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
260SOFTWARE.
261
262---------------------
263Files: snappy/*
264Files: internal/snapref/*
265
266Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
267
268Redistribution and use in source and binary forms, with or without
269modification, are permitted provided that the following conditions are
270met:
271
272 * Redistributions of source code must retain the above copyright
273notice, this list of conditions and the following disclaimer.
274 * Redistributions in binary form must reproduce the above
275copyright notice, this list of conditions and the following disclaimer
276in the documentation and/or other materials provided with the
277distribution.
278 * Neither the name of Google Inc. nor the names of its
279contributors may be used to endorse or promote products derived from
280this software without specific prior written permission.
281
282THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
283"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
284LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
285A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
286OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
287SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
288LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
289DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
290THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
291(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
292OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
293
294-----------------
295
296Files: s2/cmd/internal/filepathx/*
297
298Copyright 2016 The filepathx Authors
299
300Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
301
302The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
303
304THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/vendor/github.com/klauspost/compress/s2/.gitignore b/vendor/github.com/klauspost/compress/s2/.gitignore
new file mode 100644
index 0000000..3a89c6e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/.gitignore
@@ -0,0 +1,15 @@
1testdata/bench
2
3# These explicitly listed benchmark data files are for an obsolete version of
4# snappy_test.go.
5testdata/alice29.txt
6testdata/asyoulik.txt
7testdata/fireworks.jpeg
8testdata/geo.protodata
9testdata/html
10testdata/html_x_4
11testdata/kppkn.gtb
12testdata/lcet10.txt
13testdata/paper-100k.pdf
14testdata/plrabn12.txt
15testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/s2/LICENSE b/vendor/github.com/klauspost/compress/s2/LICENSE
new file mode 100644
index 0000000..1d2d645
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/LICENSE
@@ -0,0 +1,28 @@
1Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
2Copyright (c) 2019 Klaus Post. All rights reserved.
3
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
7
8 * Redistributions of source code must retain the above copyright
9notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above
11copyright notice, this list of conditions and the following disclaimer
12in the documentation and/or other materials provided with the
13distribution.
14 * Neither the name of Google Inc. nor the names of its
15contributors may be used to endorse or promote products derived from
16this software without specific prior written permission.
17
18THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/s2/README.md b/vendor/github.com/klauspost/compress/s2/README.md
new file mode 100644
index 0000000..8284bb0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/README.md
@@ -0,0 +1,1120 @@
1# S2 Compression
2
3S2 is an extension of [Snappy](https://github.com/google/snappy).
4
5S2 is aimed at high throughput, which is why it features concurrent compression for bigger payloads.
6
7Decoding is compatible with Snappy compressed content, but content compressed with S2 cannot be decompressed by Snappy.
8This means that S2 can seamlessly replace Snappy without converting compressed content.
9
10S2 can produce Snappy compatible output, faster and better than Snappy.
11If you want full benefit of the changes you should use s2 without Snappy compatibility.
12
13S2 is designed to have high throughput on content that cannot be compressed.
14This is important, so you don't have to worry about spending CPU cycles on already compressed data.
15
16## Benefits over Snappy
17
18* Better compression
19* Adjustable compression (3 levels)
20* Concurrent stream compression
21* Faster decompression, even for Snappy compatible content
22* Concurrent Snappy/S2 stream decompression
23* Skip forward in compressed stream
24* Random seeking with indexes
25* Compatible with reading Snappy compressed content
26* Smaller block size overhead on incompressible blocks
27* Block concatenation
28* Block Dictionary support
29* Uncompressed stream mode
30* Automatic stream size padding
31* Snappy compatible block compression
32
33## Drawbacks over Snappy
34
35* Not optimized for 32 bit systems
36* Streams use slightly more memory due to larger blocks and concurrency (configurable)
37
38# Usage
39
40Installation: `go get -u github.com/klauspost/compress/s2`
41
42Full package documentation:
43
44[![godoc][1]][2]
45
46[1]: https://godoc.org/github.com/klauspost/compress?status.svg
47[2]: https://godoc.org/github.com/klauspost/compress/s2
48
49## Compression
50
51```Go
52func EncodeStream(src io.Reader, dst io.Writer) error {
53 enc := s2.NewWriter(dst)
54 _, err := io.Copy(enc, src)
55 if err != nil {
56 enc.Close()
57 return err
58 }
59 // Blocks until compression is done.
60 return enc.Close()
61}
62```
63
64You should always call `enc.Close()`, otherwise you will leak resources and your encode will be incomplete.
65
66For the best throughput, you should attempt to reuse the `Writer` using the `Reset()` method.
67
68The Writer in S2 is always buffered, therefore `NewBufferedWriter` in Snappy can be replaced with `NewWriter` in S2.
69It is possible to flush any buffered data using the `Flush()` method.
70This will block until all data sent to the encoder has been written to the output.
71
72S2 also supports the `io.ReaderFrom` interface, which will consume all input from a reader.
73
74As a final method to compress data, if you have a single block of data you would like to have encoded as a stream,
75a slightly more efficient method is to use the `EncodeBuffer` method.
76This will take ownership of the buffer until the stream is closed.
77
78```Go
79func EncodeStream(src []byte, dst io.Writer) error {
80 enc := s2.NewWriter(dst)
81 // The encoder owns the buffer until Flush or Close is called.
82 err := enc.EncodeBuffer(src)
83 if err != nil {
84 enc.Close()
85 return err
86 }
87 // Blocks until compression is done.
88 return enc.Close()
89}
90```
91
92Each call to `EncodeBuffer` will result in discrete blocks being created without buffering,
93so it should only be used a single time per stream.
94If you need to write several blocks, you should use the regular io.Writer interface.
95
96
97## Decompression
98
99```Go
100func DecodeStream(src io.Reader, dst io.Writer) error {
101 dec := s2.NewReader(src)
102 _, err := io.Copy(dst, dec)
103 return err
104}
105```
106
107Similar to the Writer, a Reader can be reused using the `Reset` method.
108
109For the best possible throughput, there is an `EncodeBuffer(buf []byte)` function available.
110However, it requires that the provided buffer isn't used after it is handed over to S2 and until the stream is flushed or closed.
111
112For smaller data blocks, there is also a non-streaming interface: `Encode()`, `EncodeBetter()` and `Decode()`.
113Do however note that these functions (similar to Snappy) do not provide validation of data,
114so data corruption may be undetected. Stream encoding provides CRC checks of data.
115
116It is possible to efficiently skip forward in a compressed stream using the `Skip()` method.
117For big skips the decompressor is able to skip blocks without decompressing them.
118
119## Single Blocks
120
121Similar to Snappy, S2 offers single block compression.
122Blocks do not offer the same flexibility and safety as streams,
123but may be preferable for very small payloads, less than 100K.
124
125Using a simple `dst := s2.Encode(nil, src)` will compress `src` and return the compressed result.
126It is possible to provide a destination buffer.
127If the buffer has a capacity of `s2.MaxEncodedLen(len(src))` it will be used.
128If not, a new one will be allocated.
129
130Alternatively `EncodeBetter`/`EncodeBest` can also be used for better, but slightly slower compression.
131
132Similarly to decompress a block you can use `dst, err := s2.Decode(nil, src)`.
133Again an optional destination buffer can be supplied.
134The `s2.DecodedLen(src)` can be used to get the minimum capacity needed.
135If that is not satisfied a new buffer will be allocated.
136
137Block functions always operate on a single goroutine since they should only be used for small payloads.
138
139# Commandline tools
140
141Some very simple command-line tools are provided; `s2c` for compression and `s2d` for decompression.
142
143Binaries can be downloaded on the [Releases Page](https://github.com/klauspost/compress/releases).
144
145Installing them requires Go to be installed. To install them, use:
146
147`go install github.com/klauspost/compress/s2/cmd/s2c@latest && go install github.com/klauspost/compress/s2/cmd/s2d@latest`
148
149To build binaries to the current folder use:
150
151`go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d`
152
153
154## s2c
155
156```
157Usage: s2c [options] file1 file2
158
159Compresses all files supplied as input separately.
160Output files are written as 'filename.ext.s2' or 'filename.ext.snappy'.
161By default output files will be overwritten.
162Use - as the only file name to read from stdin and write to stdout.
163
164Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
165Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
166
167File names beginning with 'http://' and 'https://' will be downloaded and compressed.
168Only http response code 200 is accepted.
169
170Options:
171 -bench int
172 Run benchmark n times. No output will be written
173 -blocksize string
174 Max block size. Examples: 64K, 256K, 1M, 4M. Must be power of two and <= 4MB (default "4M")
175 -c Write all output to stdout. Multiple input files will be concatenated
176 -cpu int
177 Compress using this amount of threads (default 32)
178 -faster
179 Compress faster, but with a minor compression loss
180 -help
181 Display help
182 -index
183 Add seek index (default true)
184 -o string
185 Write output to another file. Single input file only
186 -pad string
187 Pad size to a multiple of this value, Examples: 500, 64K, 256K, 1M, 4M, etc (default "1")
188 -q Don't write any output to terminal, except errors
189 -rm
190 Delete source file(s) after successful compression
191 -safe
192 Do not overwrite output files
193 -slower
194 Compress more, but a lot slower
195 -snappy
196 Generate Snappy compatible output stream
197 -verify
198 Verify written files
199
200```
201
202## s2d
203
204```
205Usage: s2d [options] file1 file2
206
207Decompresses all files supplied as input. Input files must end with '.s2' or '.snappy'.
208Output file names have the extension removed. By default output files will be overwritten.
209Use - as the only file name to read from stdin and write to stdout.
210
211Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
212Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
213
214File names beginning with 'http://' and 'https://' will be downloaded and decompressed.
215Extensions on downloaded files are ignored. Only http response code 200 is accepted.
216
217Options:
218 -bench int
219 Run benchmark n times. No output will be written
220 -c Write all output to stdout. Multiple input files will be concatenated
221 -help
222 Display help
223 -o string
224 Write output to another file. Single input file only
225 -offset string
226 Start at offset. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
227 -q Don't write any output to terminal, except errors
228 -rm
229 Delete source file(s) after successful decompression
230 -safe
231 Do not overwrite output files
232 -tail string
233 Return last of compressed file. Examples: 92, 64K, 256K, 1M, 4M. Requires Index
234 -verify
235 Verify files, but do not write output
236```
237
238## s2sx: self-extracting archives
239
240s2sx allows creating self-extracting archives with no dependencies.
241
242By default, executables are created for the same platforms as the host os,
243but this can be overridden with `-os` and `-arch` parameters.
244
245Extracted files have 0666 permissions, except when the untar option is used.
246
247```
248Usage: s2sx [options] file1 file2
249
250Compresses all files supplied as input separately.
251If files have '.s2' extension they are assumed to be compressed already.
252Output files are written as 'filename.s2sx' and with '.exe' for windows targets.
253If output is big, an additional file with ".more" is written. This must be included as well.
254By default output files will be overwritten.
255
256Wildcards are accepted: testdir/*.txt will compress all files in testdir ending with .txt
257Directories can be wildcards as well. testdir/*/*.txt will match testdir/subdir/b.txt
258
259Options:
260 -arch string
261 Destination architecture (default "amd64")
262 -c Write all output to stdout. Multiple input files will be concatenated
263 -cpu int
264 Compress using this amount of threads (default 32)
265 -help
266 Display help
267 -max string
268 Maximum executable size. Rest will be written to another file. (default "1G")
269 -os string
270 Destination operating system (default "windows")
271 -q Don't write any output to terminal, except errors
272 -rm
273 Delete source file(s) after successful compression
274 -safe
275 Do not overwrite output files
276 -untar
277 Untar on destination
278```
279
280Available platforms are:
281
282 * darwin-amd64
283 * darwin-arm64
284 * linux-amd64
285 * linux-arm
286 * linux-arm64
287 * linux-mips64
288 * linux-ppc64le
289 * windows-386
290 * windows-amd64
291
292By default, there is a size limit of 1GB for the output executable.
293
294When this is exceeded the remaining file content is written to a file called
295output+`.more`. This file must be included and
296placed alongside the executable for a successful extraction.
297
298This file *must* have the same name as the executable, so if the executable is renamed,
299so must the `.more` file.
300
301This functionality is disabled with stdin/stdout.
302
303### Self-extracting TAR files
304
305If you wrap a TAR file you can specify `-untar` to make it untar on the destination host.
306
307Files are extracted to the current folder with the path specified in the tar file.
308
309Note that tar files are not validated before they are wrapped.
310
311For security reasons files that move below the root folder are not allowed.
312
313# Performance
314
315This section will focus on comparisons to Snappy.
316This package is solely aimed at replacing Snappy as a high speed compression package.
317If you are mainly looking for better compression [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd)
318gives better compression, but typically at speeds slightly below "better" mode in this package.
319
320Compression is increased compared to Snappy, mostly around 5-20% and the throughput is typically 25-40% increased (single threaded) compared to the Snappy Go implementation.
321
322Streams are concurrently compressed. The stream will be distributed among all available CPU cores for the best possible throughput.
323
324A "better" compression mode is also available. This allows trading a bit of speed for a minor compression gain.
325The content compressed in this mode is fully compatible with the standard decoder.
326
327Snappy vs S2 **compression** speed on 16 core (32 thread) computer, using all threads and a single thread (1 CPU):
328
329| File | S2 Speed | S2 Throughput | S2 % smaller | S2 "better" | "better" throughput | "better" % smaller |
330|---------------------------------------------------------------------------------------------------------|----------|---------------|--------------|-------------|---------------------|--------------------|
331| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 16.33x | 10556 MB/s | 8.0% | 6.04x | 5252 MB/s | 14.7% |
332| (1 CPU) | 1.08x | 940 MB/s | - | 0.46x | 400 MB/s | - |
333| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 16.51x | 15224 MB/s | 31.70% | 9.47x | 8734 MB/s | 37.71% |
334| (1 CPU) | 1.26x | 1157 MB/s | - | 0.60x | 556 MB/s | - |
335| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 15.14x | 12598 MB/s | -5.76% | 6.23x | 5675 MB/s | 3.62% |
336| (1 CPU) | 1.02x | 932 MB/s | - | 0.47x | 432 MB/s | - |
337| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 11.21x | 12116 MB/s | 15.95% | 3.24x | 3500 MB/s | 18.00% |
338| (1 CPU) | 1.05x | 1135 MB/s | - | 0.27x | 292 MB/s | - |
339| [apache.log](https://files.klauspost.com/compress/apache.log.zst) | 8.55x | 16673 MB/s | 20.54% | 5.85x | 11420 MB/s | 24.97% |
340| (1 CPU) | 1.91x | 1771 MB/s | - | 0.53x | 1041 MB/s | - |
341| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 15.76x | 14357 MB/s | 24.01% | 8.67x | 7891 MB/s | 33.68% |
342| (1 CPU) | 1.17x | 1064 MB/s | - | 0.65x | 595 MB/s | - |
343| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 13.33x | 9835 MB/s | 2.34% | 6.85x | 4863 MB/s | 9.96% |
344| (1 CPU) | 0.97x | 689 MB/s | - | 0.55x | 387 MB/s | - |
345| sharnd.out.2gb | 9.11x | 13213 MB/s | 0.01% | 1.49x | 9184 MB/s | 0.01% |
346| (1 CPU) | 0.88x | 5418 MB/s | - | 0.77x | 5417 MB/s | - |
347| [sofia-air-quality-dataset csv](https://files.klauspost.com/compress/sofia-air-quality-dataset.tar.zst) | 22.00x | 11477 MB/s | 18.73% | 11.15x | 5817 MB/s | 27.88% |
348| (1 CPU) | 1.23x | 642 MB/s | - | 0.71x | 642 MB/s | - |
349| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 11.23x | 6520 MB/s | 5.9% | 5.35x | 3109 MB/s | 15.88% |
350| (1 CPU) | 1.05x | 607 MB/s | - | 0.52x | 304 MB/s | - |
351| [enwik9](https://files.klauspost.com/compress/enwik9.zst) | 19.28x | 8440 MB/s | 4.04% | 9.31x | 4076 MB/s | 18.04% |
352| (1 CPU) | 1.12x | 488 MB/s | - | 0.57x | 250 MB/s | - |
353
354### Legend
355
356* `S2 Speed`: Speed of S2 compared to Snappy, using 16 cores and 1 core.
357* `S2 Throughput`: Throughput of S2 in MB/s.
358* `S2 % smaller`: How many percent of the Snappy output size is S2 better.
359* `S2 "better"`: Speed when enabling "better" compression mode in S2 compared to Snappy.
360* `"better" throughput`: Throughput of S2 in MB/s when enabling "better" compression mode.
361* `"better" % smaller`: How many percent of the Snappy output size is S2 better when using "better" compression.
362
363There is a good speedup across the board when using a single thread and a significant speedup when using multiple threads.
364
365Machine generated data gets by far the biggest compression boost, with size being reduced by up to 35% of Snappy size.
366
367The "better" compression mode sees a good improvement in all cases, but usually at a performance cost.
368
369Incompressible content (`sharnd.out.2gb`, 2GB random data) sees the smallest speedup.
370This is likely dominated by synchronization overhead, which is confirmed by the fact that single threaded performance is higher (see above).
371
372## Decompression
373
374S2 attempts to create content that is also fast to decompress, except in "better" mode where the smallest representation is used.
375
376S2 vs Snappy **decompression** speed. Both operating on single core:
377
378| File | S2 Throughput | vs. Snappy | Better Throughput | vs. Snappy |
379|-----------------------------------------------------------------------------------------------------|---------------|------------|-------------------|------------|
380| [rawstudio-mint14.tar](https://files.klauspost.com/compress/rawstudio-mint14.7z) | 2117 MB/s | 1.14x | 1738 MB/s | 0.94x |
381| [github-june-2days-2019.json](https://files.klauspost.com/compress/github-june-2days-2019.json.zst) | 2401 MB/s | 1.25x | 2307 MB/s | 1.20x |
382| [github-ranks-backup.bin](https://files.klauspost.com/compress/github-ranks-backup.bin.zst) | 2075 MB/s | 0.98x | 1764 MB/s | 0.83x |
383| [consensus.db.10gb](https://files.klauspost.com/compress/consensus.db.10gb.zst) | 2967 MB/s | 1.05x | 2885 MB/s | 1.02x |
384| [adresser.json](https://files.klauspost.com/compress/adresser.json.zst) | 4141 MB/s | 1.07x | 4184 MB/s | 1.08x |
385| [gob-stream](https://files.klauspost.com/compress/gob-stream.7z) | 2264 MB/s | 1.12x | 2185 MB/s | 1.08x |
386| [10gb.tar](http://mattmahoney.net/dc/10gb.html) | 1525 MB/s | 1.03x | 1347 MB/s | 0.91x |
387| sharnd.out.2gb | 3813 MB/s | 0.79x | 3900 MB/s | 0.81x |
388| [enwik9](http://mattmahoney.net/dc/textdata.html) | 1246 MB/s | 1.29x | 967 MB/s | 1.00x |
389| [silesia.tar](http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip) | 1433 MB/s | 1.12x | 1203 MB/s | 0.94x |
390| [enwik10](https://encode.su/threads/3315-enwik10-benchmark-results) | 1284 MB/s | 1.32x | 1010 MB/s | 1.04x |
391
392### Legend
393
394* `S2 Throughput`: Decompression speed of S2 encoded content.
395* `Better Throughput`: Decompression speed of S2 "better" encoded content.
396* `vs. Snappy`: Decompression speed of the preceding throughput column relative to Snappy.
397
398
399While the decompression code hasn't changed, there is a significant speedup in decompression speed.
400S2 prefers longer matches and will typically only find matches that are 6 bytes or longer.
401While this reduces compression a bit, it improves decompression speed.
402
403The "better" compression mode will actively look for shorter matches, which is why it has a decompression speed quite similar to Snappy.
404
405Without assembly decompression is also very fast; single goroutine decompression speed. No assembly:
406
407| File | S2 Throughput | S2 throughput |
408|--------------------------------|---------------|---------------|
409| consensus.db.10gb.s2 | 1.84x | 2289.8 MB/s |
410| 10gb.tar.s2 | 1.30x | 867.07 MB/s |
411| rawstudio-mint14.tar.s2 | 1.66x | 1329.65 MB/s |
412| github-june-2days-2019.json.s2 | 2.36x | 1831.59 MB/s |
413| github-ranks-backup.bin.s2 | 1.73x | 1390.7 MB/s |
414| enwik9.s2 | 1.67x | 681.53 MB/s |
415| adresser.json.s2 | 3.41x | 4230.53 MB/s |
416| silesia.tar.s2 | 1.52x | 811.58 MB/s |
417
418Even though S2 typically compresses better than Snappy, decompression speed is always better.
419
420### Concurrent Stream Decompression
421
422For full stream decompression S2 offers a [DecodeConcurrent](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.DecodeConcurrent)
423that will decode a full stream using multiple goroutines.
424
425Example scaling, AMD Ryzen 3950X, 16 cores, decompression using `s2d -bench=3 <input>`, best of 3:
426
427| Input | `-cpu=1` | `-cpu=2` | `-cpu=4` | `-cpu=8` | `-cpu=16` |
428|-------------------------------------------|------------|------------|------------|------------|-------------|
429| enwik10.snappy | 1098.6MB/s | 1819.8MB/s | 3625.6MB/s | 6910.6MB/s | 10818.2MB/s |
430| enwik10.s2 | 1303.5MB/s | 2606.1MB/s | 4847.9MB/s | 8878.4MB/s | 9592.1MB/s |
431| sofia-air-quality-dataset.tar.snappy | 1302.0MB/s | 2165.0MB/s | 4244.5MB/s | 8241.0MB/s | 12920.5MB/s |
432| sofia-air-quality-dataset.tar.s2 | 1399.2MB/s | 2463.2MB/s | 5196.5MB/s | 9639.8MB/s | 11439.5MB/s |
433| sofia-air-quality-dataset.tar.s2 (no asm) | 837.5MB/s | 1652.6MB/s | 3183.6MB/s | 5945.0MB/s | 9620.7MB/s |
434
435Scaling can be expected to be pretty linear until memory bandwidth is saturated.
436
437For now the DecodeConcurrent can only be used for full streams without seeking or combining with regular reads.
438
439## Block compression
440
441
442When compressing blocks no concurrent compression is performed, just as with Snappy.
443This is because blocks are for smaller payloads and generally will not benefit from concurrent compression.
444
445An important change is that incompressible blocks will be at most 10 bytes bigger than the input.
446In rare, worst-case scenarios Snappy blocks could be significantly bigger than the input.
447
448### Mixed content blocks
449
450The most reliable benchmark uses a wide dataset.
451For this we use [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
45253927 files, total input size: 4,014,735,833 bytes. Single goroutine used.
453
454| * | Input | Output | Reduction | MB/s |
455|-------------------|------------|------------|------------|------------|
456| S2 | 4014735833 | 1059723369 | 73.60% | **936.73** |
457| S2 Better | 4014735833 | 961580539 | 76.05% | 451.10 |
458| S2 Best | 4014735833 | 899182886 | **77.60%** | 46.84 |
459| Snappy | 4014735833 | 1128706759 | 71.89% | 790.15 |
460| S2, Snappy Output | 4014735833 | 1093823291 | 72.75% | 936.60 |
461| LZ4 | 4014735833 | 1063768713 | 73.50% | 452.02 |
462
463S2 delivers both the best single threaded throughput with regular mode and the best compression rate with "best".
464"Better" mode provides the same compression speed as LZ4 with better compression ratio.
465
466When outputting Snappy compatible output it still delivers better throughput (150MB/s more) and better compression.
467
468As can be seen from the other benchmarks decompression should also be easier on the S2 generated output.
469
470Though they cannot be compared due to different decompression speeds here are the speed/size comparisons for
471other Go compressors:
472
473| * | Input | Output | Reduction | MB/s |
474|-------------------|------------|------------|-----------|--------|
475| Zstd Fastest (Go) | 4014735833 | 794608518 | 80.21% | 236.04 |
476| Zstd Best (Go) | 4014735833 | 704603356 | 82.45% | 35.63 |
477| Deflate (Go) l1 | 4014735833 | 871294239 | 78.30% | 214.04 |
478| Deflate (Go) l9 | 4014735833 | 730389060 | 81.81% | 41.17 |
479
480### Standard block compression
481
482Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
483So individual benchmarks should only be seen as a guideline and the overall picture is more important.
484
485These micro-benchmarks are with data in cache and trained branch predictors. For a more realistic benchmark see the mixed content above.
486
487Block compression. Parallel benchmark running on 16 cores, 16 goroutines.
488
489AMD64 assembly is used for both S2 and Snappy.
490
491| Absolute Perf | Snappy size | S2 Size | Snappy Speed | S2 Speed | Snappy dec | S2 dec |
492|-----------------------|-------------|---------|--------------|-------------|-------------|-------------|
493| html | 22843 | 20868 | 16246 MB/s | 18617 MB/s | 40972 MB/s | 49263 MB/s |
494| urls.10K | 335492 | 286541 | 7943 MB/s | 10201 MB/s | 22523 MB/s | 26484 MB/s |
495| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 303228 MB/s | 718321 MB/s | 827552 MB/s |
496| fireworks.jpeg (200B) | 146 | 155 | 8869 MB/s | 20180 MB/s | 33691 MB/s | 52421 MB/s |
497| paper-100k.pdf | 85304 | 84202 | 167546 MB/s | 112988 MB/s | 326905 MB/s | 291944 MB/s |
498| html_x_4 | 92234 | 20870 | 15194 MB/s | 54457 MB/s | 30843 MB/s | 32217 MB/s |
499| alice29.txt | 88034 | 85934 | 5936 MB/s | 6540 MB/s | 12882 MB/s | 20044 MB/s |
500| asyoulik.txt | 77503 | 79575 | 5517 MB/s | 6657 MB/s | 12735 MB/s | 22806 MB/s |
501| lcet10.txt | 234661 | 220383 | 6235 MB/s | 6303 MB/s | 14519 MB/s | 18697 MB/s |
502| plrabn12.txt | 319267 | 318196 | 5159 MB/s | 6074 MB/s | 11923 MB/s | 19901 MB/s |
503| geo.protodata | 23335 | 18606 | 21220 MB/s | 25432 MB/s | 56271 MB/s | 62540 MB/s |
504| kppkn.gtb | 69526 | 65019 | 9732 MB/s | 8905 MB/s | 18491 MB/s | 18969 MB/s |
505| alice29.txt (128B) | 80 | 82 | 6691 MB/s | 17179 MB/s | 31883 MB/s | 38874 MB/s |
506| alice29.txt (1000B) | 774 | 774 | 12204 MB/s | 13273 MB/s | 48056 MB/s | 52341 MB/s |
507| alice29.txt (10000B) | 6648 | 6933 | 10044 MB/s | 12824 MB/s | 32378 MB/s | 46322 MB/s |
508| alice29.txt (20000B) | 12686 | 13516 | 7733 MB/s | 12160 MB/s | 30566 MB/s | 58969 MB/s |
509
510
511Speed is generally at or above Snappy. Small blocks get a significant speedup, although at the expense of size.
512
513Decompression speed is better than Snappy, except in one case.
514
515Since payloads are very small the variance in terms of size is rather big, so they should only be seen as a general guideline.
516
517Size is on average around Snappy, but varies on content type.
518In cases where compression is worse, it usually is compensated by a speed boost.
519
520
521### Better compression
522
523Benchmarking single block performance is subject to a lot more variation since it only tests a limited number of file patterns.
524So individual benchmarks should only be seen as a guideline and the overall picture is more important.
525
526| Absolute Perf | Snappy size | Better Size | Snappy Speed | Better Speed | Snappy dec | Better dec |
527|-----------------------|-------------|-------------|--------------|--------------|-------------|-------------|
528| html | 22843 | 18972 | 16246 MB/s | 8621 MB/s | 40972 MB/s | 40292 MB/s |
529| urls.10K | 335492 | 248079 | 7943 MB/s | 5104 MB/s | 22523 MB/s | 20981 MB/s |
530| fireworks.jpeg | 123034 | 123100 | 349544 MB/s | 84429 MB/s | 718321 MB/s | 823698 MB/s |
531| fireworks.jpeg (200B) | 146 | 149 | 8869 MB/s | 7125 MB/s | 33691 MB/s | 30101 MB/s |
532| paper-100k.pdf | 85304 | 82887 | 167546 MB/s | 11087 MB/s | 326905 MB/s | 198869 MB/s |
533| html_x_4 | 92234 | 18982 | 15194 MB/s | 29316 MB/s | 30843 MB/s | 30937 MB/s |
534| alice29.txt | 88034 | 71611 | 5936 MB/s | 3709 MB/s | 12882 MB/s | 16611 MB/s |
535| asyoulik.txt | 77503 | 65941 | 5517 MB/s | 3380 MB/s | 12735 MB/s | 14975 MB/s |
536| lcet10.txt | 234661 | 184939 | 6235 MB/s | 3537 MB/s | 14519 MB/s | 16634 MB/s |
537| plrabn12.txt | 319267 | 264990 | 5159 MB/s | 2960 MB/s | 11923 MB/s | 13382 MB/s |
538| geo.protodata | 23335 | 17689 | 21220 MB/s | 10859 MB/s | 56271 MB/s | 57961 MB/s |
539| kppkn.gtb | 69526 | 55398 | 9732 MB/s | 5206 MB/s | 18491 MB/s | 16524 MB/s |
540| alice29.txt (128B) | 80 | 78 | 6691 MB/s | 7422 MB/s | 31883 MB/s | 34225 MB/s |
541| alice29.txt (1000B) | 774 | 746 | 12204 MB/s | 5734 MB/s | 48056 MB/s | 42068 MB/s |
542| alice29.txt (10000B) | 6648 | 6218 | 10044 MB/s | 6055 MB/s | 32378 MB/s | 28813 MB/s |
543| alice29.txt (20000B) | 12686 | 11492 | 7733 MB/s | 3143 MB/s | 30566 MB/s | 27315 MB/s |
544
545
546Except for the mostly incompressible JPEG image, compression is better and usually in the
547double digits in terms of percentage reduction over Snappy.
548
549The PDF sample shows a significant slowdown compared to Snappy, as this mode tries harder
550to compress the data. Very small blocks are also not favorable for better compression, so throughput is way down.
551
552This mode aims to provide better compression at the expense of performance and achieves that
553without a huge performance penalty, except on very small blocks.
554
555Decompression speed suffers a little compared to the regular S2 mode,
556but still manages to be close to Snappy in spite of increased compression.
557
558# Best compression mode
559
560S2 offers a "best" compression mode.
561
562This will compress as much as possible with little regard to CPU usage.
563
564Mainly for offline compression, but where decompression speed should still
565be high and compatible with other S2 compressed data.
566
567Some examples compared on 16 core CPU, amd64 assembly used:
568
569```
570* enwik10
571Default... 10000000000 -> 4759950115 [47.60%]; 1.03s, 9263.0MB/s
572Better... 10000000000 -> 4084706676 [40.85%]; 2.16s, 4415.4MB/s
573Best... 10000000000 -> 3615520079 [36.16%]; 42.259s, 225.7MB/s
574
575* github-june-2days-2019.json
576Default... 6273951764 -> 1041700255 [16.60%]; 431ms, 13882.3MB/s
577Better... 6273951764 -> 945841238 [15.08%]; 547ms, 10938.4MB/s
578Best... 6273951764 -> 826392576 [13.17%]; 9.455s, 632.8MB/s
579
580* nyc-taxi-data-10M.csv
581Default... 3325605752 -> 1093516949 [32.88%]; 324ms, 9788.7MB/s
582Better... 3325605752 -> 885394158 [26.62%]; 491ms, 6459.4MB/s
583Best... 3325605752 -> 773681257 [23.26%]; 8.29s, 412.0MB/s
584
585* 10gb.tar
586Default... 10065157632 -> 5915541066 [58.77%]; 1.028s, 9337.4MB/s
587Better... 10065157632 -> 5453844650 [54.19%]; 1.597s, 4862.7MB/s
588Best... 10065157632 -> 5192495021 [51.59%]; 32.78s, 308.2MB/s
589
590* consensus.db.10gb
591Default... 10737418240 -> 4549762344 [42.37%]; 882ms, 12118.4MB/s
592Better... 10737418240 -> 4438535064 [41.34%]; 1.533s, 3500.9MB/s
593Best... 10737418240 -> 4210602774 [39.21%]; 42.96s, 254.4MB/s
594```
595
596Decompression speed should be around the same as using the 'better' compression mode.
597
598## Dictionaries
599
600*Note: S2 dictionary compression is currently at an early implementation stage, with no assembly for
601either encoding or decoding. Performance improvements can be expected in the future.*
602
603Adding dictionaries allows providing a custom dictionary that will serve as lookup in the beginning of blocks.
604
605The same dictionary *must* be used for both encoding and decoding.
606S2 does not keep track of whether the same dictionary is used,
607and using the wrong dictionary will most often not result in an error when decompressing.
608
609Blocks encoded *without* dictionaries can be decompressed seamlessly *with* a dictionary.
610This means it is possible to switch from an encoding without dictionaries to an encoding with dictionaries
611and treat the blocks similarly.
612
613Similar to [zStandard dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression),
614the same usage scenario applies to S2 dictionaries.
615
616> Training works if there is some correlation in a family of small data samples. The more data-specific a dictionary is, the more efficient it is (there is no universal dictionary). Hence, deploying one dictionary per type of data will provide the greatest benefits. Dictionary gains are mostly effective in the first few KB. Then, the compression algorithm will gradually use previously decoded content to better compress the rest of the file.
617
618S2 further limits the dictionary to only be enabled on the first 64KB of a block.
619This will remove any negative (speed) impacts of the dictionaries on bigger blocks.
620
621### Compression
622
623Using the [github_users_sample_set](https://github.com/facebook/zstd/releases/download/v1.1.3/github_users_sample_set.tar.zst)
624and a 64KB dictionary trained with zStandard the following sizes can be achieved.
625
626| | Default | Better | Best |
627|--------------------|------------------|------------------|-----------------------|
628| Without Dictionary | 3362023 (44.92%) | 3083163 (41.19%) | 3057944 (40.86%) |
629| With Dictionary | 921524 (12.31%) | 873154 (11.67%) | 785503 bytes (10.49%) |
630
631So for highly repetitive content, this case provides an almost 3x reduction in size.
632
633For less uniform data we will use the Go source code tree.
634Compressing First 64KB of all `.go` files in `go/src`, Go 1.19.5, 8912 files, 51253563 bytes input:
635
636| | Default | Better | Best |
637|--------------------|-------------------|-------------------|-------------------|
638| Without Dictionary | 22955767 (44.79%) | 20189613 (39.39%) | 19482828 (38.01%) |
639| With Dictionary | 19654568 (38.35%) | 16289357 (31.78%) | 15184589 (29.63%) |
640| Saving/file | 362 bytes | 428 bytes | 472 bytes |
641
642
643### Creating Dictionaries
644
645There are no tools to create dictionaries in S2.
646However, there are multiple ways to create a useful dictionary:
647
648#### Using a Sample File
649
650If your input is very uniform, you can just use a sample file as the dictionary.
651
652For example in the `github_users_sample_set` above, the average compression only goes up from
65310.49% to 11.48% by using the first file as dictionary compared to using a dedicated dictionary.
654
655```Go
656 // Read a sample
657 sample, err := os.ReadFile("sample.json")
658
659 // Create a dictionary.
660 dict := s2.MakeDict(sample, nil)
661
662 // b := dict.Bytes() will provide a dictionary that can be saved
663 // and reloaded with s2.NewDict(b).
664
665 // To encode:
666 encoded := dict.Encode(nil, file)
667
668 // To decode:
669 decoded, err := dict.Decode(nil, file)
670```
671
672#### Using Zstandard
673
674Zstandard dictionaries can easily be converted to S2 dictionaries.
675
676This can be helpful to generate dictionaries for files that don't have a fixed structure.
677
678
679Example, with training set files placed in `./training-set`:
680
681`λ zstd -r --train-fastcover training-set/* --maxdict=65536 -o name.dict`
682
683This will create a dictionary of 64KB, that can be converted to an S2 dictionary like this:
684
685```Go
686 // Decode the Zstandard dictionary.
687 insp, err := zstd.InspectDictionary(zdict)
688 if err != nil {
689 panic(err)
690 }
691
692 // We are only interested in the contents.
693 // Assume that files start with "// Copyright (c) 2023".
694 // Search for the longest match for that.
695 // This may save a few bytes.
696 dict := s2.MakeDict(insp.Content(), []byte("// Copyright (c) 2023"))
697
698 // b := dict.Bytes() will provide a dictionary that can be saved
699 // and reloaded with s2.NewDict(b).
700
701 // We can now encode using this dictionary
702 encodedWithDict := dict.Encode(nil, payload)
703
704 // To decode content:
705 decoded, err := dict.Decode(nil, encodedWithDict)
706```
707
708It is recommended to save the dictionary returned by `b := dict.Bytes()`, since that will contain only the S2 dictionary.
709
710This dictionary can later be loaded using `s2.NewDict(b)`. The dictionary then no longer requires `zstd` to be initialized.
711
712Also note how `s2.MakeDict` allows you to search for a common starting sequence of your files.
713This can be omitted, at the expense of a few bytes.
714
715# Snappy Compatibility
716
717S2 now offers full compatibility with Snappy.
718
719This means that the efficient encoders of S2 can be used to generate fully Snappy compatible output.
720
721There is a [snappy](https://github.com/klauspost/compress/tree/master/snappy) package that can be used by
722simply changing imports from `github.com/golang/snappy` to `github.com/klauspost/compress/snappy`.
723This uses "better" mode for all operations.
724If you would like more control, you can use the s2 package as described below:
725
726## Blocks
727
728Snappy compatible blocks can be generated with the S2 encoder.
729Compression and speed are typically a bit better. `MaxEncodedLen` is also smaller, for lower memory usage. Replace:
730
731| Snappy | S2 replacement |
732|---------------------------|-----------------------|
733| snappy.Encode(...) | s2.EncodeSnappy(...) |
734| snappy.MaxEncodedLen(...) | s2.MaxEncodedLen(...) |
735
736`s2.EncodeSnappy` can be replaced with `s2.EncodeSnappyBetter` or `s2.EncodeSnappyBest` to get more efficiently compressed snappy compatible output.
737
738`s2.ConcatBlocks` is compatible with snappy blocks.
739
740Comparison of [`webdevdata.org-2015-01-07-subset`](https://files.klauspost.com/compress/webdevdata.org-2015-01-07-4GB-subset.7z),
74153927 files, total input size: 4,014,735,833 bytes. amd64, single goroutine used:
742
743| Encoder | Size | MB/s | Reduction |
744|-----------------------|------------|------------|------------|
745| snappy.Encode | 1128706759 | 725.59 | 71.89% |
746| s2.EncodeSnappy | 1093823291 | **899.16** | 72.75% |
747| s2.EncodeSnappyBetter | 1001158548 | 578.49 | 75.06% |
748| s2.EncodeSnappyBest | 944507998 | 66.00 | **76.47%** |
749
750## Streams
751
752For streams, replace `enc = snappy.NewBufferedWriter(w)` with `enc = s2.NewWriter(w, s2.WriterSnappyCompat())`.
753All other options are available, but note that block size limit is different for snappy.
754
755Comparison of different streams, AMD Ryzen 3950x, 16 cores. Size and throughput:
756
757| File | snappy.NewWriter | S2 Snappy | S2 Snappy, Better | S2 Snappy, Best |
758|-----------------------------|--------------------------|---------------------------|--------------------------|-------------------------|
759| nyc-taxi-data-10M.csv | 1316042016 - 539.47MB/s | 1307003093 - 10132.73MB/s | 1174534014 - 5002.44MB/s | 1115904679 - 177.97MB/s |
760| enwik10 (xml) | 5088294643 - 451.13MB/s | 5175840939 - 9440.69MB/s | 4560784526 - 4487.21MB/s | 4340299103 - 158.92MB/s |
761| 10gb.tar (mixed) | 6056946612 - 729.73MB/s | 6208571995 - 9978.05MB/s | 5741646126 - 4919.98MB/s | 5548973895 - 180.44MB/s |
762| github-june-2days-2019.json | 1525176492 - 933.00MB/s | 1476519054 - 13150.12MB/s | 1400547532 - 5803.40MB/s | 1321887137 - 204.29MB/s |
763| consensus.db.10gb (db) | 5412897703 - 1102.14MB/s | 5354073487 - 13562.91MB/s | 5335069899 - 5294.73MB/s | 5201000954 - 175.72MB/s |
764
765# Decompression
766
767All decompression functions map directly to equivalent s2 functions.
768
769| Snappy | S2 replacement |
770|------------------------|--------------------|
771| snappy.Decode(...) | s2.Decode(...) |
772| snappy.DecodedLen(...) | s2.DecodedLen(...) |
773| snappy.NewReader(...) | s2.NewReader(...) |
774
775Features like [quick forward skipping without decompression](https://pkg.go.dev/github.com/klauspost/compress/s2#Reader.Skip)
776are also available for Snappy streams.
777
778If you know you are only decompressing snappy streams, setting [`ReaderMaxBlockSize(64<<10)`](https://pkg.go.dev/github.com/klauspost/compress/s2#ReaderMaxBlockSize)
779on your Reader will reduce memory consumption.
780
781# Concatenating blocks and streams.
782
783Concatenating streams will concatenate the output of both without recompressing them.
784While this is inefficient in terms of compression it might be usable in certain scenarios.
785The 10 byte 'stream identifier' of the second stream can optionally be stripped, but it is not a requirement.
786
787Blocks can be concatenated using the `ConcatBlocks` function.
788
789Snappy blocks/streams can safely be concatenated with S2 blocks and streams.
790Streams with indexes (see below) will currently not work on concatenated streams.
791
792# Stream Seek Index
793
794S2 and Snappy streams can have indexes. These indexes will allow random seeking within the compressed data.
795
796The index can either be appended to the stream as a skippable block or returned for separate storage.
797
798When the index is appended to a stream it will be skipped by regular decoders,
799so the output remains compatible with other decoders.
800
801## Creating an Index
802
803To automatically add an index to a stream, add `WriterAddIndex()` option to your writer.
804Then the index will be added to the stream when `Close()` is called.
805
806```
807 // Add Index to stream...
808 enc := s2.NewWriter(w, s2.WriterAddIndex())
809 io.Copy(enc, r)
810 enc.Close()
811```
812
813If you want to store the index separately, you can use `CloseIndex()` instead of the regular `Close()`.
814This will return the index. Note that `CloseIndex()` should only be called once, and you shouldn't call `Close()`.
815
816```
817 // Get index for separate storage...
818 enc := s2.NewWriter(w)
819 io.Copy(enc, r)
820 index, err := enc.CloseIndex()
821```
822
823The `index` can then be used without needing to read from the stream.
824This means the index can be used without needing to seek to the end of the stream
825or for manually forwarding streams. See below.
826
827Finally, an existing S2/Snappy stream can be indexed using the `s2.IndexStream(r io.Reader)` function.
828
829## Using Indexes
830
831To use indexes there is a `ReadSeeker(random bool, index []byte) (*ReadSeeker, error)` function available.
832
833Calling ReadSeeker will return an [io.ReadSeeker](https://pkg.go.dev/io#ReadSeeker) compatible version of the reader.
834
835If 'random' is specified the returned io.Seeker can be used for random seeking, otherwise only forward seeking is supported.
836Enabling random seeking requires the original input to support the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
837
838```
839 dec := s2.NewReader(r)
840 rs, err := dec.ReadSeeker(false, nil)
841 rs.Seek(wantOffset, io.SeekStart)
842```
843
844Get a seeker to seek forward. Since no index is provided, the index is read from the stream.
845This requires that an index was added and that `r` supports the [io.Seeker](https://pkg.go.dev/io#Seeker) interface.
846
847A custom index can be specified which will be used if supplied.
848When using a custom index, it will not be read from the input stream.
849
850```
851 dec := s2.NewReader(r)
852 rs, err := dec.ReadSeeker(false, index)
853 rs.Seek(wantOffset, io.SeekStart)
854```
855
856This will read the index from `index`. Since we specify non-random (forward only) seeking, `r` does not have to be an io.Seeker.
857
858```
859 dec := s2.NewReader(r)
860 rs, err := dec.ReadSeeker(true, index)
861 rs.Seek(wantOffset, io.SeekStart)
862```
863
864Finally, since we specify that we want to do random seeking `r` must be an io.Seeker.
865
866The returned [ReadSeeker](https://pkg.go.dev/github.com/klauspost/compress/s2#ReadSeeker) contains a shallow reference to the existing Reader,
867meaning changes performed to one are reflected in the other.
868
869To check if a stream contains an index at the end, the `(*Index).LoadStream(rs io.ReadSeeker) error` can be used.
870
871## Manually Forwarding Streams
872
873Indexes can also be read outside the decoder using the [Index](https://pkg.go.dev/github.com/klauspost/compress/s2#Index) type.
874This can be used for parsing indexes, either separate or in streams.
875
876In some cases it may not be possible to serve a seekable stream.
877This can for instance be an HTTP stream, where the Range request
878is sent at the start of the stream.
879
880With a little bit of extra code it is still possible to use indexes
881to forward to a specific offset with a single forward skip.
882
883It is possible to load the index manually like this:
884```
885 var index s2.Index
886 _, err = index.Load(idxBytes)
887```
888
889This can be used to figure out how much to offset the compressed stream:
890
891```
892 compressedOffset, uncompressedOffset, err := index.Find(wantOffset)
893```
894
895The `compressedOffset` is the number of bytes that should be skipped
896from the beginning of the compressed file.
897
898The `uncompressedOffset` will then be the offset of the uncompressed bytes returned
899when decoding from that position. This will always be <= wantOffset.
900
901When creating a decoder it must be specified that it should *not* expect a stream identifier
902at the beginning of the stream. Assuming the io.Reader `r` has been forwarded to `compressedOffset`
903we create the decoder like this:
904
905```
906 dec := s2.NewReader(r, s2.ReaderIgnoreStreamIdentifier())
907```
908
909We are not completely done. We still need to forward the stream past the uncompressed bytes we didn't want.
910This is done using the regular "Skip" function:
911
912```
913 err = dec.Skip(wantOffset - uncompressedOffset)
914```
915
916This will ensure that we are at exactly the offset we want, and reading from `dec` will start at the requested offset.
917
918# Compact storage
919
920For compact storage [RemoveIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RemoveIndexHeaders) can be used to remove any redundant info from
921a serialized index. If you remove the header it must be restored before [Loading](https://pkg.go.dev/github.com/klauspost/compress/s2#Index.Load).
922
923This is expected to save 20 bytes. These can be restored using [RestoreIndexHeaders](https://pkg.go.dev/github.com/klauspost/compress/s2#RestoreIndexHeaders). This removes a layer of security, but is the most compact representation. Returns nil if headers contain errors.
924
925## Index Format:
926
927Each block is structured as a snappy skippable block, with the chunk ID 0x99.
928
929The block can be read from the front, but contains information so it can be read from the back as well.
930
931Numbers are stored as fixed size little endian values or [zigzag encoded](https://developers.google.com/protocol-buffers/docs/encoding#signed_integers) [base 128 varints](https://developers.google.com/protocol-buffers/docs/encoding),
932with un-encoded value length of 64 bits, unless other limits are specified.
933
934| Content | Format |
935|--------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
936| ID, `[1]byte` | Always 0x99. |
937| Data Length, `[3]byte` | 3 byte little-endian length of the chunk in bytes, following this. |
938| Header `[6]byte` | Header, must be `[115, 50, 105, 100, 120, 0]` or in text: "s2idx\x00". |
939| UncompressedSize, Varint | Total Uncompressed size. |
940| CompressedSize, Varint | Total Compressed size if known. Should be -1 if unknown. |
941| EstBlockSize, Varint | Block Size, used for guessing uncompressed offsets. Must be >= 0. |
942| Entries, Varint | Number of Entries in index, must be < 65536 and >=0. |
943| HasUncompressedOffsets `byte` | 0 if no uncompressed offsets are present, 1 if present. Other values are invalid. |
944| UncompressedOffsets, [Entries]VarInt | Uncompressed offsets. See below how to decode. |
945| CompressedOffsets, [Entries]VarInt | Compressed offsets. See below how to decode. |
946| Block Size, `[4]byte` | Little Endian total encoded size (including header and trailer). Can be used for searching backwards to start of block. |
947| Trailer `[6]byte` | Trailer, must be `[0, 120, 100, 105, 50, 115]` or in text: "\x00xdi2s". Can be used for identifying block from end of stream. |
948
949For regular streams the uncompressed offsets are fully predictable,
950so setting `HasUncompressedOffsets` to 0 specifies that compressed blocks all have
951exactly `EstBlockSize` bytes of uncompressed content.
952
953Entries *must* be in order, starting with the lowest offset,
954and there *must* be no uncompressed offset duplicates.
955Entries *may* point to the start of a skippable block,
956but it is then not allowed to also have an entry for the next block since
957that would give an uncompressed offset duplicate.
958
959There is no requirement for all blocks to be represented in the index.
960In fact there is a maximum of 65536 block entries in an index.
961
962The writer can use any method to reduce the number of entries.
963An implicit block start at 0,0 can be assumed.
964
965### Decoding entries:
966
967```
968// Read Uncompressed entries.
969// Each assumes EstBlockSize delta from previous.
970for each entry {
971 uOff = 0
972 if HasUncompressedOffsets == 1 {
973 uOff = ReadVarInt // Read value from stream
974 }
975
976 // Except for the first entry, use previous values.
977 if entryNum == 0 {
978 entry[entryNum].UncompressedOffset = uOff
979 continue
980 }
981
982 // Uncompressed uses previous offset and adds EstBlockSize
983 entry[entryNum].UncompressedOffset = entry[entryNum-1].UncompressedOffset + EstBlockSize + uOff
984}
985
986
987// Guess that the first block will be 50% of uncompressed size.
988// Integer truncating division must be used.
989CompressGuess := EstBlockSize / 2
990
991// Read Compressed entries.
992// Each assumes CompressGuess delta from previous.
993// CompressGuess is adjusted for each value.
994for each entry {
995 cOff = ReadVarInt // Read value from stream
996
997 // Except for the first entry, use previous values.
998 if entryNum == 0 {
999 entry[entryNum].CompressedOffset = cOff
1000 continue
1001 }
1002
1003 // Compressed uses previous and our estimate.
1004 entry[entryNum].CompressedOffset = entry[entryNum-1].CompressedOffset + CompressGuess + cOff
1005
1006 // Adjust compressed offset for next loop, integer truncating division must be used.
1007 CompressGuess += cOff/2
1008}
1009```
1010
1011To decode from any given uncompressed offset `(wantOffset)`:
1012
1013* Iterate entries until `entry[n].UncompressedOffset > wantOffset`.
1014* Start decoding from `entry[n-1].CompressedOffset`.
1015* Discard `wantOffset - entry[n-1].UncompressedOffset` bytes from the decoded stream.
1016
1017See [using indexes](https://github.com/klauspost/compress/tree/master/s2#using-indexes) for functions that perform the operations with a simpler interface.
1018
1019
1020# Format Extensions
1021
1022* Frame [Stream identifier](https://github.com/google/snappy/blob/master/framing_format.txt#L68) changed from `sNaPpY` to `S2sTwO`.
1023* [Framed compressed blocks](https://github.com/google/snappy/blob/master/format_description.txt) can be up to 4MB (up from 64KB).
1024* Compressed blocks can have an offset of `0`, which indicates to repeat the last seen offset.
1025
1026Repeat offsets must be encoded as a [2.2.1. Copy with 1-byte offset (01)](https://github.com/google/snappy/blob/master/format_description.txt#L89), where the offset is 0.
1027
1028The length is specified by reading the 3-bit length specified in the tag and decode using this table:
1029
1030| Length | Actual Length |
1031|--------|----------------------|
1032| 0 | 4 |
1033| 1 | 5 |
1034| 2 | 6 |
1035| 3 | 7 |
1036| 4 | 8 |
1037| 5 | 8 + read 1 byte |
1038| 6 | 260 + read 2 bytes |
1039| 7 | 65540 + read 3 bytes |
1040
1041This allows any repeat offset + length to be represented by 2 to 5 bytes.
1042It also allows emitting matches longer than 64 bytes with one copy + one repeat instead of several 64 byte copies.
1043
1044Lengths are stored as little endian values.
1045
1046The first copy of a block cannot be a repeat offset and the offset is reset on every block in streams.
1047
1048Default streaming block size is 1MB.
1049
1050# Dictionary Encoding
1051
1052Adding dictionaries allows providing a custom dictionary that will serve as lookup in the beginning of blocks.
1053
1054A dictionary provides an initial repeat value that can be used to point to a common header.
1055
1056Other than that the dictionary contains values that can be used as back-references.
1057
1058Often used data should be placed at the *end* of the dictionary since offsets < 2048 bytes will be smaller.
1059
1060## Format
1061
1062Dictionary *content* must be at least 16 bytes and less than or equal to 64KiB (65536 bytes).
1063
1064Encoding: `[repeat value (uvarint)][dictionary content...]`
1065
1066Before the dictionary content, an unsigned base-128 (uvarint) encoded value specifying the initial repeat offset.
1067This value is an offset into the dictionary content and not a back-reference offset,
1068so setting this to 0 will make the repeat value point to the first value of the dictionary.
1069
1070The value must be less than the dictionary length-8
1071
1072## Encoding
1073
1074From the decoder point of view the dictionary content is seen as preceding the encoded content.
1075
1076`[dictionary content][decoded output]`
1077
1078Backreferences to the dictionary are encoded as ordinary backreferences that have an offset before the start of the decoded block.
1079
1080Matches copying from the dictionary are **not** allowed to cross from the dictionary into the decoded data.
1081However, if a copy ends at the end of the dictionary the next repeat will point to the start of the decoded buffer, which is allowed.
1082
1083The first match can be a repeat value, which will use the repeat offset stored in the dictionary.
1084
1085When 64KB (65536 bytes) has been en/decoded it is no longer allowed to reference the dictionary,
1086neither by a copy nor repeat operations.
1087If the boundary is crossed while copying from the dictionary, the operation should complete,
1088but the next instruction is not allowed to reference the dictionary.
1089
1090Valid blocks encoded *without* a dictionary can be decoded with any dictionary.
1091There are no checks whether the supplied dictionary is the correct one for a block.
1092Because of this there is no overhead by using a dictionary.
1093
1094## Example
1095
1096This is the dictionary content. Elements are separated by `[]`.
1097
1098Dictionary: `[0x0a][Yesterday 25 bananas were added to Benjamins brown bag]`.
1099
1100Initial repeat offset is set at 10, which is the character `2`.
1101
1102Encoded `[LIT "10"][REPEAT len=10][LIT "hich"][MATCH off=50 len=6][MATCH off=31 len=6][MATCH off=61 len=10]`
1103
1104Decoded: `[10][ bananas w][hich][ were ][brown ][were added]`
1105
1106Output: `10 bananas which were brown were added`
1107
1108
1109## Streams
1110
1111For streams each block can use the dictionary.
1112
1113The dictionary cannot currently be provided on the stream.
1114
1115
1116# LICENSE
1117
1118This code is based on the [Snappy-Go](https://github.com/golang/snappy) implementation.
1119
1120Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/s2/decode.go b/vendor/github.com/klauspost/compress/s2/decode.go
new file mode 100644
index 0000000..6c7feaf
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode.go
@@ -0,0 +1,437 @@
1// Copyright 2011 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6package s2
7
8import (
9 "encoding/binary"
10 "errors"
11 "fmt"
12 "strconv"
13)
14
// Sentinel errors returned by the block and stream decoders.
var (
	// ErrCorrupt is returned when the input is invalid.
	ErrCorrupt = errors.New("s2: corrupt input")
	// ErrCRC is returned when the input failed CRC validation (streams only).
	ErrCRC = errors.New("s2: corrupt input, crc mismatch")
	// ErrTooLarge is returned when the uncompressed length is too large.
	ErrTooLarge = errors.New("s2: decoded block is too large")
	// ErrUnsupported is returned when the input isn't supported.
	ErrUnsupported = errors.New("s2: unsupported input")
)

// DecodedLen reports the decoded size of the block in src, as stated by the
// length header at the start of the block.
func DecodedLen(src []byte) (int, error) {
	blockLen, _, err := decodedLen(src)
	return blockLen, err
}

// decodedLen parses the uvarint length header at the start of src.
//
// It returns the decoded block length together with the number of bytes the
// header itself occupied. A missing, truncated or overlong header, or a
// length above 0xffffffff, yields ErrCorrupt. A length that does not fit a
// 32-bit int on a 32-bit platform yields ErrTooLarge.
func decodedLen(src []byte) (blockLen, headerLen int, err error) {
	// intBits is the width of int on this platform: 32 or 64.
	const intBits = 32 << (^uint(0) >> 32 & 1)

	size, used := binary.Uvarint(src)
	switch {
	case used <= 0 || size > 0xffffffff:
		return 0, 0, ErrCorrupt
	case intBits == 32 && size > 0x7fffffff:
		return 0, 0, ErrTooLarge
	}
	return int(size), used, nil
}
46
47const (
48 decodeErrCodeCorrupt = 1 // non-zero decode result; s2DecodeDict returns it for every malformed-input condition it detects
49)
50
51// Decode returns the decoded form of src. The returned slice may be a sub-
52// slice of dst if dst was large enough to hold the entire decoded block.
53// Otherwise, a newly allocated slice will be returned.
54//
55// The dst and src must not overlap. It is valid to pass a nil dst.
56func Decode(dst, src []byte) ([]byte, error) {
57 dLen, s, err := decodedLen(src)
58 if err != nil {
59 return nil, err
60 }
61 if dLen <= cap(dst) {
62 dst = dst[:dLen]
63 } else {
64 dst = make([]byte, dLen)
65 }
66 if s2Decode(dst, src[s:]) != 0 {
67 return nil, ErrCorrupt
68 }
69 return dst, nil
70}
71
72// s2DecodeDict writes the decoding of src to dst, resolving matches that start before dst[0] against dict.
73// It assumes that the varint-encoded length of the decompressed bytes has already been read, and that len(dst)
74// equals that length. A nil dict delegates to s2Decode.
75//
76// It returns 0 on success or a decodeErrCodeXxx error code on failure.
77func s2DecodeDict(dst, src []byte, dict *Dict) int {
78 if dict == nil {
79 return s2Decode(dst, src)
80 }
81 const debug = false // flip to true to print every decoded operation
82 const debugErrs = debug // when true, prints why the input was rejected before returning an error
83
84 if debug {
85 fmt.Println("Starting decode, dst len:", len(dst))
86 }
87 var d, s, length int // d: write index into dst, s: read index into src, length: size of the current operation
88 offset := len(dict.dict) - dict.repeat // initial repeat offset: distance from dict.repeat to the end of the dictionary
89
90 // Fast loop: while more than 5 bytes of src remain, a tag and its extra bytes are read without explicit range checks on s.
91 for s < len(src)-5 {
92 // Removing bounds checks is SLOWER when doing
93 // in := src[s:s+5]
94 // Checked on Go 1.18
95 switch src[s] & 0x03 {
96 case tagLiteral:
97 x := uint32(src[s] >> 2)
98 switch {
99 case x < 60:
100 s++
101 case x == 60:
102 s += 2
103 x = uint32(src[s-1])
104 case x == 61:
105 in := src[s : s+3]
106 x = uint32(in[1]) | uint32(in[2])<<8
107 s += 3
108 case x == 62:
109 in := src[s : s+4]
110 // Load as 32 bit and shift down.
111 x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
112 x >>= 8
113 s += 4
114 case x == 63:
115 in := src[s : s+5]
116 x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24
117 s += 5
118 }
119 length = int(x) + 1
120 if debug {
121 fmt.Println("literals, length:", length, "d-after:", d+length)
122 }
123 if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
124 if debugErrs {
125 fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s)
126 }
127 return decodeErrCodeCorrupt
128 }
129
130 copy(dst[d:], src[s:s+length])
131 d += length
132 s += length
133 continue
134
135 case tagCopy1:
136 s += 2
137 toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
138 length = int(src[s-2]) >> 2 & 0x7
139 if toffset == 0 {
140 if debug {
141 fmt.Print("(repeat) ")
142 }
143 // Offset 0 marks a repeat: keep the last offset. Tag lengths 5, 6 and 7 read 1, 2 or 3 extra length bytes.
144 switch length {
145 case 5:
146 length = int(src[s]) + 4
147 s += 1
148 case 6:
149 in := src[s : s+2]
150 length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)
151 s += 2
152 case 7:
153 in := src[s : s+3]
154 length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16)
155 s += 3
156 default: // 0-> 4
157 }
158 } else {
159 offset = toffset
160 }
161 length += 4
162 case tagCopy2:
163 in := src[s : s+3]
164 offset = int(uint32(in[1]) | uint32(in[2])<<8)
165 length = 1 + int(in[0])>>2
166 s += 3
167
168 case tagCopy4:
169 in := src[s : s+5]
170 offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24)
171 length = 1 + int(in[0])>>2
172 s += 5
173 }
174
175 if offset <= 0 || length > len(dst)-d {
176 if debugErrs {
177 fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d)
178 }
179 return decodeErrCodeCorrupt
180 }
181
182 // The match starts before dst[0], so it is copied from dict; this is rejected once d > MaxDictSrcOffset.
183 if d < offset {
184 if d > MaxDictSrcOffset {
185 if debugErrs {
186 fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length)
187 }
188 return decodeErrCodeCorrupt
189 }
190 startOff := len(dict.dict) - offset + d
191 if startOff < 0 || startOff+length > len(dict.dict) {
192 if debugErrs {
193 fmt.Printf("offset (%d) + length (%d) bigger than dict (%d)\n", offset, length, len(dict.dict))
194 }
195 return decodeErrCodeCorrupt
196 }
197 if debug {
198 fmt.Println("dict copy, length:", length, "offset:", offset, "d-after:", d+length, "dict start offset:", startOff)
199 }
200 copy(dst[d:d+length], dict.dict[startOff:])
201 d += length
202 continue
203 }
204
205 if debug {
206 fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
207 }
208
209 // Copy from an earlier sub-slice of dst to a later sub-slice.
210 // If no overlap, use the built-in copy:
211 if offset > length {
212 copy(dst[d:d+length], dst[d-offset:])
213 d += length
214 continue
215 }
216
217 // Unlike the built-in copy function, this byte-by-byte copy always runs
218 // forwards, even if the slices overlap. Conceptually, this is:
219 //
220 // d += forwardCopy(dst[d:d+length], dst[d-offset:])
221 //
222 // We align the slices into a and b and show the compiler they are the same size.
223 // This allows the loop to run without bounds checks.
224 a := dst[d : d+length]
225 b := dst[d-offset:]
226 b = b[:len(a)]
227 for i := range a {
228 a[i] = b[i]
229 }
230 d += length
231 }
232
233 // Slow loop: decode the remaining bytes of src, checking s against len(src) after every advance.
234 for s < len(src) {
235 switch src[s] & 0x03 {
236 case tagLiteral:
237 x := uint32(src[s] >> 2)
238 switch {
239 case x < 60:
240 s++
241 case x == 60:
242 s += 2
243 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
244 if debugErrs {
245 fmt.Println("src went oob")
246 }
247 return decodeErrCodeCorrupt
248 }
249 x = uint32(src[s-1])
250 case x == 61:
251 s += 3
252 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
253 if debugErrs {
254 fmt.Println("src went oob")
255 }
256 return decodeErrCodeCorrupt
257 }
258 x = uint32(src[s-2]) | uint32(src[s-1])<<8
259 case x == 62:
260 s += 4
261 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
262 if debugErrs {
263 fmt.Println("src went oob")
264 }
265 return decodeErrCodeCorrupt
266 }
267 x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
268 case x == 63:
269 s += 5
270 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
271 if debugErrs {
272 fmt.Println("src went oob")
273 }
274 return decodeErrCodeCorrupt
275 }
276 x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
277 }
278 length = int(x) + 1
279 if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
280 if debugErrs {
281 fmt.Println("corrupt literal: length:", length, "d-left:", len(dst)-d, "src-left:", len(src)-s)
282 }
283 return decodeErrCodeCorrupt
284 }
285 if debug {
286 fmt.Println("literals, length:", length, "d-after:", d+length)
287 }
288
289 copy(dst[d:], src[s:s+length])
290 d += length
291 s += length
292 continue
293
294 case tagCopy1:
295 s += 2
296 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
297 if debugErrs {
298 fmt.Println("src went oob")
299 }
300 return decodeErrCodeCorrupt
301 }
302 length = int(src[s-2]) >> 2 & 0x7
303 toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
304 if toffset == 0 {
305 if debug {
306 fmt.Print("(repeat) ")
307 }
308 // Offset 0 marks a repeat: keep the last offset. Tag lengths 5, 6 and 7 read 1, 2 or 3 extra length bytes.
309 switch length {
310 case 5:
311 s += 1
312 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
313 if debugErrs {
314 fmt.Println("src went oob")
315 }
316 return decodeErrCodeCorrupt
317 }
318 length = int(uint32(src[s-1])) + 4
319 case 6:
320 s += 2
321 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
322 if debugErrs {
323 fmt.Println("src went oob")
324 }
325 return decodeErrCodeCorrupt
326 }
327 length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
328 case 7:
329 s += 3
330 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
331 if debugErrs {
332 fmt.Println("src went oob")
333 }
334 return decodeErrCodeCorrupt
335 }
336 length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
337 default: // 0-> 4
338 }
339 } else {
340 offset = toffset
341 }
342 length += 4
343 case tagCopy2:
344 s += 3
345 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
346 if debugErrs {
347 fmt.Println("src went oob")
348 }
349 return decodeErrCodeCorrupt
350 }
351 length = 1 + int(src[s-3])>>2
352 offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
353
354 case tagCopy4:
355 s += 5
356 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
357 if debugErrs {
358 fmt.Println("src went oob")
359 }
360 return decodeErrCodeCorrupt
361 }
362 length = 1 + int(src[s-5])>>2
363 offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
364 }
365
366 if offset <= 0 || length > len(dst)-d {
367 if debugErrs {
368 fmt.Println("match error; offset:", offset, "length:", length, "dst-left:", len(dst)-d)
369 }
370 return decodeErrCodeCorrupt
371 }
372
373 // The match starts before dst[0], so it is copied from dict; this is rejected once d > MaxDictSrcOffset.
374 if d < offset {
375 if d > MaxDictSrcOffset {
376 if debugErrs {
377 fmt.Println("dict after", MaxDictSrcOffset, "d:", d, "offset:", offset, "length:", length)
378 }
379 return decodeErrCodeCorrupt
380 }
381 rOff := len(dict.dict) - (offset - d)
382 if debug {
383 fmt.Println("starting dict entry from dict offset", len(dict.dict)-rOff)
384 }
385 if rOff+length > len(dict.dict) {
386 if debugErrs {
387 fmt.Println("err: END offset", rOff+length, "bigger than dict", len(dict.dict), "dict offset:", rOff, "length:", length)
388 }
389 return decodeErrCodeCorrupt
390 }
391 if rOff < 0 {
392 if debugErrs {
393 fmt.Println("err: START offset", rOff, "less than 0", len(dict.dict), "dict offset:", rOff, "length:", length)
394 }
395 return decodeErrCodeCorrupt
396 }
397 copy(dst[d:d+length], dict.dict[rOff:])
398 d += length
399 continue
400 }
401
402 if debug {
403 fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
404 }
405
406 // Copy from an earlier sub-slice of dst to a later sub-slice.
407 // If no overlap, use the built-in copy:
408 if offset > length {
409 copy(dst[d:d+length], dst[d-offset:])
410 d += length
411 continue
412 }
413
414 // Unlike the built-in copy function, this byte-by-byte copy always runs
415 // forwards, even if the slices overlap. Conceptually, this is:
416 //
417 // d += forwardCopy(dst[d:d+length], dst[d-offset:])
418 //
419 // We align the slices into a and b and show the compiler they are the same size.
420 // This allows the loop to run without bounds checks.
421 a := dst[d : d+length]
422 b := dst[d-offset:]
423 b = b[:len(a)]
424 for i := range a {
425 a[i] = b[i]
426 }
427 d += length
428 }
429
430 if d != len(dst) { // the decoded output must fill dst exactly
431 if debugErrs {
432 fmt.Println("wanted length", len(dst), "got", d)
433 }
434 return decodeErrCodeCorrupt
435 }
436 return 0
437}
diff --git a/vendor/github.com/klauspost/compress/s2/decode_amd64.s b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
new file mode 100644
index 0000000..9b105e0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_amd64.s
@@ -0,0 +1,568 @@
1// Copyright 2016 The Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// +build !appengine
7// +build gc
8// +build !noasm
9
10#include "textflag.h"
11
12#define R_TMP0 AX
13#define R_TMP1 BX
14#define R_LEN CX
15#define R_OFF DX
16#define R_SRC SI
17#define R_DST DI
18#define R_DBASE R8
19#define R_DLEN R9
20#define R_DEND R10
21#define R_SBASE R11
22#define R_SLEN R12
23#define R_SEND R13
24#define R_TMP2 R14
25#define R_TMP3 R15
26
27// The asm code generally follows the pure Go code in decode_other.go, except
28// where marked with a "!!!".
29
30// func decode(dst, src []byte) int
31//
32// All local variables fit into registers. The non-zero stack size is only to
33// spill registers and push args when issuing a CALL. The register allocation:
34// - R_TMP0 scratch
35// - R_TMP1 scratch
36// - R_LEN length or x (shared)
37// - R_OFF offset
38// - R_SRC &src[s]
39// - R_DST &dst[d]
40// + R_DBASE dst_base
41// + R_DLEN dst_len
42// + R_DEND dst_base + dst_len
43// + R_SBASE src_base
44// + R_SLEN src_len
45// + R_SEND src_base + src_len
46// - R_TMP2 used by doCopy
47// - R_TMP3 used by doCopy
48//
49// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
50// function, and after a CALL returns, and are not otherwise modified.
51//
52// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
53// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
54TEXT ·s2Decode(SB), NOSPLIT, $48-56
55 // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
56 MOVQ dst_base+0(FP), R_DBASE
57 MOVQ dst_len+8(FP), R_DLEN
58 MOVQ R_DBASE, R_DST
59 MOVQ R_DBASE, R_DEND
60 ADDQ R_DLEN, R_DEND
61 MOVQ src_base+24(FP), R_SBASE
62 MOVQ src_len+32(FP), R_SLEN
63 MOVQ R_SBASE, R_SRC
64 MOVQ R_SBASE, R_SEND
65 ADDQ R_SLEN, R_SEND
66 XORQ R_OFF, R_OFF
67
68loop:
69 // for s < len(src)
70 CMPQ R_SRC, R_SEND
71 JEQ end
72
73 // R_LEN = uint32(src[s])
74 //
75 // switch src[s] & 0x03
76 MOVBLZX (R_SRC), R_LEN
77 MOVL R_LEN, R_TMP1
78 ANDL $3, R_TMP1
79 CMPL R_TMP1, $1
80 JAE tagCopy
81
82 // ----------------------------------------
83 // The code below handles literal tags.
84
85 // case tagLiteral:
86 // x := uint32(src[s] >> 2)
87 // switch
88 SHRL $2, R_LEN
89 CMPL R_LEN, $60
90 JAE tagLit60Plus
91
92 // case x < 60:
93 // s++
94 INCQ R_SRC
95
96doLit:
97 // This is the end of the inner "switch", when we have a literal tag.
98 //
99 // We assume that R_LEN == x and x fits in a uint32, where x is the variable
100 // used in the pure Go decode_other.go code.
101
102 // length = int(x) + 1
103 //
104 // Unlike the pure Go code, we don't need to check if length <= 0 because
105 // R_LEN can hold 64 bits, so the increment cannot overflow.
106 INCQ R_LEN
107
108 // Prepare to check if copying length bytes will run past the end of dst or
109 // src.
110 //
111 // R_TMP0 = len(dst) - d
112 // R_TMP1 = len(src) - s
113 MOVQ R_DEND, R_TMP0
114 SUBQ R_DST, R_TMP0
115 MOVQ R_SEND, R_TMP1
116 SUBQ R_SRC, R_TMP1
117
118 // !!! Try a faster technique for short (16 or fewer bytes) copies.
119 //
120 // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
121 // goto callMemmove // Fall back on calling runtime·memmove.
122 // }
123 //
124 // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
125 // against 21 instead of 16, because it cannot assume that all of its input
126 // is contiguous in memory and so it needs to leave enough source bytes to
127 // read the next tag without refilling buffers, but Go's Decode assumes
128 // contiguousness (the src argument is a []byte).
129 CMPQ R_LEN, $16
130 JGT callMemmove
131 CMPQ R_TMP0, $16
132 JLT callMemmove
133 CMPQ R_TMP1, $16
134 JLT callMemmove
135
136 // !!! Implement the copy from src to dst as a 16-byte load and store.
137 // (Decode's documentation says that dst and src must not overlap.)
138 //
139 // This always copies 16 bytes, instead of only length bytes, but that's
140 // OK. If the input is a valid Snappy encoding then subsequent iterations
141 // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
142 // non-nil error), so the overrun will be ignored.
143 //
144 // Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
145 // 16-byte loads and stores. This technique probably wouldn't be as
146 // effective on architectures that are fussier about alignment.
147 MOVOU 0(R_SRC), X0
148 MOVOU X0, 0(R_DST)
149
150 // d += length
151 // s += length
152 ADDQ R_LEN, R_DST
153 ADDQ R_LEN, R_SRC
154 JMP loop
155
156callMemmove:
157 // if length > len(dst)-d || length > len(src)-s { etc }
158 CMPQ R_LEN, R_TMP0
159 JGT errCorrupt
160 CMPQ R_LEN, R_TMP1
161 JGT errCorrupt
162
163 // copy(dst[d:], src[s:s+length])
164 //
165 // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
166 // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
167 // three registers to the stack, to save local variables across the CALL.
168 MOVQ R_DST, 0(SP)
169 MOVQ R_SRC, 8(SP)
170 MOVQ R_LEN, 16(SP)
171 MOVQ R_DST, 24(SP)
172 MOVQ R_SRC, 32(SP)
173 MOVQ R_LEN, 40(SP)
174 MOVQ R_OFF, 48(SP)
175 CALL runtime·memmove(SB)
176
177 // Restore local variables: unspill registers from the stack and
178 // re-calculate R_DBASE-R_SEND.
179 MOVQ 24(SP), R_DST
180 MOVQ 32(SP), R_SRC
181 MOVQ 40(SP), R_LEN
182 MOVQ 48(SP), R_OFF
183 MOVQ dst_base+0(FP), R_DBASE
184 MOVQ dst_len+8(FP), R_DLEN
185 MOVQ R_DBASE, R_DEND
186 ADDQ R_DLEN, R_DEND
187 MOVQ src_base+24(FP), R_SBASE
188 MOVQ src_len+32(FP), R_SLEN
189 MOVQ R_SBASE, R_SEND
190 ADDQ R_SLEN, R_SEND
191
192 // d += length
193 // s += length
194 ADDQ R_LEN, R_DST
195 ADDQ R_LEN, R_SRC
196 JMP loop
197
198tagLit60Plus:
199 // !!! This fragment does the
200 //
201 // s += x - 58; if uint(s) > uint(len(src)) { etc }
202 //
203 // checks. In the asm version, we code it once instead of once per switch case.
204 ADDQ R_LEN, R_SRC
205 SUBQ $58, R_SRC
206 CMPQ R_SRC, R_SEND
207 JA errCorrupt
208
209 // case x == 60:
210 CMPL R_LEN, $61
211 JEQ tagLit61
212 JA tagLit62Plus
213
214 // x = uint32(src[s-1])
215 MOVBLZX -1(R_SRC), R_LEN
216 JMP doLit
217
218tagLit61:
219 // case x == 61:
220 // x = uint32(src[s-2]) | uint32(src[s-1])<<8
221 MOVWLZX -2(R_SRC), R_LEN
222 JMP doLit
223
224tagLit62Plus:
225 CMPL R_LEN, $62
226 JA tagLit63
227
228 // case x == 62:
229 // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
230 // We read one byte, safe to read one back, since we are just reading tag.
231 // x = binary.LittleEndian.Uint32(src[s-1:]) >> 8
232 MOVL -4(R_SRC), R_LEN
233 SHRL $8, R_LEN
234 JMP doLit
235
236tagLit63:
237 // case x == 63:
238 // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
239 MOVL -4(R_SRC), R_LEN
240 JMP doLit
241
242// The code above handles literal tags.
243// ----------------------------------------
244// The code below handles copy tags.
245
246tagCopy4:
247 // case tagCopy4:
248 // s += 5
249 ADDQ $5, R_SRC
250
251 // if uint(s) > uint(len(src)) { etc }
252 CMPQ R_SRC, R_SEND
253 JA errCorrupt
254
255 // length = 1 + int(src[s-5])>>2
256 SHRQ $2, R_LEN
257 INCQ R_LEN
258
259 // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
260 MOVLQZX -4(R_SRC), R_OFF
261 JMP doCopy
262
263tagCopy2:
264 // case tagCopy2:
265 // s += 3
266 ADDQ $3, R_SRC
267
268 // if uint(s) > uint(len(src)) { etc }
269 CMPQ R_SRC, R_SEND
270 JA errCorrupt
271
272 // length = 1 + int(src[s-3])>>2
273 SHRQ $2, R_LEN
274 INCQ R_LEN
275
276 // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
277 MOVWQZX -2(R_SRC), R_OFF
278 JMP doCopy
279
280tagCopy:
281 // We have a copy tag. We assume that:
282 // - R_TMP1 == src[s] & 0x03
283 // - R_LEN == src[s]
284 CMPQ R_TMP1, $2
285 JEQ tagCopy2
286 JA tagCopy4
287
288 // case tagCopy1:
289 // s += 2
290 ADDQ $2, R_SRC
291
292 // if uint(s) > uint(len(src)) { etc }
293 CMPQ R_SRC, R_SEND
294 JA errCorrupt
295
296 // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
297 // length = 4 + int(src[s-2])>>2&0x7
298 MOVBQZX -1(R_SRC), R_TMP1
299 MOVQ R_LEN, R_TMP0
300 SHRQ $2, R_LEN
301 ANDQ $0xe0, R_TMP0
302 ANDQ $7, R_LEN
303 SHLQ $3, R_TMP0
304 ADDQ $4, R_LEN
305 ORQ R_TMP1, R_TMP0
306
307 // check if repeat code, ZF set by ORQ.
308 JZ repeatCode
309
310 // This is a regular copy, transfer our temporary value to R_OFF (offset)
311 MOVQ R_TMP0, R_OFF
312 JMP doCopy
313
314// This is a repeat code.
315repeatCode:
316 // If length < 9, reuse last offset, with the length already calculated.
317 CMPQ R_LEN, $9
318 JL doCopyRepeat
319
320 // Read additional bytes for length.
321 JE repeatLen1
322
323 // Rare, so the extra branch shouldn't hurt too much.
324 CMPQ R_LEN, $10
325 JE repeatLen2
326 JMP repeatLen3
327
328// Read repeat lengths.
329repeatLen1:
330 // s ++
331 ADDQ $1, R_SRC
332
333 // if uint(s) > uint(len(src)) { etc }
334 CMPQ R_SRC, R_SEND
335 JA errCorrupt
336
337 // length = src[s-1] + 8
338 MOVBQZX -1(R_SRC), R_LEN
339 ADDL $8, R_LEN
340 JMP doCopyRepeat
341
342repeatLen2:
343 // s +=2
344 ADDQ $2, R_SRC
345
346 // if uint(s) > uint(len(src)) { etc }
347 CMPQ R_SRC, R_SEND
348 JA errCorrupt
349
350 // length = (uint32(src[s-2]) | uint32(src[s-1])<<8) + (1 << 8) + 4
351 MOVWQZX -2(R_SRC), R_LEN
352 ADDL $260, R_LEN
353 JMP doCopyRepeat
354
355repeatLen3:
356 // s +=3
357 ADDQ $3, R_SRC
358
359 // if uint(s) > uint(len(src)) { etc }
360 CMPQ R_SRC, R_SEND
361 JA errCorrupt
362
363 // length = (uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16) + (1 << 16) + 4
364 // Read one byte further back (just part of the tag, shifted out)
365 MOVL -4(R_SRC), R_LEN
366 SHRL $8, R_LEN
367 ADDL $65540, R_LEN
368 JMP doCopyRepeat
369
370doCopy:
371 // This is the end of the outer "switch", when we have a copy tag.
372 //
373 // We assume that:
374 // - R_LEN == length && R_LEN > 0
375 // - R_OFF == offset
376
377 // if d < offset { etc }
378 MOVQ R_DST, R_TMP1
379 SUBQ R_DBASE, R_TMP1
380 CMPQ R_TMP1, R_OFF
381 JLT errCorrupt
382
383 // Repeat values can skip the test above, since any offset > 0 will be in dst.
384doCopyRepeat:
385 // if offset <= 0 { etc }
386 CMPQ R_OFF, $0
387 JLE errCorrupt
388
389 // if length > len(dst)-d { etc }
390 MOVQ R_DEND, R_TMP1
391 SUBQ R_DST, R_TMP1
392 CMPQ R_LEN, R_TMP1
393 JGT errCorrupt
394
395 // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
396 //
397 // Set:
398 // - R_TMP2 = len(dst)-d
399 // - R_TMP3 = &dst[d-offset]
400 MOVQ R_DEND, R_TMP2
401 SUBQ R_DST, R_TMP2
402 MOVQ R_DST, R_TMP3
403 SUBQ R_OFF, R_TMP3
404
405 // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
406 //
407 // First, try using two 8-byte load/stores, similar to the doLit technique
408 // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
409 // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
410 // and not one 16-byte load/store, and the first store has to be before the
411 // second load, due to the overlap if offset is in the range [8, 16).
412 //
413 // if length > 16 || offset < 8 || len(dst)-d < 16 {
414 // goto slowForwardCopy
415 // }
416 // copy 16 bytes
417 // d += length
418 CMPQ R_LEN, $16
419 JGT slowForwardCopy
420 CMPQ R_OFF, $8
421 JLT slowForwardCopy
422 CMPQ R_TMP2, $16
423 JLT slowForwardCopy
424 MOVQ 0(R_TMP3), R_TMP0
425 MOVQ R_TMP0, 0(R_DST)
426 MOVQ 8(R_TMP3), R_TMP1
427 MOVQ R_TMP1, 8(R_DST)
428 ADDQ R_LEN, R_DST
429 JMP loop
430
431slowForwardCopy:
432 // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
433 // can still try 8-byte load stores, provided we can overrun up to 10 extra
434 // bytes. As above, the overrun will be fixed up by subsequent iterations
435 // of the outermost loop.
436 //
437 // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
438 // commentary says:
439 //
440 // ----
441 //
442 // The main part of this loop is a simple copy of eight bytes at a time
443 // until we've copied (at least) the requested amount of bytes. However,
444 // if d and d-offset are less than eight bytes apart (indicating a
445 // repeating pattern of length < 8), we first need to expand the pattern in
446 // order to get the correct results. For instance, if the buffer looks like
447 // this, with the eight-byte <d-offset> and <d> patterns marked as
448 // intervals:
449 //
450 // abxxxxxxxxxxxx
451 // [------] d-offset
452 // [------] d
453 //
454 // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
455 // once, after which we can move <d> two bytes without moving <d-offset>:
456 //
457 // ababxxxxxxxxxx
458 // [------] d-offset
459 // [------] d
460 //
461 // and repeat the exercise until the two no longer overlap.
462 //
463 // This allows us to do very well in the special case of one single byte
464 // repeated many times, without taking a big hit for more general cases.
465 //
466 // The worst case of extra writing past the end of the match occurs when
467 // offset == 1 and length == 1; the last copy will read from byte positions
468 // [0..7] and write to [4..11], whereas it was only supposed to write to
469 // position 1. Thus, ten excess bytes.
470 //
471 // ----
472 //
473 // That "10 byte overrun" worst case is confirmed by Go's
474 // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
475 // and finishSlowForwardCopy algorithm.
476 //
477 // if length > len(dst)-d-10 {
478 // goto verySlowForwardCopy
479 // }
480 SUBQ $10, R_TMP2
481 CMPQ R_LEN, R_TMP2
482 JGT verySlowForwardCopy
483
484 // We want to keep the offset, so we use R_TMP2 from here.
485 MOVQ R_OFF, R_TMP2
486
487makeOffsetAtLeast8:
488 // !!! As above, expand the pattern so that offset >= 8 and we can use
489 // 8-byte load/stores.
490 //
491 // for offset < 8 {
492 // copy 8 bytes from dst[d-offset:] to dst[d:]
493 // length -= offset
494 // d += offset
495 // offset += offset
496 // // The two previous lines together means that d-offset, and therefore
497 // // R_TMP3, is unchanged.
498 // }
499 CMPQ R_TMP2, $8
500 JGE fixUpSlowForwardCopy
501 MOVQ (R_TMP3), R_TMP1
502 MOVQ R_TMP1, (R_DST)
503 SUBQ R_TMP2, R_LEN
504 ADDQ R_TMP2, R_DST
505 ADDQ R_TMP2, R_TMP2
506 JMP makeOffsetAtLeast8
507
508fixUpSlowForwardCopy:
509 // !!! Add length (which might be negative now) to d (implied by R_DST being
510 // &dst[d]) so that d ends up at the right place when we jump back to the
511 // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
512 // length is positive, copying the remaining length bytes will write to the
513 // right place.
514 MOVQ R_DST, R_TMP0
515 ADDQ R_LEN, R_DST
516
517finishSlowForwardCopy:
518 // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
519 // length means that we overrun, but as above, that will be fixed up by
520 // subsequent iterations of the outermost loop.
521 CMPQ R_LEN, $0
522 JLE loop
523 MOVQ (R_TMP3), R_TMP1
524 MOVQ R_TMP1, (R_TMP0)
525 ADDQ $8, R_TMP3
526 ADDQ $8, R_TMP0
527 SUBQ $8, R_LEN
528 JMP finishSlowForwardCopy
529
530verySlowForwardCopy:
531 // verySlowForwardCopy is a simple implementation of forward copy. In C
532 // parlance, this is a do/while loop instead of a while loop, since we know
533 // that length > 0. In Go syntax:
534 //
535 // for {
536 // dst[d] = dst[d - offset]
537 // d++
538 // length--
539 // if length == 0 {
540 // break
541 // }
542 // }
543 MOVB (R_TMP3), R_TMP1
544 MOVB R_TMP1, (R_DST)
545 INCQ R_TMP3
546 INCQ R_DST
547 DECQ R_LEN
548 JNZ verySlowForwardCopy
549 JMP loop
550
551// The code above handles copy tags.
552// ----------------------------------------
553
554end:
555 // This is the end of the "for s < len(src)".
556 //
557 // if d != len(dst) { etc }
558 CMPQ R_DST, R_DEND
559 JNE errCorrupt
560
561 // return 0
562 MOVQ $0, ret+48(FP)
563 RET
564
565errCorrupt:
566 // return decodeErrCodeCorrupt
567 MOVQ $1, ret+48(FP)
568 RET
diff --git a/vendor/github.com/klauspost/compress/s2/decode_arm64.s b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
new file mode 100644
index 0000000..4b63d50
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_arm64.s
@@ -0,0 +1,574 @@
1// Copyright 2020 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build !appengine
6// +build gc
7// +build !noasm
8
9#include "textflag.h"
10
11#define R_TMP0 R2
12#define R_TMP1 R3
13#define R_LEN R4
14#define R_OFF R5
15#define R_SRC R6
16#define R_DST R7
17#define R_DBASE R8
18#define R_DLEN R9
19#define R_DEND R10
20#define R_SBASE R11
21#define R_SLEN R12
22#define R_SEND R13
23#define R_TMP2 R14
24#define R_TMP3 R15
25
26// TEST_SRC branches to errCorrupt unless R_SRC <= R_SEND.
27#define TEST_SRC() \
28 CMP R_SEND, R_SRC \
29 BGT errCorrupt
30
31// MOVD R_SRC, R_TMP1
32// SUB R_SBASE, R_TMP1, R_TMP1
33// CMP R_SLEN, R_TMP1
34// BGT errCorrupt
35
36// The asm code generally follows the pure Go code in decode_other.go, except
37// where marked with a "!!!".
38
39// func s2Decode(dst, src []byte) int
40//
41// All local variables fit into registers. The non-zero stack size is only to
42// spill registers and push args when issuing a CALL. The register allocation:
43// - R_TMP0 scratch
44// - R_TMP1 scratch
45// - R_LEN length or x
46// - R_OFF offset
47// - R_SRC &src[s]
48// - R_DST &dst[d]
49// + R_DBASE dst_base
50// + R_DLEN dst_len
51// + R_DEND dst_base + dst_len
52// + R_SBASE src_base
53// + R_SLEN src_len
54// + R_SEND src_base + src_len
55// - R_TMP2 used by doCopy
56// - R_TMP3 used by doCopy
57//
58// The registers R_DBASE-R_SEND (marked with a "+") are set at the start of the
59// function, and after a CALL returns, and are not otherwise modified.
60//
61// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
62// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
63TEXT ·s2Decode(SB), NOSPLIT, $56-64
64 // Initialize R_SRC, R_DST and R_DBASE-R_SEND.
65 MOVD dst_base+0(FP), R_DBASE
66 MOVD dst_len+8(FP), R_DLEN
67 MOVD R_DBASE, R_DST
68 MOVD R_DBASE, R_DEND
69 ADD R_DLEN, R_DEND, R_DEND
70 MOVD src_base+24(FP), R_SBASE
71 MOVD src_len+32(FP), R_SLEN
72 MOVD R_SBASE, R_SRC
73 MOVD R_SBASE, R_SEND
74 ADD R_SLEN, R_SEND, R_SEND
75 MOVD $0, R_OFF
76
77loop:
78 // for s < len(src)
79 CMP R_SEND, R_SRC
80 BEQ end
81
82 // R_LEN = uint32(src[s])
83 //
84 // switch src[s] & 0x03
85 MOVBU (R_SRC), R_LEN
86 MOVW R_LEN, R_TMP1
87 ANDW $3, R_TMP1
88 MOVW $1, R1
89 CMPW R1, R_TMP1
90 BGE tagCopy
91
92 // ----------------------------------------
93 // The code below handles literal tags.
94
95 // case tagLiteral:
96 // x := uint32(src[s] >> 2)
97 // switch
98 MOVW $60, R1
99 LSRW $2, R_LEN, R_LEN
100 CMPW R_LEN, R1
101 BLS tagLit60Plus
102
103 // case x < 60:
104 // s++
105 ADD $1, R_SRC, R_SRC
106
107doLit:
108 // This is the end of the inner "switch", when we have a literal tag.
109 //
110 // We assume that R_LEN == x and x fits in a uint32, where x is the variable
111 // used in the pure Go decode_other.go code.
112
113 // length = int(x) + 1
114 //
115 // Unlike the pure Go code, we don't need to check if length <= 0 because
116 // R_LEN can hold 64 bits, so the increment cannot overflow.
117 ADD $1, R_LEN, R_LEN
118
119 // Prepare to check if copying length bytes will run past the end of dst or
120 // src.
121 //
122 // R_TMP0 = len(dst) - d
123 // R_TMP1 = len(src) - s
124 MOVD R_DEND, R_TMP0
125 SUB R_DST, R_TMP0, R_TMP0
126 MOVD R_SEND, R_TMP1
127 SUB R_SRC, R_TMP1, R_TMP1
128
129 // !!! Try a faster technique for short (16 or fewer bytes) copies.
130 //
131 // if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
132 // goto callMemmove // Fall back on calling runtime·memmove.
133 // }
134 //
135 // The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
136 // against 21 instead of 16, because it cannot assume that all of its input
137 // is contiguous in memory and so it needs to leave enough source bytes to
138 // read the next tag without refilling buffers, but Go's Decode assumes
139 // contiguousness (the src argument is a []byte).
140 CMP $16, R_LEN
141 BGT callMemmove
142 CMP $16, R_TMP0
143 BLT callMemmove
144 CMP $16, R_TMP1
145 BLT callMemmove
146
147 // !!! Implement the copy from src to dst as a 16-byte load and store.
148 // (Decode's documentation says that dst and src must not overlap.)
149 //
150 // This always copies 16 bytes, instead of only length bytes, but that's
151 // OK. If the input is a valid Snappy encoding then subsequent iterations
152 // will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
153 // non-nil error), so the overrun will be ignored.
154 //
155 // Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
156 // 16-byte loads and stores. This technique probably wouldn't be as
157 // effective on architectures that are fussier about alignment.
158 LDP 0(R_SRC), (R_TMP2, R_TMP3)
159 STP (R_TMP2, R_TMP3), 0(R_DST)
160
161 // d += length
162 // s += length
163 ADD R_LEN, R_DST, R_DST
164 ADD R_LEN, R_SRC, R_SRC
165 B loop
166
167callMemmove:
168 // if length > len(dst)-d || length > len(src)-s { etc }
169 CMP R_TMP0, R_LEN
170 BGT errCorrupt
171 CMP R_TMP1, R_LEN
172 BGT errCorrupt
173
174 // copy(dst[d:], src[s:s+length])
175 //
176 // This means calling runtime·memmove(&dst[d], &src[s], length), so we push
177 // R_DST, R_SRC and R_LEN as arguments. Coincidentally, we also need to spill those
178 // three registers to the stack, to save local variables across the CALL.
179 MOVD R_DST, 8(RSP)
180 MOVD R_SRC, 16(RSP)
181 MOVD R_LEN, 24(RSP)
182 MOVD R_DST, 32(RSP)
183 MOVD R_SRC, 40(RSP)
184 MOVD R_LEN, 48(RSP)
185 MOVD R_OFF, 56(RSP)
186 CALL runtime·memmove(SB)
187
188 // Restore local variables: unspill registers from the stack and
189 // re-calculate R_DBASE-R_SEND.
190 MOVD 32(RSP), R_DST
191 MOVD 40(RSP), R_SRC
192 MOVD 48(RSP), R_LEN
193 MOVD 56(RSP), R_OFF
194 MOVD dst_base+0(FP), R_DBASE
195 MOVD dst_len+8(FP), R_DLEN
196 MOVD R_DBASE, R_DEND
197 ADD R_DLEN, R_DEND, R_DEND
198 MOVD src_base+24(FP), R_SBASE
199 MOVD src_len+32(FP), R_SLEN
200 MOVD R_SBASE, R_SEND
201 ADD R_SLEN, R_SEND, R_SEND
202
203 // d += length
204 // s += length
205 ADD R_LEN, R_DST, R_DST
206 ADD R_LEN, R_SRC, R_SRC
207 B loop
208
209tagLit60Plus:
210 // !!! This fragment does the
211 //
212 // s += x - 58; if uint(s) > uint(len(src)) { etc }
213 //
214 // checks. In the asm version, we code it once instead of once per switch case.
215 ADD R_LEN, R_SRC, R_SRC
216 SUB $58, R_SRC, R_SRC
217 TEST_SRC()
218
219 // case x == 60:
220 MOVW $61, R1
221 CMPW R1, R_LEN
222 BEQ tagLit61
223 BGT tagLit62Plus
224
225 // x = uint32(src[s-1])
226 MOVBU -1(R_SRC), R_LEN
227 B doLit
228
229tagLit61:
230 // case x == 61:
231 // x = uint32(src[s-2]) | uint32(src[s-1])<<8
232 MOVHU -2(R_SRC), R_LEN
233 B doLit
234
235tagLit62Plus:
236 CMPW $62, R_LEN
237 BHI tagLit63
238
239 // case x == 62:
240 // x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
241 MOVHU -3(R_SRC), R_LEN
242 MOVBU -1(R_SRC), R_TMP1
243 ORR R_TMP1<<16, R_LEN
244 B doLit
245
246tagLit63:
247 // case x == 63:
248 // x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
249 MOVWU -4(R_SRC), R_LEN
250 B doLit
251
252 // The code above handles literal tags.
253 // ----------------------------------------
254 // The code below handles copy tags.
255
256tagCopy4:
257 // case tagCopy4:
258 // s += 5
259 ADD $5, R_SRC, R_SRC
260
261 // if uint(s) > uint(len(src)) { etc }
262 MOVD R_SRC, R_TMP1
263 SUB R_SBASE, R_TMP1, R_TMP1
264 CMP R_SLEN, R_TMP1
265 BGT errCorrupt
266
267 // length = 1 + int(src[s-5])>>2
268 MOVD $1, R1
269 ADD R_LEN>>2, R1, R_LEN
270
271 // offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
272 MOVWU -4(R_SRC), R_OFF
273 B doCopy
274
275tagCopy2:
276 // case tagCopy2:
277 // s += 3
278 ADD $3, R_SRC, R_SRC
279
280 // if uint(s) > uint(len(src)) { etc }
281 TEST_SRC()
282
283 // length = 1 + int(src[s-3])>>2
284 MOVD $1, R1
285 ADD R_LEN>>2, R1, R_LEN
286
287 // offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
288 MOVHU -2(R_SRC), R_OFF
289 B doCopy
290
291tagCopy:
292 // We have a copy tag. We assume that:
293 // - R_TMP1 == src[s] & 0x03
294 // - R_LEN == src[s]
295 CMP $2, R_TMP1
296 BEQ tagCopy2
297 BGT tagCopy4
298
299 // case tagCopy1:
300 // s += 2
301 ADD $2, R_SRC, R_SRC
302
303 // if uint(s) > uint(len(src)) { etc }
304 TEST_SRC()
305
306 // offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
307 // Calculate offset in R_TMP0 in case it is a repeat.
308 MOVD R_LEN, R_TMP0
309 AND $0xe0, R_TMP0
310 MOVBU -1(R_SRC), R_TMP1
311 ORR R_TMP0<<3, R_TMP1, R_TMP0
312
313 // length = 4 + int(src[s-2])>>2&0x7
314 MOVD $7, R1
315 AND R_LEN>>2, R1, R_LEN
316 ADD $4, R_LEN, R_LEN
317
318 // check if repeat code with offset 0.
319 CMP $0, R_TMP0
320 BEQ repeatCode
321
322 // This is a regular copy, transfer our temporary value to R_OFF (offset)
323 MOVD R_TMP0, R_OFF
324 B doCopy
325
326 // This is a repeat code.
327repeatCode:
328 // If length < 9, reuse last offset, with the length already calculated.
329 CMP $9, R_LEN
330 BLT doCopyRepeat
331 BEQ repeatLen1
332 CMP $10, R_LEN
333 BEQ repeatLen2
334
335repeatLen3:
336 // s +=3
337 ADD $3, R_SRC, R_SRC
338
339 // if uint(s) > uint(len(src)) { etc }
340 TEST_SRC()
341
342 // length = uint32(src[s-3]) | (uint32(src[s-2])<<8) | (uint32(src[s-1])<<16) + 65540
343 MOVBU -1(R_SRC), R_TMP0
344 MOVHU -3(R_SRC), R_LEN
345 ORR R_TMP0<<16, R_LEN, R_LEN
346 ADD $65540, R_LEN, R_LEN
347 B doCopyRepeat
348
349repeatLen2:
350 // s +=2
351 ADD $2, R_SRC, R_SRC
352
353 // if uint(s) > uint(len(src)) { etc }
354 TEST_SRC()
355
356 // length = uint32(src[s-2]) | (uint32(src[s-1])<<8) + 260
357 MOVHU -2(R_SRC), R_LEN
358 ADD $260, R_LEN, R_LEN
359 B doCopyRepeat
360
361repeatLen1:
362 // s +=1
363 ADD $1, R_SRC, R_SRC
364
365 // if uint(s) > uint(len(src)) { etc }
366 TEST_SRC()
367
368 // length = src[s-1] + 8
369 MOVBU -1(R_SRC), R_LEN
370 ADD $8, R_LEN, R_LEN
371 B doCopyRepeat
372
373doCopy:
374 // This is the end of the outer "switch", when we have a copy tag.
375 //
376 // We assume that:
377 // - R_LEN == length && R_LEN > 0
378 // - R_OFF == offset
379
380 // if d < offset { etc }
381 MOVD R_DST, R_TMP1
382 SUB R_DBASE, R_TMP1, R_TMP1
383 CMP R_OFF, R_TMP1
384 BLT errCorrupt
385
386 // Repeat values can skip the test above, since any offset > 0 will be in dst.
387doCopyRepeat:
388
389 // if offset <= 0 { etc }
390 CMP $0, R_OFF
391 BLE errCorrupt
392
393 // if length > len(dst)-d { etc }
394 MOVD R_DEND, R_TMP1
395 SUB R_DST, R_TMP1, R_TMP1
396 CMP R_TMP1, R_LEN
397 BGT errCorrupt
398
399 // forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
400 //
401 // Set:
402 // - R_TMP2 = len(dst)-d
403 // - R_TMP3 = &dst[d-offset]
404 MOVD R_DEND, R_TMP2
405 SUB R_DST, R_TMP2, R_TMP2
406 MOVD R_DST, R_TMP3
407 SUB R_OFF, R_TMP3, R_TMP3
408
409 // !!! Try a faster technique for short (16 or fewer bytes) forward copies.
410 //
411 // First, try using two 8-byte load/stores, similar to the doLit technique
412 // above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
413 // still OK if offset >= 8. Note that this has to be two 8-byte load/stores
414 // and not one 16-byte load/store, and the first store has to be before the
415 // second load, due to the overlap if offset is in the range [8, 16).
416 //
417 // if length > 16 || offset < 8 || len(dst)-d < 16 {
418 // goto slowForwardCopy
419 // }
420 // copy 16 bytes
421 // d += length
422 CMP $16, R_LEN
423 BGT slowForwardCopy
424 CMP $8, R_OFF
425 BLT slowForwardCopy
426 CMP $16, R_TMP2
427 BLT slowForwardCopy
428 MOVD 0(R_TMP3), R_TMP0
429 MOVD R_TMP0, 0(R_DST)
430 MOVD 8(R_TMP3), R_TMP1
431 MOVD R_TMP1, 8(R_DST)
432 ADD R_LEN, R_DST, R_DST
433 B loop
434
435slowForwardCopy:
436 // !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
437 // can still try 8-byte load stores, provided we can overrun up to 10 extra
438 // bytes. As above, the overrun will be fixed up by subsequent iterations
439 // of the outermost loop.
440 //
441 // The C++ snappy code calls this technique IncrementalCopyFastPath. Its
442 // commentary says:
443 //
444 // ----
445 //
446 // The main part of this loop is a simple copy of eight bytes at a time
447 // until we've copied (at least) the requested amount of bytes. However,
448 // if d and d-offset are less than eight bytes apart (indicating a
449 // repeating pattern of length < 8), we first need to expand the pattern in
450 // order to get the correct results. For instance, if the buffer looks like
451 // this, with the eight-byte <d-offset> and <d> patterns marked as
452 // intervals:
453 //
454 // abxxxxxxxxxxxx
455 // [------] d-offset
456 // [------] d
457 //
458 // a single eight-byte copy from <d-offset> to <d> will repeat the pattern
459 // once, after which we can move <d> two bytes without moving <d-offset>:
460 //
461 // ababxxxxxxxxxx
462 // [------] d-offset
463 // [------] d
464 //
465 // and repeat the exercise until the two no longer overlap.
466 //
467 // This allows us to do very well in the special case of one single byte
468 // repeated many times, without taking a big hit for more general cases.
469 //
470 // The worst case of extra writing past the end of the match occurs when
471 // offset == 1 and length == 1; the last copy will read from byte positions
472 // [0..7] and write to [4..11], whereas it was only supposed to write to
473 // position 1. Thus, ten excess bytes.
474 //
475 // ----
476 //
477 // That "10 byte overrun" worst case is confirmed by Go's
478 // TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
479 // and finishSlowForwardCopy algorithm.
480 //
481 // if length > len(dst)-d-10 {
482 // goto verySlowForwardCopy
483 // }
484 SUB $10, R_TMP2, R_TMP2
485 CMP R_TMP2, R_LEN
486 BGT verySlowForwardCopy
487
488 // We want to keep the offset, so we use R_TMP2 from here.
489 MOVD R_OFF, R_TMP2
490
491makeOffsetAtLeast8:
492 // !!! As above, expand the pattern so that offset >= 8 and we can use
493 // 8-byte load/stores.
494 //
495 // for offset < 8 {
496 // copy 8 bytes from dst[d-offset:] to dst[d:]
497 // length -= offset
498 // d += offset
499 // offset += offset
500 // // The two previous lines together means that d-offset, and therefore
501 // // R_TMP3, is unchanged.
502 // }
503 CMP $8, R_TMP2
504 BGE fixUpSlowForwardCopy
505 MOVD (R_TMP3), R_TMP1
506 MOVD R_TMP1, (R_DST)
507 SUB R_TMP2, R_LEN, R_LEN
508 ADD R_TMP2, R_DST, R_DST
509 ADD R_TMP2, R_TMP2, R_TMP2
510 B makeOffsetAtLeast8
511
512fixUpSlowForwardCopy:
513 // !!! Add length (which might be negative now) to d (implied by R_DST being
514 // &dst[d]) so that d ends up at the right place when we jump back to the
515 // top of the loop. Before we do that, though, we save R_DST to R_TMP0 so that, if
516 // length is positive, copying the remaining length bytes will write to the
517 // right place.
518 MOVD R_DST, R_TMP0
519 ADD R_LEN, R_DST, R_DST
520
521finishSlowForwardCopy:
522 // !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
523 // length means that we overrun, but as above, that will be fixed up by
524 // subsequent iterations of the outermost loop.
525 MOVD $0, R1
526 CMP R1, R_LEN
527 BLE loop
528 MOVD (R_TMP3), R_TMP1
529 MOVD R_TMP1, (R_TMP0)
530 ADD $8, R_TMP3, R_TMP3
531 ADD $8, R_TMP0, R_TMP0
532 SUB $8, R_LEN, R_LEN
533 B finishSlowForwardCopy
534
535verySlowForwardCopy:
536 // verySlowForwardCopy is a simple implementation of forward copy. In C
537 // parlance, this is a do/while loop instead of a while loop, since we know
538 // that length > 0. In Go syntax:
539 //
540 // for {
541 // dst[d] = dst[d - offset]
542 // d++
543 // length--
544 // if length == 0 {
545 // break
546 // }
547 // }
548 MOVB (R_TMP3), R_TMP1
549 MOVB R_TMP1, (R_DST)
550 ADD $1, R_TMP3, R_TMP3
551 ADD $1, R_DST, R_DST
552 SUB $1, R_LEN, R_LEN
553 CBNZ R_LEN, verySlowForwardCopy
554 B loop
555
556 // The code above handles copy tags.
557 // ----------------------------------------
558
559end:
560 // This is the end of the "for s < len(src)".
561 //
562 // if d != len(dst) { etc }
563 CMP R_DEND, R_DST
564 BNE errCorrupt
565
566 // return 0
567 MOVD $0, ret+48(FP)
568 RET
569
570errCorrupt:
571 // return decodeErrCodeCorrupt
572 MOVD $1, R_TMP0
573 MOVD R_TMP0, ret+48(FP)
574 RET
diff --git a/vendor/github.com/klauspost/compress/s2/decode_asm.go b/vendor/github.com/klauspost/compress/s2/decode_asm.go
new file mode 100644
index 0000000..cb3576e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_asm.go
@@ -0,0 +1,17 @@
1// Copyright 2016 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6//go:build (amd64 || arm64) && !appengine && gc && !noasm
7// +build amd64 arm64
8// +build !appengine
9// +build gc
10// +build !noasm
11
12package s2
13
14// s2Decode has the same semantics as in decode_other.go.
15//
16//go:noescape
17func s2Decode(dst, src []byte) int
diff --git a/vendor/github.com/klauspost/compress/s2/decode_other.go b/vendor/github.com/klauspost/compress/s2/decode_other.go
new file mode 100644
index 0000000..2cb55c2
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/decode_other.go
@@ -0,0 +1,292 @@
1// Copyright 2016 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6//go:build (!amd64 && !arm64) || appengine || !gc || noasm
7// +build !amd64,!arm64 appengine !gc noasm
8
9package s2
10
11import (
12 "fmt"
13 "strconv"
14)
15
16// s2Decode writes the decoding of src to dst. It assumes that the varint-encoded
17// length of the decompressed bytes has already been read, and that len(dst)
18// equals that length.
19//
20// It returns 0 on success or a decodeErrCodeXxx error code on failure.
21func s2Decode(dst, src []byte) int {
22 const debug = false
23 if debug {
24 fmt.Println("Starting decode, dst len:", len(dst))
25 }
26 var d, s, length int
27 offset := 0
28
29 // As long as we can read at least 5 bytes...
30 for s < len(src)-5 {
31 // Removing bounds checks is SLOWER when doing
32 // in := src[s:s+5]
33 // Checked on Go 1.18
34 switch src[s] & 0x03 {
35 case tagLiteral:
36 x := uint32(src[s] >> 2)
37 switch {
38 case x < 60:
39 s++
40 case x == 60:
41 s += 2
42 x = uint32(src[s-1])
43 case x == 61:
44 in := src[s : s+3]
45 x = uint32(in[1]) | uint32(in[2])<<8
46 s += 3
47 case x == 62:
48 in := src[s : s+4]
49 // Load as 32 bit and shift down.
50 x = uint32(in[0]) | uint32(in[1])<<8 | uint32(in[2])<<16 | uint32(in[3])<<24
51 x >>= 8
52 s += 4
53 case x == 63:
54 in := src[s : s+5]
55 x = uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24
56 s += 5
57 }
58 length = int(x) + 1
59 if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
60 if debug {
61 fmt.Println("corrupt: lit size", length)
62 }
63 return decodeErrCodeCorrupt
64 }
65 if debug {
66 fmt.Println("literals, length:", length, "d-after:", d+length)
67 }
68
69 copy(dst[d:], src[s:s+length])
70 d += length
71 s += length
72 continue
73
74 case tagCopy1:
75 s += 2
76 toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
77 length = int(src[s-2]) >> 2 & 0x7
78 if toffset == 0 {
79 if debug {
80 fmt.Print("(repeat) ")
81 }
82 // keep last offset
83 switch length {
84 case 5:
85 length = int(src[s]) + 4
86 s += 1
87 case 6:
88 in := src[s : s+2]
89 length = int(uint32(in[0])|(uint32(in[1])<<8)) + (1 << 8)
90 s += 2
91 case 7:
92 in := src[s : s+3]
93 length = int((uint32(in[2])<<16)|(uint32(in[1])<<8)|uint32(in[0])) + (1 << 16)
94 s += 3
95 default: // 0-> 4
96 }
97 } else {
98 offset = toffset
99 }
100 length += 4
101 case tagCopy2:
102 in := src[s : s+3]
103 offset = int(uint32(in[1]) | uint32(in[2])<<8)
104 length = 1 + int(in[0])>>2
105 s += 3
106
107 case tagCopy4:
108 in := src[s : s+5]
109 offset = int(uint32(in[1]) | uint32(in[2])<<8 | uint32(in[3])<<16 | uint32(in[4])<<24)
110 length = 1 + int(in[0])>>2
111 s += 5
112 }
113
114 if offset <= 0 || d < offset || length > len(dst)-d {
115 if debug {
116 fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d)
117 }
118
119 return decodeErrCodeCorrupt
120 }
121
122 if debug {
123 fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
124 }
125
126 // Copy from an earlier sub-slice of dst to a later sub-slice.
127 // If no overlap, use the built-in copy:
128 if offset > length {
129 copy(dst[d:d+length], dst[d-offset:])
130 d += length
131 continue
132 }
133
134 // Unlike the built-in copy function, this byte-by-byte copy always runs
135 // forwards, even if the slices overlap. Conceptually, this is:
136 //
137 // d += forwardCopy(dst[d:d+length], dst[d-offset:])
138 //
139 // We align the slices into a and b and show the compiler they are the same size.
140 // This allows the loop to run without bounds checks.
141 a := dst[d : d+length]
142 b := dst[d-offset:]
143 b = b[:len(a)]
144 for i := range a {
145 a[i] = b[i]
146 }
147 d += length
148 }
149
150 // Remaining with extra checks...
151 for s < len(src) {
152 switch src[s] & 0x03 {
153 case tagLiteral:
154 x := uint32(src[s] >> 2)
155 switch {
156 case x < 60:
157 s++
158 case x == 60:
159 s += 2
160 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
161 return decodeErrCodeCorrupt
162 }
163 x = uint32(src[s-1])
164 case x == 61:
165 s += 3
166 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
167 return decodeErrCodeCorrupt
168 }
169 x = uint32(src[s-2]) | uint32(src[s-1])<<8
170 case x == 62:
171 s += 4
172 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
173 return decodeErrCodeCorrupt
174 }
175 x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
176 case x == 63:
177 s += 5
178 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
179 return decodeErrCodeCorrupt
180 }
181 x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
182 }
183 length = int(x) + 1
184 if length > len(dst)-d || length > len(src)-s || (strconv.IntSize == 32 && length <= 0) {
185 if debug {
186 fmt.Println("corrupt: lit size", length)
187 }
188 return decodeErrCodeCorrupt
189 }
190 if debug {
191 fmt.Println("literals, length:", length, "d-after:", d+length)
192 }
193
194 copy(dst[d:], src[s:s+length])
195 d += length
196 s += length
197 continue
198
199 case tagCopy1:
200 s += 2
201 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
202 return decodeErrCodeCorrupt
203 }
204 length = int(src[s-2]) >> 2 & 0x7
205 toffset := int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
206 if toffset == 0 {
207 if debug {
208 fmt.Print("(repeat) ")
209 }
210 // keep last offset
211 switch length {
212 case 5:
213 s += 1
214 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
215 return decodeErrCodeCorrupt
216 }
217 length = int(uint32(src[s-1])) + 4
218 case 6:
219 s += 2
220 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
221 return decodeErrCodeCorrupt
222 }
223 length = int(uint32(src[s-2])|(uint32(src[s-1])<<8)) + (1 << 8)
224 case 7:
225 s += 3
226 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
227 return decodeErrCodeCorrupt
228 }
229 length = int(uint32(src[s-3])|(uint32(src[s-2])<<8)|(uint32(src[s-1])<<16)) + (1 << 16)
230 default: // 0-> 4
231 }
232 } else {
233 offset = toffset
234 }
235 length += 4
236 case tagCopy2:
237 s += 3
238 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
239 return decodeErrCodeCorrupt
240 }
241 length = 1 + int(src[s-3])>>2
242 offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
243
244 case tagCopy4:
245 s += 5
246 if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
247 return decodeErrCodeCorrupt
248 }
249 length = 1 + int(src[s-5])>>2
250 offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
251 }
252
253 if offset <= 0 || d < offset || length > len(dst)-d {
254 if debug {
255 fmt.Println("corrupt: match, length", length, "offset:", offset, "dst avail:", len(dst)-d, "dst pos:", d)
256 }
257 return decodeErrCodeCorrupt
258 }
259
260 if debug {
261 fmt.Println("copy, length:", length, "offset:", offset, "d-after:", d+length)
262 }
263
264 // Copy from an earlier sub-slice of dst to a later sub-slice.
265 // If no overlap, use the built-in copy:
266 if offset > length {
267 copy(dst[d:d+length], dst[d-offset:])
268 d += length
269 continue
270 }
271
272 // Unlike the built-in copy function, this byte-by-byte copy always runs
273 // forwards, even if the slices overlap. Conceptually, this is:
274 //
275 // d += forwardCopy(dst[d:d+length], dst[d-offset:])
276 //
277 // We align the slices into a and b and show the compiler they are the same size.
278 // This allows the loop to run without bounds checks.
279 a := dst[d : d+length]
280 b := dst[d-offset:]
281 b = b[:len(a)]
282 for i := range a {
283 a[i] = b[i]
284 }
285 d += length
286 }
287
288 if d != len(dst) {
289 return decodeErrCodeCorrupt
290 }
291 return 0
292}
diff --git a/vendor/github.com/klauspost/compress/s2/dict.go b/vendor/github.com/klauspost/compress/s2/dict.go
new file mode 100644
index 0000000..f125ad0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/dict.go
@@ -0,0 +1,350 @@
1// Copyright (c) 2022+ Klaus Post. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package s2
6
7import (
8 "bytes"
9 "encoding/binary"
10 "sync"
11)
12
const (
	// MinDictSize is the minimum dictionary size when repeat has been read.
	MinDictSize = 16

	// MaxDictSize is the maximum dictionary size when repeat has been read.
	MaxDictSize = 65536

	// MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
	MaxDictSrcOffset = 65535
)

// Dict contains a dictionary that can be used for encoding and decoding s2.
type Dict struct {
	dict   []byte
	repeat int // Repeat as index of dict

	// One sync.Once per encoder level guards the lazily built tables below.
	fast, better, best sync.Once
	fastTable          *[1 << 14]uint16

	betterTableShort *[1 << 14]uint16
	betterTableLong  *[1 << 17]uint16

	bestTableShort *[1 << 16]uint32
	bestTableLong  *[1 << 19]uint32
}

// NewDict will read a serialized dictionary: a uvarint repeat index followed
// by the dictionary content.
//
// It returns nil if the dictionary is invalid, that is when:
//   - dict is empty or does not start with a valid uvarint,
//   - the content after the uvarint is shorter than MinDictSize or longer
//     than MaxDictSize,
//   - the repeat index is larger than the content length.
//
// The returned Dict may reference the memory of dict; the content is only
// copied when the slice has less than 16 bytes of spare capacity.
func NewDict(dict []byte) *Dict {
	if len(dict) == 0 {
		return nil
	}
	// Repeat is the first value of the dict
	r, n := binary.Uvarint(dict)
	if n <= 0 {
		return nil
	}
	dict = dict[n:]
	if len(dict) < MinDictSize || len(dict) > MaxDictSize {
		return nil
	}
	// Validate in the unsigned domain before converting: int(r) would wrap to
	// a negative (or truncated) value for large varints and slip past a
	// signed comparison against len(dict).
	if r > uint64(len(dict)) {
		return nil
	}
	d := &Dict{dict: dict, repeat: int(r)}
	// Keep at least 16 bytes of spare capacity behind the content.
	if cap(d.dict) < len(d.dict)+16 {
		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}
	return d
}
65
66// Bytes will return a serialized version of the dictionary.
67// The output can be sent to NewDict.
68func (d *Dict) Bytes() []byte {
69 dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
70 return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
71}
72
73// MakeDict will create a dictionary.
74// 'data' must be at least MinDictSize.
75// If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
76// If searchStart is set the start repeat value will be set to the last
77// match of this content.
78// If no matches are found, it will attempt to find shorter matches.
79// This content should match the typical start of a block.
80// If at least 4 bytes cannot be matched, repeat is set to start of block.
81func MakeDict(data []byte, searchStart []byte) *Dict {
82 if len(data) == 0 {
83 return nil
84 }
85 if len(data) > MaxDictSize {
86 data = data[len(data)-MaxDictSize:]
87 }
88 var d Dict
89 dict := data
90 d.dict = dict
91 if cap(d.dict) < len(d.dict)+16 {
92 d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
93 }
94 if len(dict) < MinDictSize {
95 return nil
96 }
97
98 // Find the longest match possible, last entry if multiple.
99 for s := len(searchStart); s > 4; s-- {
100 if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
101 d.repeat = idx
102 break
103 }
104 }
105
106 return &d
107}
108
109// MakeDictManual will create a dictionary.
110// 'data' must be at least MinDictSize and less than or equal to MaxDictSize.
111// A manual first repeat index into data must be provided.
112// It must be less than len(data)-8.
113func MakeDictManual(data []byte, firstIdx uint16) *Dict {
114 if len(data) < MinDictSize || int(firstIdx) >= len(data)-8 || len(data) > MaxDictSize {
115 return nil
116 }
117 var d Dict
118 dict := data
119 d.dict = dict
120 if cap(d.dict) < len(d.dict)+16 {
121 d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
122 }
123
124 d.repeat = int(firstIdx)
125 return &d
126}
127
128// Encode returns the encoded form of src. The returned slice may be a sub-
129// slice of dst if dst was large enough to hold the entire encoded block.
130// Otherwise, a newly allocated slice will be returned.
131//
132// The dst and src must not overlap. It is valid to pass a nil dst.
133//
134// The blocks will require the same amount of memory to decode as encoding,
135// and does not make for concurrent decoding.
136// Also note that blocks do not contain CRC information, so corruption may be undetected.
137//
138// If you need to encode larger amounts of data, consider using
139// the streaming interface which gives all of these features.
140func (d *Dict) Encode(dst, src []byte) []byte {
141 if n := MaxEncodedLen(len(src)); n < 0 {
142 panic(ErrTooLarge)
143 } else if cap(dst) < n {
144 dst = make([]byte, n)
145 } else {
146 dst = dst[:n]
147 }
148
149 // The block starts with the varint-encoded length of the decompressed bytes.
150 dstP := binary.PutUvarint(dst, uint64(len(src)))
151
152 if len(src) == 0 {
153 return dst[:dstP]
154 }
155 if len(src) < minNonLiteralBlockSize {
156 dstP += emitLiteral(dst[dstP:], src)
157 return dst[:dstP]
158 }
159 n := encodeBlockDictGo(dst[dstP:], src, d)
160 if n > 0 {
161 dstP += n
162 return dst[:dstP]
163 }
164 // Not compressible
165 dstP += emitLiteral(dst[dstP:], src)
166 return dst[:dstP]
167}
168
169// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
170// slice of dst if dst was large enough to hold the entire encoded block.
171// Otherwise, a newly allocated slice will be returned.
172//
173// EncodeBetter compresses better than Encode but typically with a
174// 10-40% speed decrease on both compression and decompression.
175//
176// The dst and src must not overlap. It is valid to pass a nil dst.
177//
178// The blocks will require the same amount of memory to decode as encoding,
179// and does not make for concurrent decoding.
180// Also note that blocks do not contain CRC information, so corruption may be undetected.
181//
182// If you need to encode larger amounts of data, consider using
183// the streaming interface which gives all of these features.
184func (d *Dict) EncodeBetter(dst, src []byte) []byte {
185 if n := MaxEncodedLen(len(src)); n < 0 {
186 panic(ErrTooLarge)
187 } else if len(dst) < n {
188 dst = make([]byte, n)
189 }
190
191 // The block starts with the varint-encoded length of the decompressed bytes.
192 dstP := binary.PutUvarint(dst, uint64(len(src)))
193
194 if len(src) == 0 {
195 return dst[:dstP]
196 }
197 if len(src) < minNonLiteralBlockSize {
198 dstP += emitLiteral(dst[dstP:], src)
199 return dst[:dstP]
200 }
201 n := encodeBlockBetterDict(dst[dstP:], src, d)
202 if n > 0 {
203 dstP += n
204 return dst[:dstP]
205 }
206 // Not compressible
207 dstP += emitLiteral(dst[dstP:], src)
208 return dst[:dstP]
209}
210
211// EncodeBest returns the encoded form of src. The returned slice may be a sub-
212// slice of dst if dst was large enough to hold the entire encoded block.
213// Otherwise, a newly allocated slice will be returned.
214//
215// EncodeBest compresses as good as reasonably possible but with a
216// big speed decrease.
217//
218// The dst and src must not overlap. It is valid to pass a nil dst.
219//
220// The blocks will require the same amount of memory to decode as encoding,
221// and does not make for concurrent decoding.
222// Also note that blocks do not contain CRC information, so corruption may be undetected.
223//
224// If you need to encode larger amounts of data, consider using
225// the streaming interface which gives all of these features.
226func (d *Dict) EncodeBest(dst, src []byte) []byte {
227 if n := MaxEncodedLen(len(src)); n < 0 {
228 panic(ErrTooLarge)
229 } else if len(dst) < n {
230 dst = make([]byte, n)
231 }
232
233 // The block starts with the varint-encoded length of the decompressed bytes.
234 dstP := binary.PutUvarint(dst, uint64(len(src)))
235
236 if len(src) == 0 {
237 return dst[:dstP]
238 }
239 if len(src) < minNonLiteralBlockSize {
240 dstP += emitLiteral(dst[dstP:], src)
241 return dst[:dstP]
242 }
243 n := encodeBlockBest(dst[dstP:], src, d)
244 if n > 0 {
245 dstP += n
246 return dst[:dstP]
247 }
248 // Not compressible
249 dstP += emitLiteral(dst[dstP:], src)
250 return dst[:dstP]
251}
252
253// Decode returns the decoded form of src. The returned slice may be a sub-
254// slice of dst if dst was large enough to hold the entire decoded block.
255// Otherwise, a newly allocated slice will be returned.
256//
257// The dst and src must not overlap. It is valid to pass a nil dst.
258func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
259 dLen, s, err := decodedLen(src)
260 if err != nil {
261 return nil, err
262 }
263 if dLen <= cap(dst) {
264 dst = dst[:dLen]
265 } else {
266 dst = make([]byte, dLen)
267 }
268 if s2DecodeDict(dst, src[s:], d) != 0 {
269 return nil, ErrCorrupt
270 }
271 return dst, nil
272}
273
274func (d *Dict) initFast() {
275 d.fast.Do(func() {
276 const (
277 tableBits = 14
278 maxTableSize = 1 << tableBits
279 )
280
281 var table [maxTableSize]uint16
282 // We stop so any entry of length 8 can always be read.
283 for i := 0; i < len(d.dict)-8-2; i += 3 {
284 x0 := load64(d.dict, i)
285 h0 := hash6(x0, tableBits)
286 h1 := hash6(x0>>8, tableBits)
287 h2 := hash6(x0>>16, tableBits)
288 table[h0] = uint16(i)
289 table[h1] = uint16(i + 1)
290 table[h2] = uint16(i + 2)
291 }
292 d.fastTable = &table
293 })
294}
295
296func (d *Dict) initBetter() {
297 d.better.Do(func() {
298 const (
299 // Long hash matches.
300 lTableBits = 17
301 maxLTableSize = 1 << lTableBits
302
303 // Short hash matches.
304 sTableBits = 14
305 maxSTableSize = 1 << sTableBits
306 )
307
308 var lTable [maxLTableSize]uint16
309 var sTable [maxSTableSize]uint16
310
311 // We stop so any entry of length 8 can always be read.
312 for i := 0; i < len(d.dict)-8; i++ {
313 cv := load64(d.dict, i)
314 lTable[hash7(cv, lTableBits)] = uint16(i)
315 sTable[hash4(cv, sTableBits)] = uint16(i)
316 }
317 d.betterTableShort = &sTable
318 d.betterTableLong = &lTable
319 })
320}
321
322func (d *Dict) initBest() {
323 d.best.Do(func() {
324 const (
325 // Long hash matches.
326 lTableBits = 19
327 maxLTableSize = 1 << lTableBits
328
329 // Short hash matches.
330 sTableBits = 16
331 maxSTableSize = 1 << sTableBits
332 )
333
334 var lTable [maxLTableSize]uint32
335 var sTable [maxSTableSize]uint32
336
337 // We stop so any entry of length 8 can always be read.
338 for i := 0; i < len(d.dict)-8; i++ {
339 cv := load64(d.dict, i)
340 hashL := hash8(cv, lTableBits)
341 hashS := hash4(cv, sTableBits)
342 candidateL := lTable[hashL]
343 candidateS := sTable[hashS]
344 lTable[hashL] = uint32(i) | candidateL<<16
345 sTable[hashS] = uint32(i) | candidateS<<16
346 }
347 d.bestTableShort = &sTable
348 d.bestTableLong = &lTable
349 })
350}
diff --git a/vendor/github.com/klauspost/compress/s2/encode.go b/vendor/github.com/klauspost/compress/s2/encode.go
new file mode 100644
index 0000000..0c9088a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode.go
@@ -0,0 +1,393 @@
1// Copyright 2011 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6package s2
7
8import (
9 "encoding/binary"
10 "math"
11 "math/bits"
12)
13
14// Encode returns the encoded form of src. The returned slice may be a sub-
15// slice of dst if dst was large enough to hold the entire encoded block.
16// Otherwise, a newly allocated slice will be returned.
17//
18// The dst and src must not overlap. It is valid to pass a nil dst.
19//
20// The blocks will require the same amount of memory to decode as encoding,
21// and does not make for concurrent decoding.
22// Also note that blocks do not contain CRC information, so corruption may be undetected.
23//
24// If you need to encode larger amounts of data, consider using
25// the streaming interface which gives all of these features.
26func Encode(dst, src []byte) []byte {
27 if n := MaxEncodedLen(len(src)); n < 0 {
28 panic(ErrTooLarge)
29 } else if cap(dst) < n {
30 dst = make([]byte, n)
31 } else {
32 dst = dst[:n]
33 }
34
35 // The block starts with the varint-encoded length of the decompressed bytes.
36 d := binary.PutUvarint(dst, uint64(len(src)))
37
38 if len(src) == 0 {
39 return dst[:d]
40 }
41 if len(src) < minNonLiteralBlockSize {
42 d += emitLiteral(dst[d:], src)
43 return dst[:d]
44 }
45 n := encodeBlock(dst[d:], src)
46 if n > 0 {
47 d += n
48 return dst[:d]
49 }
50 // Not compressible
51 d += emitLiteral(dst[d:], src)
52 return dst[:d]
53}
54
55// EstimateBlockSize will perform a very fast compression
56// without outputting the result and return the compressed output size.
57// The function returns -1 if no improvement could be achieved.
58// Using actual compression will most often produce better compression than the estimate.
59func EstimateBlockSize(src []byte) (d int) {
60 if len(src) <= inputMargin || int64(len(src)) > 0xffffffff {
61 return -1
62 }
63 if len(src) <= 1024 {
64 d = calcBlockSizeSmall(src)
65 } else {
66 d = calcBlockSize(src)
67 }
68
69 if d == 0 {
70 return -1
71 }
72 // Size of the varint encoded block size.
73 d += (bits.Len64(uint64(len(src))) + 7) / 7
74
75 if d >= len(src) {
76 return -1
77 }
78 return d
79}
80
81// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
82// slice of dst if dst was large enough to hold the entire encoded block.
83// Otherwise, a newly allocated slice will be returned.
84//
85// EncodeBetter compresses better than Encode but typically with a
86// 10-40% speed decrease on both compression and decompression.
87//
88// The dst and src must not overlap. It is valid to pass a nil dst.
89//
90// The blocks will require the same amount of memory to decode as encoding,
91// and does not make for concurrent decoding.
92// Also note that blocks do not contain CRC information, so corruption may be undetected.
93//
94// If you need to encode larger amounts of data, consider using
95// the streaming interface which gives all of these features.
96func EncodeBetter(dst, src []byte) []byte {
97 if n := MaxEncodedLen(len(src)); n < 0 {
98 panic(ErrTooLarge)
99 } else if len(dst) < n {
100 dst = make([]byte, n)
101 }
102
103 // The block starts with the varint-encoded length of the decompressed bytes.
104 d := binary.PutUvarint(dst, uint64(len(src)))
105
106 if len(src) == 0 {
107 return dst[:d]
108 }
109 if len(src) < minNonLiteralBlockSize {
110 d += emitLiteral(dst[d:], src)
111 return dst[:d]
112 }
113 n := encodeBlockBetter(dst[d:], src)
114 if n > 0 {
115 d += n
116 return dst[:d]
117 }
118 // Not compressible
119 d += emitLiteral(dst[d:], src)
120 return dst[:d]
121}
122
123// EncodeBest returns the encoded form of src. The returned slice may be a sub-
124// slice of dst if dst was large enough to hold the entire encoded block.
125// Otherwise, a newly allocated slice will be returned.
126//
127// EncodeBest compresses as good as reasonably possible but with a
128// big speed decrease.
129//
130// The dst and src must not overlap. It is valid to pass a nil dst.
131//
132// The blocks will require the same amount of memory to decode as encoding,
133// and does not make for concurrent decoding.
134// Also note that blocks do not contain CRC information, so corruption may be undetected.
135//
136// If you need to encode larger amounts of data, consider using
137// the streaming interface which gives all of these features.
138func EncodeBest(dst, src []byte) []byte {
139 if n := MaxEncodedLen(len(src)); n < 0 {
140 panic(ErrTooLarge)
141 } else if len(dst) < n {
142 dst = make([]byte, n)
143 }
144
145 // The block starts with the varint-encoded length of the decompressed bytes.
146 d := binary.PutUvarint(dst, uint64(len(src)))
147
148 if len(src) == 0 {
149 return dst[:d]
150 }
151 if len(src) < minNonLiteralBlockSize {
152 d += emitLiteral(dst[d:], src)
153 return dst[:d]
154 }
155 n := encodeBlockBest(dst[d:], src, nil)
156 if n > 0 {
157 d += n
158 return dst[:d]
159 }
160 // Not compressible
161 d += emitLiteral(dst[d:], src)
162 return dst[:d]
163}
164
165// EncodeSnappy returns the encoded form of src. The returned slice may be a sub-
166// slice of dst if dst was large enough to hold the entire encoded block.
167// Otherwise, a newly allocated slice will be returned.
168//
169// The output is Snappy compatible and will likely decompress faster.
170//
171// The dst and src must not overlap. It is valid to pass a nil dst.
172//
173// The blocks will require the same amount of memory to decode as encoding,
174// and does not make for concurrent decoding.
175// Also note that blocks do not contain CRC information, so corruption may be undetected.
176//
177// If you need to encode larger amounts of data, consider using
178// the streaming interface which gives all of these features.
179func EncodeSnappy(dst, src []byte) []byte {
180 if n := MaxEncodedLen(len(src)); n < 0 {
181 panic(ErrTooLarge)
182 } else if cap(dst) < n {
183 dst = make([]byte, n)
184 } else {
185 dst = dst[:n]
186 }
187
188 // The block starts with the varint-encoded length of the decompressed bytes.
189 d := binary.PutUvarint(dst, uint64(len(src)))
190
191 if len(src) == 0 {
192 return dst[:d]
193 }
194 if len(src) < minNonLiteralBlockSize {
195 d += emitLiteral(dst[d:], src)
196 return dst[:d]
197 }
198
199 n := encodeBlockSnappy(dst[d:], src)
200 if n > 0 {
201 d += n
202 return dst[:d]
203 }
204 // Not compressible
205 d += emitLiteral(dst[d:], src)
206 return dst[:d]
207}
208
209// EncodeSnappyBetter returns the encoded form of src. The returned slice may be a sub-
210// slice of dst if dst was large enough to hold the entire encoded block.
211// Otherwise, a newly allocated slice will be returned.
212//
213// The output is Snappy compatible and will likely decompress faster.
214//
215// The dst and src must not overlap. It is valid to pass a nil dst.
216//
217// The blocks will require the same amount of memory to decode as encoding,
218// and does not make for concurrent decoding.
219// Also note that blocks do not contain CRC information, so corruption may be undetected.
220//
221// If you need to encode larger amounts of data, consider using
222// the streaming interface which gives all of these features.
223func EncodeSnappyBetter(dst, src []byte) []byte {
224 if n := MaxEncodedLen(len(src)); n < 0 {
225 panic(ErrTooLarge)
226 } else if cap(dst) < n {
227 dst = make([]byte, n)
228 } else {
229 dst = dst[:n]
230 }
231
232 // The block starts with the varint-encoded length of the decompressed bytes.
233 d := binary.PutUvarint(dst, uint64(len(src)))
234
235 if len(src) == 0 {
236 return dst[:d]
237 }
238 if len(src) < minNonLiteralBlockSize {
239 d += emitLiteral(dst[d:], src)
240 return dst[:d]
241 }
242
243 n := encodeBlockBetterSnappy(dst[d:], src)
244 if n > 0 {
245 d += n
246 return dst[:d]
247 }
248 // Not compressible
249 d += emitLiteral(dst[d:], src)
250 return dst[:d]
251}
252
253// EncodeSnappyBest returns the encoded form of src. The returned slice may be a sub-
254// slice of dst if dst was large enough to hold the entire encoded block.
255// Otherwise, a newly allocated slice will be returned.
256//
257// The output is Snappy compatible and will likely decompress faster.
258//
259// The dst and src must not overlap. It is valid to pass a nil dst.
260//
261// The blocks will require the same amount of memory to decode as encoding,
262// and does not make for concurrent decoding.
263// Also note that blocks do not contain CRC information, so corruption may be undetected.
264//
265// If you need to encode larger amounts of data, consider using
266// the streaming interface which gives all of these features.
267func EncodeSnappyBest(dst, src []byte) []byte {
268 if n := MaxEncodedLen(len(src)); n < 0 {
269 panic(ErrTooLarge)
270 } else if cap(dst) < n {
271 dst = make([]byte, n)
272 } else {
273 dst = dst[:n]
274 }
275
276 // The block starts with the varint-encoded length of the decompressed bytes.
277 d := binary.PutUvarint(dst, uint64(len(src)))
278
279 if len(src) == 0 {
280 return dst[:d]
281 }
282 if len(src) < minNonLiteralBlockSize {
283 d += emitLiteral(dst[d:], src)
284 return dst[:d]
285 }
286
287 n := encodeBlockBestSnappy(dst[d:], src)
288 if n > 0 {
289 d += n
290 return dst[:d]
291 }
292 // Not compressible
293 d += emitLiteral(dst[d:], src)
294 return dst[:d]
295}
296
297// ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination.
298// If the destination is nil or too small, a new will be allocated.
299// The blocks are not validated, so garbage in = garbage out.
300// dst may not overlap block data.
301// Any data in dst is preserved as is, so it will not be considered a block.
302func ConcatBlocks(dst []byte, blocks ...[]byte) ([]byte, error) {
303 totalSize := uint64(0)
304 compSize := 0
305 for _, b := range blocks {
306 l, hdr, err := decodedLen(b)
307 if err != nil {
308 return nil, err
309 }
310 totalSize += uint64(l)
311 compSize += len(b) - hdr
312 }
313 if totalSize == 0 {
314 dst = append(dst, 0)
315 return dst, nil
316 }
317 if totalSize > math.MaxUint32 {
318 return nil, ErrTooLarge
319 }
320 var tmp [binary.MaxVarintLen32]byte
321 hdrSize := binary.PutUvarint(tmp[:], totalSize)
322 wantSize := hdrSize + compSize
323
324 if cap(dst)-len(dst) < wantSize {
325 dst = append(make([]byte, 0, wantSize+len(dst)), dst...)
326 }
327 dst = append(dst, tmp[:hdrSize]...)
328 for _, b := range blocks {
329 _, hdr, err := decodedLen(b)
330 if err != nil {
331 return nil, err
332 }
333 dst = append(dst, b[hdr:]...)
334 }
335 return dst, nil
336}
337
// inputMargin is the minimum number of extra input bytes to keep, inside
// encodeBlock's inner loop. On some architectures, this margin lets us
// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
// literals can be implemented as a single load to and store from a 16-byte
// register. That literal's actual length can be as short as 1 byte, so this
// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
// the encoding loop will fix up the copy overrun, and this inputMargin ensures
// that we don't overrun the dst and src buffers.
//
// NOTE(review): the 16-byte figures above do not obviously match the value 8
// used here; confirm against the emitLiteral implementations.
const inputMargin = 8

// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
// will be accepted by the encoder.
// Shorter inputs are emitted as a single literal by the Encode functions.
const minNonLiteralBlockSize = 32

// intReduction is 1 when uint is 32 bits wide and 0 when it is 64 bits wide:
// ^uint(0)>>63 is 1 only for a 64-bit uint, which makes the expression
// 2-(1<<1) = 0, while a 32-bit uint gives 2-(1<<0) = 1.
const intReduction = 2 - (1 << (^uint(0) >> 63)) // 1 (32 bits) or 0 (64 bits)

// MaxBlockSize is the maximum value where MaxEncodedLen will return a valid block size.
// Blocks this big are highly discouraged, though.
// Half the size on 32 bit systems.
// The value is 1<<32-1 (1<<31-1 on 32 bit systems) minus
// binary.MaxVarintLen32 and 5 further bytes.
const MaxBlockSize = (1<<(32-intReduction) - 1) - binary.MaxVarintLen32 - 5
358
359// MaxEncodedLen returns the maximum length of a snappy block, given its
360// uncompressed length.
361//
362// It will return a negative value if srcLen is too large to encode.
363// 32 bit platforms will have lower thresholds for rejecting big content.
364func MaxEncodedLen(srcLen int) int {
365 n := uint64(srcLen)
366 if intReduction == 1 {
367 // 32 bits
368 if n > math.MaxInt32 {
369 // Also includes negative.
370 return -1
371 }
372 } else if n > 0xffffffff {
373 // 64 bits
374 // Also includes negative.
375 return -1
376 }
377 // Size of the varint encoded block size.
378 n = n + uint64((bits.Len64(n)+7)/7)
379
380 // Add maximum size of encoding block as literals.
381 n += uint64(literalExtraSize(int64(srcLen)))
382 if intReduction == 1 {
383 // 32 bits
384 if n > math.MaxInt32 {
385 return -1
386 }
387 } else if n > 0xffffffff {
388 // 64 bits
389 // Also includes negative.
390 return -1
391 }
392 return int(n)
393}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_all.go b/vendor/github.com/klauspost/compress/s2/encode_all.go
new file mode 100644
index 0000000..5e57995
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_all.go
@@ -0,0 +1,1048 @@
1// Copyright 2016 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6package s2
7
8import (
9 "bytes"
10 "encoding/binary"
11 "fmt"
12 "math/bits"
13)
14
// load32 reads the 4 bytes of b starting at offset i as a little-endian uint32.
func load32(b []byte, i int) uint32 {
	tail := b[i:]
	return binary.LittleEndian.Uint32(tail)
}
18
// load64 reads the 8 bytes of b starting at offset i as a little-endian uint64.
func load64(b []byte, i int) uint64 {
	tail := b[i:]
	return binary.LittleEndian.Uint64(tail)
}
22
// hash6 hashes the lowest 6 bytes of u into a value that fits a hash table
// with h bits; the two highest bytes of u are shifted out before mixing.
// Preferably h should be a constant and should always be <64.
func hash6(u uint64, h uint8) uint32 {
	const (
		prime6bytes = 227718039650203
		// Number of high bits of u to discard so only 6 bytes take part.
		dropBits = 64 - 48
	)
	mixed := (u << dropBits) * prime6bytes
	shift := (64 - h) & 63
	return uint32(mixed >> shift)
}
29
30func encodeGo(dst, src []byte) []byte {
31 if n := MaxEncodedLen(len(src)); n < 0 {
32 panic(ErrTooLarge)
33 } else if len(dst) < n {
34 dst = make([]byte, n)
35 }
36
37 // The block starts with the varint-encoded length of the decompressed bytes.
38 d := binary.PutUvarint(dst, uint64(len(src)))
39
40 if len(src) == 0 {
41 return dst[:d]
42 }
43 if len(src) < minNonLiteralBlockSize {
44 d += emitLiteral(dst[d:], src)
45 return dst[:d]
46 }
47 n := encodeBlockGo(dst[d:], src)
48 if n > 0 {
49 d += n
50 return dst[:d]
51 }
52 // Not compressible
53 d += emitLiteral(dst[d:], src)
54 return dst[:d]
55}
56
// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It returns the number of bytes written to dst, or 0 when the output would
// exceed dstLimit (see below), in which case the caller is expected to store
// src as a literal instead.
//
// It also assumes that:
//
//	len(dst) >= MaxEncodedLen(len(src)) &&
//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlockGo(dst, src []byte) (d int) {
	// Initialize the hash table.
	const (
		tableBits    = 14
		maxTableSize = 1 << tableBits

		// debug enables the match validation panics below.
		debug = false
	)

	// table maps a hash of the bytes at a position to that position in src.
	var table [maxTableSize]uint32

	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := len(src) - inputMargin

	// Bail if we can't compress to at least this.
	dstLimit := len(src) - len(src)>>5 - 5

	// nextEmit is where in src the next emitLiteral should start from.
	nextEmit := 0

	// The encoded form must start with a literal, as there are no previous
	// bytes to copy, so we start looking for hash matches at s == 1.
	s := 1
	cv := load64(src, s)

	// We search for a repeat at -1, but don't output repeats when nextEmit == 0
	repeat := 1

	for {
		candidate := 0
		// Search loop: advance s until a repeat match or a hash table match
		// is found, or the input is nearly exhausted.
		for {
			// Next src position to check
			// The step grows by one for every 64 bytes scanned without a match.
			nextS := s + (s-nextEmit)>>6 + 4
			if nextS > sLimit {
				goto emitRemainder
			}
			// Look up the candidates for s and s+1, then record both positions.
			hash0 := hash6(cv, tableBits)
			hash1 := hash6(cv>>8, tableBits)
			candidate = int(table[hash0])
			candidate2 := int(table[hash1])
			table[hash0] = uint32(s)
			table[hash1] = uint32(s + 1)
			hash2 := hash6(cv>>16, tableBits)

			// Check repeat at offset checkRep.
			const checkRep = 1
			if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
				base := s + checkRep
				// Extend back
				for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
					i--
					base--
				}
				d += emitLiteral(dst[d:], src[nextEmit:base])

				// Extend forward
				// This candidate shadows the outer one for the rest of the branch.
				candidate := s - repeat + 4 + checkRep
				s += 4 + checkRep
				for s <= sLimit {
					if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
						// The trailing zero bits give the number of equal leading bytes.
						s += bits.TrailingZeros64(diff) >> 3
						break
					}
					s += 8
					candidate += 8
				}
				if debug {
					// Validate match.
					if s <= candidate {
						panic("s <= candidate")
					}
					a := src[base:s]
					b := src[base-repeat : base-repeat+(s-base)]
					if !bytes.Equal(a, b) {
						panic("mismatch")
					}
				}
				if nextEmit > 0 {
					// same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
					d += emitRepeat(dst[d:], repeat, s-base)
				} else {
					// First match, cannot be repeat.
					d += emitCopy(dst[d:], repeat, s-base)
				}
				nextEmit = s
				if s >= sLimit {
					goto emitRemainder
				}

				cv = load64(src, s)
				continue
			}

			// Match at s against the first candidate.
			if uint32(cv) == load32(src, candidate) {
				break
			}
			candidate = int(table[hash2])
			// Match at s+1 against the second candidate.
			if uint32(cv>>8) == load32(src, candidate2) {
				table[hash2] = uint32(s + 2)
				candidate = candidate2
				s++
				break
			}
			table[hash2] = uint32(s + 2)
			// Match at s+2 against the third candidate.
			if uint32(cv>>16) == load32(src, candidate) {
				s += 2
				break
			}

			cv = load64(src, nextS)
			s = nextS
		}

		// Extend backwards.
		// The top bytes will be rechecked to get the full match.
		for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
			candidate--
			s--
		}

		// Bail if we exceed the maximum size.
		if d+(s-nextEmit) > dstLimit {
			return 0
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.

		d += emitLiteral(dst[d:], src[nextEmit:s])

		// Call emitCopy, and then see if another emitCopy could be our next
		// move. Repeat until we find no match for the input immediately after
		// what was consumed by the last emitCopy call.
		//
		// If we exit this loop normally then we need to call emitLiteral next,
		// though we don't yet know how big the literal will be. We handle that
		// by proceeding to the next iteration of the main loop. We also can
		// exit this loop via goto if we get close to exhausting the input.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.
			base := s
			// Remember the offset so the repeat check above can reuse it.
			repeat = base - candidate

			// Extend the 4-byte match as long as possible.
			s += 4
			candidate += 4
			for s <= len(src)-8 {
				if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
					// The trailing zero bits give the number of equal leading bytes.
					s += bits.TrailingZeros64(diff) >> 3
					break
				}
				s += 8
				candidate += 8
			}

			d += emitCopy(dst[d:], repeat, s-base)
			if debug {
				// Validate match.
				if s <= candidate {
					panic("s <= candidate")
				}
				a := src[base:s]
				b := src[base-repeat : base-repeat+(s-base)]
				if !bytes.Equal(a, b) {
					panic("mismatch")
				}
			}

			nextEmit = s
			if s >= sLimit {
				goto emitRemainder
			}

			if d > dstLimit {
				// Do we have space for more, if not bail.
				return 0
			}
			// Check for an immediate match, otherwise start search at s+1
			// Positions s-2 and s are recorded in the table while doing so.
			x := load64(src, s-2)
			m2Hash := hash6(x, tableBits)
			currHash := hash6(x>>16, tableBits)
			candidate = int(table[currHash])
			table[m2Hash] = uint32(s - 2)
			table[currHash] = uint32(s)
			if debug && s == candidate {
				panic("s == candidate")
			}
			if uint32(x>>16) != load32(src, candidate) {
				cv = load64(src, s+1)
				s++
				break
			}
		}
	}

emitRemainder:
	// Whatever is left after the last match is stored as one literal.
	if nextEmit < len(src) {
		// Bail if we exceed the maximum size.
		if d+len(src)-nextEmit > dstLimit {
			return 0
		}
		d += emitLiteral(dst[d:], src[nextEmit:])
	}
	return d
}
274
275func encodeBlockSnappyGo(dst, src []byte) (d int) {
276 // Initialize the hash table.
277 const (
278 tableBits = 14
279 maxTableSize = 1 << tableBits
280 )
281
282 var table [maxTableSize]uint32
283
284 // sLimit is when to stop looking for offset/length copies. The inputMargin
285 // lets us use a fast path for emitLiteral in the main loop, while we are
286 // looking for copies.
287 sLimit := len(src) - inputMargin
288
289 // Bail if we can't compress to at least this.
290 dstLimit := len(src) - len(src)>>5 - 5
291
292 // nextEmit is where in src the next emitLiteral should start from.
293 nextEmit := 0
294
295 // The encoded form must start with a literal, as there are no previous
296 // bytes to copy, so we start looking for hash matches at s == 1.
297 s := 1
298 cv := load64(src, s)
299
300 // We search for a repeat at -1, but don't output repeats when nextEmit == 0
301 repeat := 1
302
303 for {
304 candidate := 0
305 for {
306 // Next src position to check
307 nextS := s + (s-nextEmit)>>6 + 4
308 if nextS > sLimit {
309 goto emitRemainder
310 }
311 hash0 := hash6(cv, tableBits)
312 hash1 := hash6(cv>>8, tableBits)
313 candidate = int(table[hash0])
314 candidate2 := int(table[hash1])
315 table[hash0] = uint32(s)
316 table[hash1] = uint32(s + 1)
317 hash2 := hash6(cv>>16, tableBits)
318
319 // Check repeat at offset checkRep.
320 const checkRep = 1
321 if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
322 base := s + checkRep
323 // Extend back
324 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
325 i--
326 base--
327 }
328 d += emitLiteral(dst[d:], src[nextEmit:base])
329
330 // Extend forward
331 candidate := s - repeat + 4 + checkRep
332 s += 4 + checkRep
333 for s <= sLimit {
334 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
335 s += bits.TrailingZeros64(diff) >> 3
336 break
337 }
338 s += 8
339 candidate += 8
340 }
341
342 d += emitCopyNoRepeat(dst[d:], repeat, s-base)
343 nextEmit = s
344 if s >= sLimit {
345 goto emitRemainder
346 }
347
348 cv = load64(src, s)
349 continue
350 }
351
352 if uint32(cv) == load32(src, candidate) {
353 break
354 }
355 candidate = int(table[hash2])
356 if uint32(cv>>8) == load32(src, candidate2) {
357 table[hash2] = uint32(s + 2)
358 candidate = candidate2
359 s++
360 break
361 }
362 table[hash2] = uint32(s + 2)
363 if uint32(cv>>16) == load32(src, candidate) {
364 s += 2
365 break
366 }
367
368 cv = load64(src, nextS)
369 s = nextS
370 }
371
372 // Extend backwards
373 for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
374 candidate--
375 s--
376 }
377
378 // Bail if we exceed the maximum size.
379 if d+(s-nextEmit) > dstLimit {
380 return 0
381 }
382
383 // A 4-byte match has been found. We'll later see if more than 4 bytes
384 // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
385 // them as literal bytes.
386
387 d += emitLiteral(dst[d:], src[nextEmit:s])
388
389 // Call emitCopy, and then see if another emitCopy could be our next
390 // move. Repeat until we find no match for the input immediately after
391 // what was consumed by the last emitCopy call.
392 //
393 // If we exit this loop normally then we need to call emitLiteral next,
394 // though we don't yet know how big the literal will be. We handle that
395 // by proceeding to the next iteration of the main loop. We also can
396 // exit this loop via goto if we get close to exhausting the input.
397 for {
398 // Invariant: we have a 4-byte match at s, and no need to emit any
399 // literal bytes prior to s.
400 base := s
401 repeat = base - candidate
402
403 // Extend the 4-byte match as long as possible.
404 s += 4
405 candidate += 4
406 for s <= len(src)-8 {
407 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
408 s += bits.TrailingZeros64(diff) >> 3
409 break
410 }
411 s += 8
412 candidate += 8
413 }
414
415 d += emitCopyNoRepeat(dst[d:], repeat, s-base)
416 if false {
417 // Validate match.
418 a := src[base:s]
419 b := src[base-repeat : base-repeat+(s-base)]
420 if !bytes.Equal(a, b) {
421 panic("mismatch")
422 }
423 }
424
425 nextEmit = s
426 if s >= sLimit {
427 goto emitRemainder
428 }
429
430 if d > dstLimit {
431 // Do we have space for more, if not bail.
432 return 0
433 }
434 // Check for an immediate match, otherwise start search at s+1
435 x := load64(src, s-2)
436 m2Hash := hash6(x, tableBits)
437 currHash := hash6(x>>16, tableBits)
438 candidate = int(table[currHash])
439 table[m2Hash] = uint32(s - 2)
440 table[currHash] = uint32(s)
441 if uint32(x>>16) != load32(src, candidate) {
442 cv = load64(src, s+1)
443 s++
444 break
445 }
446 }
447 }
448
449emitRemainder:
450 if nextEmit < len(src) {
451 // Bail if we exceed the maximum size.
452 if d+len(src)-nextEmit > dstLimit {
453 return 0
454 }
455 d += emitLiteral(dst[d:], src[nextEmit:])
456 }
457 return d
458}
459
460// encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It
461// assumes that the varint-encoded length of the decompressed bytes has already
462// been written.
463//
464// It also assumes that:
465//
466// len(dst) >= MaxEncodedLen(len(src)) &&
467// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
468func encodeBlockDictGo(dst, src []byte, dict *Dict) (d int) {
469 // Initialize the hash table.
470 const (
471 tableBits = 14
472 maxTableSize = 1 << tableBits
473 maxAhead = 8 // maximum bytes ahead without checking sLimit
474
475 debug = false
476 )
477 dict.initFast()
478
479 var table [maxTableSize]uint32
480
481 // sLimit is when to stop looking for offset/length copies. The inputMargin
482 // lets us use a fast path for emitLiteral in the main loop, while we are
483 // looking for copies.
484 sLimit := len(src) - inputMargin
485 if sLimit > MaxDictSrcOffset-maxAhead {
486 sLimit = MaxDictSrcOffset - maxAhead
487 }
488
489 // Bail if we can't compress to at least this.
490 dstLimit := len(src) - len(src)>>5 - 5
491
492 // nextEmit is where in src the next emitLiteral should start from.
493 nextEmit := 0
494
495 // The encoded form can start with a dict entry (copy or repeat).
496 s := 0
497
498 // Convert dict repeat to offset
499 repeat := len(dict.dict) - dict.repeat
500 cv := load64(src, 0)
501
502 // While in dict
503searchDict:
504 for {
505 // Next src position to check
506 nextS := s + (s-nextEmit)>>6 + 4
507 hash0 := hash6(cv, tableBits)
508 hash1 := hash6(cv>>8, tableBits)
509 if nextS > sLimit {
510 if debug {
511 fmt.Println("slimit reached", s, nextS)
512 }
513 break searchDict
514 }
515 candidateDict := int(dict.fastTable[hash0])
516 candidateDict2 := int(dict.fastTable[hash1])
517 candidate2 := int(table[hash1])
518 candidate := int(table[hash0])
519 table[hash0] = uint32(s)
520 table[hash1] = uint32(s + 1)
521 hash2 := hash6(cv>>16, tableBits)
522
523 // Check repeat at offset checkRep.
524 const checkRep = 1
525
526 if repeat > s {
527 candidate := len(dict.dict) - repeat + s
528 if repeat-s >= 4 && uint32(cv) == load32(dict.dict, candidate) {
529 // Extend back
530 base := s
531 for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
532 i--
533 base--
534 }
535 d += emitLiteral(dst[d:], src[nextEmit:base])
536 if debug && nextEmit != base {
537 fmt.Println("emitted ", base-nextEmit, "literals")
538 }
539 s += 4
540 candidate += 4
541 for candidate < len(dict.dict)-8 && s <= len(src)-8 {
542 if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
543 s += bits.TrailingZeros64(diff) >> 3
544 break
545 }
546 s += 8
547 candidate += 8
548 }
549 d += emitRepeat(dst[d:], repeat, s-base)
550 if debug {
551 fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
552 }
553 nextEmit = s
554 if s >= sLimit {
555 break searchDict
556 }
557 cv = load64(src, s)
558 continue
559 }
560 } else if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
561 base := s + checkRep
562 // Extend back
563 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
564 i--
565 base--
566 }
567 d += emitLiteral(dst[d:], src[nextEmit:base])
568 if debug && nextEmit != base {
569 fmt.Println("emitted ", base-nextEmit, "literals")
570 }
571
572 // Extend forward
573 candidate := s - repeat + 4 + checkRep
574 s += 4 + checkRep
575 for s <= sLimit {
576 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
577 s += bits.TrailingZeros64(diff) >> 3
578 break
579 }
580 s += 8
581 candidate += 8
582 }
583 if debug {
584 // Validate match.
585 if s <= candidate {
586 panic("s <= candidate")
587 }
588 a := src[base:s]
589 b := src[base-repeat : base-repeat+(s-base)]
590 if !bytes.Equal(a, b) {
591 panic("mismatch")
592 }
593 }
594
595 if nextEmit > 0 {
596 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
597 d += emitRepeat(dst[d:], repeat, s-base)
598 } else {
599 // First match, cannot be repeat.
600 d += emitCopy(dst[d:], repeat, s-base)
601 }
602
603 nextEmit = s
604 if s >= sLimit {
605 break searchDict
606 }
607 if debug {
608 fmt.Println("emitted reg repeat", s-base, "s:", s)
609 }
610 cv = load64(src, s)
611 continue searchDict
612 }
613 if s == 0 {
614 cv = load64(src, nextS)
615 s = nextS
616 continue searchDict
617 }
618 // Start with table. These matches will always be closer.
619 if uint32(cv) == load32(src, candidate) {
620 goto emitMatch
621 }
622 candidate = int(table[hash2])
623 if uint32(cv>>8) == load32(src, candidate2) {
624 table[hash2] = uint32(s + 2)
625 candidate = candidate2
626 s++
627 goto emitMatch
628 }
629
630 // Check dict. Dicts have longer offsets, so we want longer matches.
631 if cv == load64(dict.dict, candidateDict) {
632 table[hash2] = uint32(s + 2)
633 goto emitDict
634 }
635
636 candidateDict = int(dict.fastTable[hash2])
637 // Check if upper 7 bytes match
638 if candidateDict2 >= 1 {
639 if cv^load64(dict.dict, candidateDict2-1) < (1 << 8) {
640 table[hash2] = uint32(s + 2)
641 candidateDict = candidateDict2
642 s++
643 goto emitDict
644 }
645 }
646
647 table[hash2] = uint32(s + 2)
648 if uint32(cv>>16) == load32(src, candidate) {
649 s += 2
650 goto emitMatch
651 }
652 if candidateDict >= 2 {
653 // Check if upper 6 bytes match
654 if cv^load64(dict.dict, candidateDict-2) < (1 << 16) {
655 s += 2
656 goto emitDict
657 }
658 }
659
660 cv = load64(src, nextS)
661 s = nextS
662 continue searchDict
663
664 emitDict:
665 {
666 if debug {
667 if load32(dict.dict, candidateDict) != load32(src, s) {
668 panic("dict emit mismatch")
669 }
670 }
671 // Extend backwards.
672 // The top bytes will be rechecked to get the full match.
673 for candidateDict > 0 && s > nextEmit && dict.dict[candidateDict-1] == src[s-1] {
674 candidateDict--
675 s--
676 }
677
678 // Bail if we exceed the maximum size.
679 if d+(s-nextEmit) > dstLimit {
680 return 0
681 }
682
683 // A 4-byte match has been found. We'll later see if more than 4 bytes
684 // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
685 // them as literal bytes.
686
687 d += emitLiteral(dst[d:], src[nextEmit:s])
688 if debug && nextEmit != s {
689 fmt.Println("emitted ", s-nextEmit, "literals")
690 }
691 {
692 // Invariant: we have a 4-byte match at s, and no need to emit any
693 // literal bytes prior to s.
694 base := s
695 repeat = s + (len(dict.dict)) - candidateDict
696
697 // Extend the 4-byte match as long as possible.
698 s += 4
699 candidateDict += 4
700 for s <= len(src)-8 && len(dict.dict)-candidateDict >= 8 {
701 if diff := load64(src, s) ^ load64(dict.dict, candidateDict); diff != 0 {
702 s += bits.TrailingZeros64(diff) >> 3
703 break
704 }
705 s += 8
706 candidateDict += 8
707 }
708
709 // Matches longer than 64 are split.
710 if s <= sLimit || s-base < 8 {
711 d += emitCopy(dst[d:], repeat, s-base)
712 } else {
713 // Split to ensure we don't start a copy within next block
714 d += emitCopy(dst[d:], repeat, 4)
715 d += emitRepeat(dst[d:], repeat, s-base-4)
716 }
717 if false {
718 // Validate match.
719 if s <= candidate {
720 panic("s <= candidate")
721 }
722 a := src[base:s]
723 b := dict.dict[base-repeat : base-repeat+(s-base)]
724 if !bytes.Equal(a, b) {
725 panic("mismatch")
726 }
727 }
728 if debug {
729 fmt.Println("emitted dict copy, length", s-base, "offset:", repeat, "s:", s)
730 }
731 nextEmit = s
732 if s >= sLimit {
733 break searchDict
734 }
735
736 if d > dstLimit {
737 // Do we have space for more, if not bail.
738 return 0
739 }
740
741 // Index and continue loop to try new candidate.
742 x := load64(src, s-2)
743 m2Hash := hash6(x, tableBits)
744 currHash := hash6(x>>8, tableBits)
745 table[m2Hash] = uint32(s - 2)
746 table[currHash] = uint32(s - 1)
747 cv = load64(src, s)
748 }
749 continue
750 }
751 emitMatch:
752
753 // Extend backwards.
754 // The top bytes will be rechecked to get the full match.
755 for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
756 candidate--
757 s--
758 }
759
760 // Bail if we exceed the maximum size.
761 if d+(s-nextEmit) > dstLimit {
762 return 0
763 }
764
765 // A 4-byte match has been found. We'll later see if more than 4 bytes
766 // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
767 // them as literal bytes.
768
769 d += emitLiteral(dst[d:], src[nextEmit:s])
770 if debug && nextEmit != s {
771 fmt.Println("emitted ", s-nextEmit, "literals")
772 }
773 // Call emitCopy, and then see if another emitCopy could be our next
774 // move. Repeat until we find no match for the input immediately after
775 // what was consumed by the last emitCopy call.
776 //
777 // If we exit this loop normally then we need to call emitLiteral next,
778 // though we don't yet know how big the literal will be. We handle that
779 // by proceeding to the next iteration of the main loop. We also can
780 // exit this loop via goto if we get close to exhausting the input.
781 for {
782 // Invariant: we have a 4-byte match at s, and no need to emit any
783 // literal bytes prior to s.
784 base := s
785 repeat = base - candidate
786
787 // Extend the 4-byte match as long as possible.
788 s += 4
789 candidate += 4
790 for s <= len(src)-8 {
791 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
792 s += bits.TrailingZeros64(diff) >> 3
793 break
794 }
795 s += 8
796 candidate += 8
797 }
798
799 d += emitCopy(dst[d:], repeat, s-base)
800 if debug {
801 // Validate match.
802 if s <= candidate {
803 panic("s <= candidate")
804 }
805 a := src[base:s]
806 b := src[base-repeat : base-repeat+(s-base)]
807 if !bytes.Equal(a, b) {
808 panic("mismatch")
809 }
810 }
811 if debug {
812 fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s)
813 }
814 nextEmit = s
815 if s >= sLimit {
816 break searchDict
817 }
818
819 if d > dstLimit {
820 // Do we have space for more, if not bail.
821 return 0
822 }
823 // Check for an immediate match, otherwise start search at s+1
824 x := load64(src, s-2)
825 m2Hash := hash6(x, tableBits)
826 currHash := hash6(x>>16, tableBits)
827 candidate = int(table[currHash])
828 table[m2Hash] = uint32(s - 2)
829 table[currHash] = uint32(s)
830 if debug && s == candidate {
831 panic("s == candidate")
832 }
833 if uint32(x>>16) != load32(src, candidate) {
834 cv = load64(src, s+1)
835 s++
836 break
837 }
838 }
839 }
840
841 // Search without dict:
842 if repeat > s {
843 repeat = 0
844 }
845
846 // No more dict
847 sLimit = len(src) - inputMargin
848 if s >= sLimit {
849 goto emitRemainder
850 }
851 if debug {
852 fmt.Println("non-dict matching at", s, "repeat:", repeat)
853 }
854 cv = load64(src, s)
855 if debug {
856 fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
857 }
858 for {
859 candidate := 0
860 for {
861 // Next src position to check
862 nextS := s + (s-nextEmit)>>6 + 4
863 if nextS > sLimit {
864 goto emitRemainder
865 }
866 hash0 := hash6(cv, tableBits)
867 hash1 := hash6(cv>>8, tableBits)
868 candidate = int(table[hash0])
869 candidate2 := int(table[hash1])
870 table[hash0] = uint32(s)
871 table[hash1] = uint32(s + 1)
872 hash2 := hash6(cv>>16, tableBits)
873
874 // Check repeat at offset checkRep.
875 const checkRep = 1
876 if repeat > 0 && uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
877 base := s + checkRep
878 // Extend back
879 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
880 i--
881 base--
882 }
883 d += emitLiteral(dst[d:], src[nextEmit:base])
884 if debug && nextEmit != base {
885 fmt.Println("emitted ", base-nextEmit, "literals")
886 }
887 // Extend forward
888 candidate := s - repeat + 4 + checkRep
889 s += 4 + checkRep
890 for s <= sLimit {
891 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
892 s += bits.TrailingZeros64(diff) >> 3
893 break
894 }
895 s += 8
896 candidate += 8
897 }
898 if debug {
899 // Validate match.
900 if s <= candidate {
901 panic("s <= candidate")
902 }
903 a := src[base:s]
904 b := src[base-repeat : base-repeat+(s-base)]
905 if !bytes.Equal(a, b) {
906 panic("mismatch")
907 }
908 }
909 if nextEmit > 0 {
910 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
911 d += emitRepeat(dst[d:], repeat, s-base)
912 } else {
913 // First match, cannot be repeat.
914 d += emitCopy(dst[d:], repeat, s-base)
915 }
916 if debug {
917 fmt.Println("emitted src repeat length", s-base, "offset:", repeat, "s:", s)
918 }
919 nextEmit = s
920 if s >= sLimit {
921 goto emitRemainder
922 }
923
924 cv = load64(src, s)
925 continue
926 }
927
928 if uint32(cv) == load32(src, candidate) {
929 break
930 }
931 candidate = int(table[hash2])
932 if uint32(cv>>8) == load32(src, candidate2) {
933 table[hash2] = uint32(s + 2)
934 candidate = candidate2
935 s++
936 break
937 }
938 table[hash2] = uint32(s + 2)
939 if uint32(cv>>16) == load32(src, candidate) {
940 s += 2
941 break
942 }
943
944 cv = load64(src, nextS)
945 s = nextS
946 }
947
948 // Extend backwards.
949 // The top bytes will be rechecked to get the full match.
950 for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
951 candidate--
952 s--
953 }
954
955 // Bail if we exceed the maximum size.
956 if d+(s-nextEmit) > dstLimit {
957 return 0
958 }
959
960 // A 4-byte match has been found. We'll later see if more than 4 bytes
961 // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
962 // them as literal bytes.
963
964 d += emitLiteral(dst[d:], src[nextEmit:s])
965 if debug && nextEmit != s {
966 fmt.Println("emitted ", s-nextEmit, "literals")
967 }
968 // Call emitCopy, and then see if another emitCopy could be our next
969 // move. Repeat until we find no match for the input immediately after
970 // what was consumed by the last emitCopy call.
971 //
972 // If we exit this loop normally then we need to call emitLiteral next,
973 // though we don't yet know how big the literal will be. We handle that
974 // by proceeding to the next iteration of the main loop. We also can
975 // exit this loop via goto if we get close to exhausting the input.
976 for {
977 // Invariant: we have a 4-byte match at s, and no need to emit any
978 // literal bytes prior to s.
979 base := s
980 repeat = base - candidate
981
982 // Extend the 4-byte match as long as possible.
983 s += 4
984 candidate += 4
985 for s <= len(src)-8 {
986 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
987 s += bits.TrailingZeros64(diff) >> 3
988 break
989 }
990 s += 8
991 candidate += 8
992 }
993
994 d += emitCopy(dst[d:], repeat, s-base)
995 if debug {
996 // Validate match.
997 if s <= candidate {
998 panic("s <= candidate")
999 }
1000 a := src[base:s]
1001 b := src[base-repeat : base-repeat+(s-base)]
1002 if !bytes.Equal(a, b) {
1003 panic("mismatch")
1004 }
1005 }
1006 if debug {
1007 fmt.Println("emitted src copy, length", s-base, "offset:", repeat, "s:", s)
1008 }
1009 nextEmit = s
1010 if s >= sLimit {
1011 goto emitRemainder
1012 }
1013
1014 if d > dstLimit {
1015 // Do we have space for more, if not bail.
1016 return 0
1017 }
1018 // Check for an immediate match, otherwise start search at s+1
1019 x := load64(src, s-2)
1020 m2Hash := hash6(x, tableBits)
1021 currHash := hash6(x>>16, tableBits)
1022 candidate = int(table[currHash])
1023 table[m2Hash] = uint32(s - 2)
1024 table[currHash] = uint32(s)
1025 if debug && s == candidate {
1026 panic("s == candidate")
1027 }
1028 if uint32(x>>16) != load32(src, candidate) {
1029 cv = load64(src, s+1)
1030 s++
1031 break
1032 }
1033 }
1034 }
1035
1036emitRemainder:
1037 if nextEmit < len(src) {
1038 // Bail if we exceed the maximum size.
1039 if d+len(src)-nextEmit > dstLimit {
1040 return 0
1041 }
1042 d += emitLiteral(dst[d:], src[nextEmit:])
1043 if debug && nextEmit != s {
1044 fmt.Println("emitted ", len(src)-nextEmit, "literals")
1045 }
1046 }
1047 return d
1048}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_amd64.go b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
new file mode 100644
index 0000000..ebc332a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_amd64.go
@@ -0,0 +1,148 @@
1//go:build !appengine && !noasm && gc
2// +build !appengine,!noasm,gc
3
4package s2
5
// hasAmd64Asm is true in builds that include this file, i.e. when the build
// constraints above hold (gc toolchain, no appengine or noasm tag).
// NOTE(review): presumably it tells the rest of the package that the amd64
// assembly encoders are available — confirm against its users.
const hasAmd64Asm = true
7
8// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
9// assumes that the varint-encoded length of the decompressed bytes has already
10// been written.
11//
12// It also assumes that:
13//
14// len(dst) >= MaxEncodedLen(len(src)) &&
15// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
16func encodeBlock(dst, src []byte) (d int) {
17 const (
18 // Use 12 bit table when less than...
19 limit12B = 16 << 10
20 // Use 10 bit table when less than...
21 limit10B = 4 << 10
22 // Use 8 bit table when less than...
23 limit8B = 512
24 )
25
26 if len(src) >= 4<<20 {
27 return encodeBlockAsm(dst, src)
28 }
29 if len(src) >= limit12B {
30 return encodeBlockAsm4MB(dst, src)
31 }
32 if len(src) >= limit10B {
33 return encodeBlockAsm12B(dst, src)
34 }
35 if len(src) >= limit8B {
36 return encodeBlockAsm10B(dst, src)
37 }
38 if len(src) < minNonLiteralBlockSize {
39 return 0
40 }
41 return encodeBlockAsm8B(dst, src)
42}
43
44// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
45// assumes that the varint-encoded length of the decompressed bytes has already
46// been written.
47//
48// It also assumes that:
49//
50// len(dst) >= MaxEncodedLen(len(src)) &&
51// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
52func encodeBlockBetter(dst, src []byte) (d int) {
53 const (
54 // Use 12 bit table when less than...
55 limit12B = 16 << 10
56 // Use 10 bit table when less than...
57 limit10B = 4 << 10
58 // Use 8 bit table when less than...
59 limit8B = 512
60 )
61
62 if len(src) > 4<<20 {
63 return encodeBetterBlockAsm(dst, src)
64 }
65 if len(src) >= limit12B {
66 return encodeBetterBlockAsm4MB(dst, src)
67 }
68 if len(src) >= limit10B {
69 return encodeBetterBlockAsm12B(dst, src)
70 }
71 if len(src) >= limit8B {
72 return encodeBetterBlockAsm10B(dst, src)
73 }
74 if len(src) < minNonLiteralBlockSize {
75 return 0
76 }
77 return encodeBetterBlockAsm8B(dst, src)
78}
79
80// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
81// assumes that the varint-encoded length of the decompressed bytes has already
82// been written.
83//
84// It also assumes that:
85//
86// len(dst) >= MaxEncodedLen(len(src)) &&
87// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
88func encodeBlockSnappy(dst, src []byte) (d int) {
89 const (
90 // Use 12 bit table when less than...
91 limit12B = 16 << 10
92 // Use 10 bit table when less than...
93 limit10B = 4 << 10
94 // Use 8 bit table when less than...
95 limit8B = 512
96 )
97 if len(src) >= 64<<10 {
98 return encodeSnappyBlockAsm(dst, src)
99 }
100 if len(src) >= limit12B {
101 return encodeSnappyBlockAsm64K(dst, src)
102 }
103 if len(src) >= limit10B {
104 return encodeSnappyBlockAsm12B(dst, src)
105 }
106 if len(src) >= limit8B {
107 return encodeSnappyBlockAsm10B(dst, src)
108 }
109 if len(src) < minNonLiteralBlockSize {
110 return 0
111 }
112 return encodeSnappyBlockAsm8B(dst, src)
113}
114
115// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
116// assumes that the varint-encoded length of the decompressed bytes has already
117// been written.
118//
119// It also assumes that:
120//
121// len(dst) >= MaxEncodedLen(len(src)) &&
122// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
123func encodeBlockBetterSnappy(dst, src []byte) (d int) {
124 const (
125 // Use 12 bit table when less than...
126 limit12B = 16 << 10
127 // Use 10 bit table when less than...
128 limit10B = 4 << 10
129 // Use 8 bit table when less than...
130 limit8B = 512
131 )
132 if len(src) >= 64<<10 {
133 return encodeSnappyBetterBlockAsm(dst, src)
134 }
135 if len(src) >= limit12B {
136 return encodeSnappyBetterBlockAsm64K(dst, src)
137 }
138 if len(src) >= limit10B {
139 return encodeSnappyBetterBlockAsm12B(dst, src)
140 }
141 if len(src) >= limit8B {
142 return encodeSnappyBetterBlockAsm10B(dst, src)
143 }
144 if len(src) < minNonLiteralBlockSize {
145 return 0
146 }
147 return encodeSnappyBetterBlockAsm8B(dst, src)
148}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_best.go b/vendor/github.com/klauspost/compress/s2/encode_best.go
new file mode 100644
index 0000000..47bac74
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_best.go
@@ -0,0 +1,796 @@
1// Copyright 2016 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6package s2
7
8import (
9 "fmt"
10 "math"
11 "math/bits"
12)
13
14// encodeBlockBest encodes a non-empty src to a guaranteed-large-enough dst. It
15// assumes that the varint-encoded length of the decompressed bytes has already
16// been written.
17//
18// It also assumes that:
19//
20// len(dst) >= MaxEncodedLen(len(src)) &&
21// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
22func encodeBlockBest(dst, src []byte, dict *Dict) (d int) {
23 // Initialize the hash tables.
24 const (
25 // Long hash matches.
26 lTableBits = 19
27 maxLTableSize = 1 << lTableBits
28
29 // Short hash matches.
30 sTableBits = 16
31 maxSTableSize = 1 << sTableBits
32
33 inputMargin = 8 + 2
34
35 debug = false
36 )
37
38 // sLimit is when to stop looking for offset/length copies. The inputMargin
39 // lets us use a fast path for emitLiteral in the main loop, while we are
40 // looking for copies.
41 sLimit := len(src) - inputMargin
42 if len(src) < minNonLiteralBlockSize {
43 return 0
44 }
45 sLimitDict := len(src) - inputMargin
46 if sLimitDict > MaxDictSrcOffset-inputMargin {
47 sLimitDict = MaxDictSrcOffset - inputMargin
48 }
49
50 var lTable [maxLTableSize]uint64
51 var sTable [maxSTableSize]uint64
52
53 // Bail if we can't compress to at least this.
54 dstLimit := len(src) - 5
55
56 // nextEmit is where in src the next emitLiteral should start from.
57 nextEmit := 0
58
59 // The encoded form must start with a literal, as there are no previous
60 // bytes to copy, so we start looking for hash matches at s == 1.
61 s := 1
62 repeat := 1
63 if dict != nil {
64 dict.initBest()
65 s = 0
66 repeat = len(dict.dict) - dict.repeat
67 }
68 cv := load64(src, s)
69
70 // We search for a repeat at -1, but don't output repeats when nextEmit == 0
71 const lowbitMask = 0xffffffff
72 getCur := func(x uint64) int {
73 return int(x & lowbitMask)
74 }
75 getPrev := func(x uint64) int {
76 return int(x >> 32)
77 }
78 const maxSkip = 64
79
80 for {
81 type match struct {
82 offset int
83 s int
84 length int
85 score int
86 rep, dict bool
87 }
88 var best match
89 for {
90 // Next src position to check
91 nextS := (s-nextEmit)>>8 + 1
92 if nextS > maxSkip {
93 nextS = s + maxSkip
94 } else {
95 nextS += s
96 }
97 if nextS > sLimit {
98 goto emitRemainder
99 }
100 if dict != nil && s >= MaxDictSrcOffset {
101 dict = nil
102 if repeat > s {
103 repeat = math.MinInt32
104 }
105 }
106 hashL := hash8(cv, lTableBits)
107 hashS := hash4(cv, sTableBits)
108 candidateL := lTable[hashL]
109 candidateS := sTable[hashS]
110
111 score := func(m match) int {
112 // Matches that are longer forward are penalized since we must emit it as a literal.
113 score := m.length - m.s
114 if nextEmit == m.s {
115 // If we do not have to emit literals, we save 1 byte
116 score++
117 }
118 offset := m.s - m.offset
119 if m.rep {
120 return score - emitRepeatSize(offset, m.length)
121 }
122 return score - emitCopySize(offset, m.length)
123 }
124
125 matchAt := func(offset, s int, first uint32, rep bool) match {
126 if best.length != 0 && best.s-best.offset == s-offset {
127 // Don't retest if we have the same offset.
128 return match{offset: offset, s: s}
129 }
130 if load32(src, offset) != first {
131 return match{offset: offset, s: s}
132 }
133 m := match{offset: offset, s: s, length: 4 + offset, rep: rep}
134 s += 4
135 for s < len(src) {
136 if len(src)-s < 8 {
137 if src[s] == src[m.length] {
138 m.length++
139 s++
140 continue
141 }
142 break
143 }
144 if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
145 m.length += bits.TrailingZeros64(diff) >> 3
146 break
147 }
148 s += 8
149 m.length += 8
150 }
151 m.length -= offset
152 m.score = score(m)
153 if m.score <= -m.s {
154 // Eliminate if no savings, we might find a better one.
155 m.length = 0
156 }
157 return m
158 }
159 matchDict := func(candidate, s int, first uint32, rep bool) match {
160 if s >= MaxDictSrcOffset {
161 return match{offset: candidate, s: s}
162 }
163 // Calculate offset as if in continuous array with s
164 offset := -len(dict.dict) + candidate
165 if best.length != 0 && best.s-best.offset == s-offset && !rep {
166 // Don't retest if we have the same offset.
167 return match{offset: offset, s: s}
168 }
169
170 if load32(dict.dict, candidate) != first {
171 return match{offset: offset, s: s}
172 }
173 m := match{offset: offset, s: s, length: 4 + candidate, rep: rep, dict: true}
174 s += 4
175 if !rep {
176 for s < sLimitDict && m.length < len(dict.dict) {
177 if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
178 if src[s] == dict.dict[m.length] {
179 m.length++
180 s++
181 continue
182 }
183 break
184 }
185 if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
186 m.length += bits.TrailingZeros64(diff) >> 3
187 break
188 }
189 s += 8
190 m.length += 8
191 }
192 } else {
193 for s < len(src) && m.length < len(dict.dict) {
194 if len(src)-s < 8 || len(dict.dict)-m.length < 8 {
195 if src[s] == dict.dict[m.length] {
196 m.length++
197 s++
198 continue
199 }
200 break
201 }
202 if diff := load64(src, s) ^ load64(dict.dict, m.length); diff != 0 {
203 m.length += bits.TrailingZeros64(diff) >> 3
204 break
205 }
206 s += 8
207 m.length += 8
208 }
209 }
210 m.length -= candidate
211 m.score = score(m)
212 if m.score <= -m.s {
213 // Eliminate if no savings, we might find a better one.
214 m.length = 0
215 }
216 return m
217 }
218
219 bestOf := func(a, b match) match {
220 if b.length == 0 {
221 return a
222 }
223 if a.length == 0 {
224 return b
225 }
226 as := a.score + b.s
227 bs := b.score + a.s
228 if as >= bs {
229 return a
230 }
231 return b
232 }
233
234 if s > 0 {
235 best = bestOf(matchAt(getCur(candidateL), s, uint32(cv), false), matchAt(getPrev(candidateL), s, uint32(cv), false))
236 best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv), false))
237 best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv), false))
238 }
239 if dict != nil {
240 candidateL := dict.bestTableLong[hashL]
241 candidateS := dict.bestTableShort[hashS]
242 best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
243 best = bestOf(best, matchDict(int(candidateL>>16), s, uint32(cv), false))
244 best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
245 best = bestOf(best, matchDict(int(candidateS>>16), s, uint32(cv), false))
246 }
247 {
248 if (dict == nil || repeat <= s) && repeat > 0 {
249 best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8), true))
250 } else if s-repeat < -4 && dict != nil {
251 candidate := len(dict.dict) - (repeat - s)
252 best = bestOf(best, matchDict(candidate, s, uint32(cv), true))
253 candidate++
254 best = bestOf(best, matchDict(candidate, s+1, uint32(cv>>8), true))
255 }
256
257 if best.length > 0 {
258 hashS := hash4(cv>>8, sTableBits)
259 // s+1
260 nextShort := sTable[hashS]
261 s := s + 1
262 cv := load64(src, s)
263 hashL := hash8(cv, lTableBits)
264 nextLong := lTable[hashL]
265 best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
266 best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
267 best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
268 best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
269
270 // Dict at + 1
271 if dict != nil {
272 candidateL := dict.bestTableLong[hashL]
273 candidateS := dict.bestTableShort[hashS]
274
275 best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
276 best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
277 }
278
279 // s+2
280 if true {
281 hashS := hash4(cv>>8, sTableBits)
282
283 nextShort = sTable[hashS]
284 s++
285 cv = load64(src, s)
286 hashL := hash8(cv, lTableBits)
287 nextLong = lTable[hashL]
288
289 if (dict == nil || repeat <= s) && repeat > 0 {
290 // Repeat at + 2
291 best = bestOf(best, matchAt(s-repeat, s, uint32(cv), true))
292 } else if repeat-s > 4 && dict != nil {
293 candidate := len(dict.dict) - (repeat - s)
294 best = bestOf(best, matchDict(candidate, s, uint32(cv), true))
295 }
296 best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv), false))
297 best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv), false))
298 best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv), false))
299 best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv), false))
300
301 // Dict at +2
302 // Very small gain
303 if dict != nil {
304 candidateL := dict.bestTableLong[hashL]
305 candidateS := dict.bestTableShort[hashS]
306
307 best = bestOf(best, matchDict(int(candidateL&0xffff), s, uint32(cv), false))
308 best = bestOf(best, matchDict(int(candidateS&0xffff), s, uint32(cv), false))
309 }
310 }
311 // Search for a match at best match end, see if that is better.
312 // Allow some bytes at the beginning to mismatch.
313 // Sweet spot is around 1-2 bytes, but depends on input.
314 // The skipped bytes are tested in Extend backwards,
315 // and still picked up as part of the match if they do.
316 const skipBeginning = 2
317 const skipEnd = 1
318 if sAt := best.s + best.length - skipEnd; sAt < sLimit {
319
320 sBack := best.s + skipBeginning - skipEnd
321 backL := best.length - skipBeginning
322 // Load initial values
323 cv = load64(src, sBack)
324
325 // Grab candidates...
326 next := lTable[hash8(load64(src, sAt), lTableBits)]
327
328 if checkAt := getCur(next) - backL; checkAt > 0 {
329 best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
330 }
331 if checkAt := getPrev(next) - backL; checkAt > 0 {
332 best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
333 }
334 // Disabled: Extremely small gain
335 if false {
336 next = sTable[hash4(load64(src, sAt), sTableBits)]
337 if checkAt := getCur(next) - backL; checkAt > 0 {
338 best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
339 }
340 if checkAt := getPrev(next) - backL; checkAt > 0 {
341 best = bestOf(best, matchAt(checkAt, sBack, uint32(cv), false))
342 }
343 }
344 }
345 }
346 }
347
348 // Update table
349 lTable[hashL] = uint64(s) | candidateL<<32
350 sTable[hashS] = uint64(s) | candidateS<<32
351
352 if best.length > 0 {
353 break
354 }
355
356 cv = load64(src, nextS)
357 s = nextS
358 }
359
360 // Extend backwards, not needed for repeats...
361 s = best.s
362 if !best.rep && !best.dict {
363 for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
364 best.offset--
365 best.length++
366 s--
367 }
368 }
369 if false && best.offset >= s {
370 panic(fmt.Errorf("t %d >= s %d", best.offset, s))
371 }
372 // Bail if we exceed the maximum size.
373 if d+(s-nextEmit) > dstLimit {
374 return 0
375 }
376
377 base := s
378 offset := s - best.offset
379 s += best.length
380
381 if offset > 65535 && s-base <= 5 && !best.rep {
382 // Bail if the match is equal or worse to the encoding.
383 s = best.s + 1
384 if s >= sLimit {
385 goto emitRemainder
386 }
387 cv = load64(src, s)
388 continue
389 }
390 if debug && nextEmit != base {
391 fmt.Println("EMIT", base-nextEmit, "literals. base-after:", base)
392 }
393 d += emitLiteral(dst[d:], src[nextEmit:base])
394 if best.rep {
395 if nextEmit > 0 || best.dict {
396 if debug {
397 fmt.Println("REPEAT, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
398 }
399 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
400 d += emitRepeat(dst[d:], offset, best.length)
401 } else {
402 // First match without dict cannot be a repeat.
403 if debug {
404 fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
405 }
406 d += emitCopy(dst[d:], offset, best.length)
407 }
408 } else {
409 if debug {
410 fmt.Println("COPY, length", best.length, "offset:", offset, "s-after:", s, "dict:", best.dict, "best:", best)
411 }
412 d += emitCopy(dst[d:], offset, best.length)
413 }
414 repeat = offset
415
416 nextEmit = s
417 if s >= sLimit {
418 goto emitRemainder
419 }
420
421 if d > dstLimit {
422 // Do we have space for more, if not bail.
423 return 0
424 }
425 // Fill tables...
426 for i := best.s + 1; i < s; i++ {
427 cv0 := load64(src, i)
428 long0 := hash8(cv0, lTableBits)
429 short0 := hash4(cv0, sTableBits)
430 lTable[long0] = uint64(i) | lTable[long0]<<32
431 sTable[short0] = uint64(i) | sTable[short0]<<32
432 }
433 cv = load64(src, s)
434 }
435
436emitRemainder:
437 if nextEmit < len(src) {
438 // Bail if we exceed the maximum size.
439 if d+len(src)-nextEmit > dstLimit {
440 return 0
441 }
442 if debug && nextEmit != s {
443 fmt.Println("emitted ", len(src)-nextEmit, "literals")
444 }
445 d += emitLiteral(dst[d:], src[nextEmit:])
446 }
447 return d
448}
449
450// encodeBlockBestSnappy encodes a non-empty src to a guaranteed-large-enough dst. It
451// assumes that the varint-encoded length of the decompressed bytes has already
452// been written. Copies are written with emitCopyNoRepeat, so no repeat operations are emitted.
453// It returns the number of bytes written to dst, or 0 if src is shorter than minNonLiteralBlockSize or the output would exceed len(src)-5 bytes.
454// It also assumes that:
455//
456// len(dst) >= MaxEncodedLen(len(src)) &&
457// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
458func encodeBlockBestSnappy(dst, src []byte) (d int) {
459 // Initialize the hash tables.
460 const (
461 // Long hash matches.
462 lTableBits = 19
463 maxLTableSize = 1 << lTableBits
464
465 // Short hash matches.
466 sTableBits = 16
467 maxSTableSize = 1 << sTableBits
468
469 inputMargin = 8 + 2
470 )
471
472 // sLimit is when to stop looking for offset/length copies. The inputMargin
473 // lets us use a fast path for emitLiteral in the main loop, while we are
474 // looking for copies.
475 sLimit := len(src) - inputMargin
476 if len(src) < minNonLiteralBlockSize {
477 return 0
478 }
479
480 var lTable [maxLTableSize]uint64 // each entry: low 32 bits = most recent position, high 32 bits = the position stored before it
481 var sTable [maxSTableSize]uint64 // same layout as lTable, indexed by the 4-byte hash
482
483 // Bail if we can't compress to at least this.
484 dstLimit := len(src) - 5
485
486 // nextEmit is where in src the next emitLiteral should start from.
487 nextEmit := 0
488
489 // The encoded form must start with a literal, as there are no previous
490 // bytes to copy, so we start looking for hash matches at s == 1.
491 s := 1
492 cv := load64(src, s)
493
494 // repeat is the offset of the most recently emitted copy (initially 1). It is only tried as an extra match candidate; this function never emits repeats.
495 repeat := 1
496 const lowbitMask = 0xffffffff
497 getCur := func(x uint64) int { // most recent position stored in a table entry
498 return int(x & lowbitMask)
499 }
500 getPrev := func(x uint64) int { // previous position stored in a table entry
501 return int(x >> 32)
502 }
503 const maxSkip = 64
504
505 for {
506 type match struct {
507 offset int // position in src of the candidate (match source), not a distance
508 s int // position in src where the match starts
509 length int
510 score int
511 }
512 var best match
513 for {
514 // Next src position to check: the step grows by one for every 256 bytes since the last emit, capped at maxSkip.
515 nextS := (s-nextEmit)>>8 + 1
516 if nextS > maxSkip {
517 nextS = s + maxSkip
518 } else {
519 nextS += s
520 }
521 if nextS > sLimit {
522 goto emitRemainder
523 }
524 hashL := hash8(cv, lTableBits)
525 hashS := hash4(cv, sTableBits)
526 candidateL := lTable[hashL]
527 candidateS := sTable[hashS]
528
529 score := func(m match) int {
530 // Matches that start further forward are penalized, since the bytes before them must be emitted as literals.
531 score := m.length - m.s
532 if nextEmit == m.s {
533 // If we do not have to emit literals, we save 1 byte
534 score++
535 }
536 offset := m.s - m.offset
537
538 return score - emitCopyNoRepeatSize(offset, m.length) // subtract the size accounted for the copy itself
539 }
540
541 matchAt := func(offset, s int, first uint32) match { // returns a zero-length match when the candidate is rejected
542 if best.length != 0 && best.s-best.offset == s-offset {
543 // Don't retest if we have the same offset.
544 return match{offset: offset, s: s}
545 }
546 if load32(src, offset) != first {
547 return match{offset: offset, s: s}
548 }
549 m := match{offset: offset, s: s, length: 4 + offset} // length temporarily holds the candidate-side read position
550 s += 4
551 for s <= sLimit {
552 if diff := load64(src, s) ^ load64(src, m.length); diff != 0 {
553 m.length += bits.TrailingZeros64(diff) >> 3 // number of whole matching low-order bytes
554 break
555 }
556 s += 8
557 m.length += 8
558 }
559 m.length -= offset // convert the read position back into a match length
560 m.score = score(m)
561 if m.score <= -m.s {
562 // Eliminate if no savings, we might find a better one.
563 m.length = 0
564 }
565 return m
566 }
567
568 bestOf := func(a, b match) match { // picks the better of two matches; zero-length matches always lose
569 if b.length == 0 {
570 return a
571 }
572 if a.length == 0 {
573 return b
574 }
575 as := a.score + b.s
576 bs := b.score + a.s
577 if as >= bs { // same as a.score-a.s >= b.score-b.s; ties keep a
578 return a
579 }
580 return b
581 }
582
583 best = bestOf(matchAt(getCur(candidateL), s, uint32(cv)), matchAt(getPrev(candidateL), s, uint32(cv)))
584 best = bestOf(best, matchAt(getCur(candidateS), s, uint32(cv)))
585 best = bestOf(best, matchAt(getPrev(candidateS), s, uint32(cv)))
586
587 {
588 best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8))) // try the last copy offset at s+1
589 if best.length > 0 {
590 // s+1
591 nextShort := sTable[hash4(cv>>8, sTableBits)]
592 s := s + 1 // shadows the outer s inside this block
593 cv := load64(src, s) // shadows the outer cv inside this block
594 nextLong := lTable[hash8(cv, lTableBits)]
595 best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
596 best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
597 best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
598 best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
599 // Repeat at + 2 (relative to the outer s, since the local s is already outer s+1)
600 best = bestOf(best, matchAt(s-repeat+1, s+1, uint32(cv>>8)))
601
602 // s+2
603 if true {
604 nextShort = sTable[hash4(cv>>8, sTableBits)]
605 s++
606 cv = load64(src, s)
607 nextLong = lTable[hash8(cv, lTableBits)]
608 best = bestOf(best, matchAt(getCur(nextShort), s, uint32(cv)))
609 best = bestOf(best, matchAt(getPrev(nextShort), s, uint32(cv)))
610 best = bestOf(best, matchAt(getCur(nextLong), s, uint32(cv)))
611 best = bestOf(best, matchAt(getPrev(nextLong), s, uint32(cv)))
612 }
613 // Search for a match at best match end, see if that is better.
614 if sAt := best.s + best.length; sAt < sLimit {
615 sBack := best.s
616 backL := best.length
617 // Load initial values
618 cv = load64(src, sBack)
619 // Look up the long-table entry for the bytes at the end of the best match
620 next := lTable[hash8(load64(src, sAt), lTableBits)]
621 //next := sTable[hash4(load64(src, sAt), sTableBits)]
622
623 if checkAt := getCur(next) - backL; checkAt > 0 { // step the table position back by the match length and test it against sBack
624 best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
625 }
626 if checkAt := getPrev(next) - backL; checkAt > 0 {
627 best = bestOf(best, matchAt(checkAt, sBack, uint32(cv)))
628 }
629 }
630 }
631 }
632
633 // Update table: s becomes the current position, the old current position moves to the high 32 bits
634 lTable[hashL] = uint64(s) | candidateL<<32
635 sTable[hashS] = uint64(s) | candidateS<<32
636
637 if best.length > 0 {
638 break
639 }
640
641 cv = load64(src, nextS)
642 s = nextS
643 }
644
645 // Extend backwards. Always done here (the condition is the constant true).
646 s = best.s
647 if true {
648 for best.offset > 0 && s > nextEmit && src[best.offset-1] == src[s-1] {
649 best.offset--
650 best.length++
651 s--
652 }
653 }
654 if false && best.offset >= s { // disabled sanity check (constant false)
655 panic(fmt.Errorf("t %d >= s %d", best.offset, s))
656 }
657 // Bail if we exceed the maximum size.
658 if d+(s-nextEmit) > dstLimit {
659 return 0
660 }
661
662 base := s
663 offset := s - best.offset // distance back from the match start to its source
664
665 s += best.length
666
667 if offset > 65535 && s-base <= 5 {
668 // Bail if the match is equal or worse to the encoding.
669 s = best.s + 1 // resume searching one byte past the rejected match start
670 if s >= sLimit {
671 goto emitRemainder
672 }
673 cv = load64(src, s)
674 continue
675 }
676 d += emitLiteral(dst[d:], src[nextEmit:base])
677 d += emitCopyNoRepeat(dst[d:], offset, best.length)
678 repeat = offset
679
680 nextEmit = s
681 if s >= sLimit {
682 goto emitRemainder
683 }
684
685 if d > dstLimit {
686 // Do we have space for more, if not bail.
687 return 0
688 }
689 // Fill tables with every position covered by the match, after best.s and before s.
690 for i := best.s + 1; i < s; i++ {
691 cv0 := load64(src, i)
692 long0 := hash8(cv0, lTableBits)
693 short0 := hash4(cv0, sTableBits)
694 lTable[long0] = uint64(i) | lTable[long0]<<32
695 sTable[short0] = uint64(i) | sTable[short0]<<32
696 }
697 cv = load64(src, s)
698 }
699
700emitRemainder:
701 if nextEmit < len(src) {
702 // Bail if we exceed the maximum size.
703 if d+len(src)-nextEmit > dstLimit {
704 return 0
705 }
706 d += emitLiteral(dst[d:], src[nextEmit:]) // everything not covered by a copy goes out as one trailing literal
707 }
708 return d
709}
710
711// emitCopySize returns the size to encode the offset+length
712//
713// It assumes that:
714//
715// 1 <= offset && offset <= math.MaxUint32
716// 4 <= length && length <= 1 << 24
717func emitCopySize(offset, length int) int {
718 if offset >= 65536 {
719 i := 0
720 if length > 64 {
721 length -= 64
722 if length >= 4 {
723 // Emit remaining as repeats
724 return 5 + emitRepeatSize(offset, length)
725 }
726 i = 5
727 }
728 if length == 0 {
729 return i
730 }
731 return i + 5
732 }
733
734 // Offset no more than 2 bytes.
735 if length > 64 {
736 if offset < 2048 {
737 // Emit 8 bytes, then rest as repeats...
738 return 2 + emitRepeatSize(offset, length-8)
739 }
740 // Emit remaining as repeats, at least 4 bytes remain.
741 return 3 + emitRepeatSize(offset, length-60)
742 }
743 if length >= 12 || offset >= 2048 {
744 return 3
745 }
746 // Emit the remaining copy, encoded as 2 bytes.
747 return 2
748}
749
// emitCopyNoRepeatSize reports the size used for a copy with the given offset
// and length when no repeat operations are used.
//
// It assumes that:
//
//	1 <= offset && offset <= math.MaxUint32
//	4 <= length && length <= 1 << 24
func emitCopyNoRepeatSize(offset, length int) int {
	switch {
	case offset >= 65536:
		// 5 bytes, plus 5 more for every full 64 bytes of length.
		return 5 + 5*(length/64)
	case length > 64:
		// Offset fits in 2 bytes: 3 bytes, plus 3 more for every full 60 bytes of length.
		return 3 + 3*(length/60)
	case length >= 12, offset >= 2048:
		return 3
	default:
		// Short copy with a small offset: 2 bytes.
		return 2
	}
}
772
// emitRepeatSize reports how many bytes are required to encode a repeat of
// the given length. offset only influences the shortest encodings.
// Length must be at least 4 and < 1<<24
func emitRepeatSize(offset, length int) int {
	const maxRepeat = (1 << 24) - 1

	// Repeat offset, make length cheaper
	switch {
	case length <= 4+4, length < 8+4 && offset < 2048:
		return 2
	case length < (1<<8)+4+4:
		return 3
	case length < (1<<16)+(1<<8)+4:
		return 4
	}

	remaining := length - ((1 << 16) - 4)
	if remaining <= maxRepeat {
		return 5
	}
	// Too long for a single 5-byte repeat: account for the overflow separately.
	return 5 + emitRepeatSize(offset, remaining-maxRepeat+4)
}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_better.go b/vendor/github.com/klauspost/compress/s2/encode_better.go
new file mode 100644
index 0000000..544cb1e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_better.go
@@ -0,0 +1,1106 @@
1// Copyright 2016 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6package s2
7
8import (
9 "bytes"
10 "fmt"
11 "math/bits"
12)
13
// hash4 hashes the lowest 4 bytes of u into a value that fits a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4(u uint64, h uint8) uint32 {
	const prime4bytes = 2654435761
	shift := (32 - h) & 31
	product := uint32(u) * prime4bytes
	return product >> shift
}
20
// hash5 hashes the lowest 5 bytes of u into a value that fits a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash5(u uint64, h uint8) uint32 {
	const prime5bytes = 889523592379
	// Shifting left by 24 bits drops the 3 high bytes, leaving only the low 5.
	const discard = 64 - 40
	shift := (64 - h) & 63
	mixed := (u << discard) * prime5bytes
	return uint32(mixed >> shift)
}
27
// hash7 hashes the lowest 7 bytes of u into a value that fits a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash7(u uint64, h uint8) uint32 {
	const prime7bytes = 58295818150454627
	// Shifting left by 8 bits drops the top byte, leaving only the low 7.
	const discard = 64 - 56
	shift := (64 - h) & 63
	mixed := (u << discard) * prime7bytes
	return uint32(mixed >> shift)
}
34
// hash8 hashes all 8 bytes of u into a value that fits a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash8(u uint64, h uint8) uint32 {
	const prime8bytes = 0xcf1bbcdcb7a56463
	shift := (64 - h) & 63
	mixed := u * prime8bytes
	return uint32(mixed >> shift)
}
41
42// encodeBlockBetterGo encodes a non-empty src to a guaranteed-large-enough dst. It
43// assumes that the varint-encoded length of the decompressed bytes has already
44// been written.
45// It returns the number of bytes written to dst, or 0 if src is shorter than minNonLiteralBlockSize or the output would exceed len(src) - len(src)>>5 - 6 bytes.
46// It also assumes that:
47//
48// len(dst) >= MaxEncodedLen(len(src)) &&
49// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
50func encodeBlockBetterGo(dst, src []byte) (d int) {
51 // sLimit is when to stop looking for offset/length copies. The inputMargin
52 // lets us use a fast path for emitLiteral in the main loop, while we are
53 // looking for copies.
54 sLimit := len(src) - inputMargin
55 if len(src) < minNonLiteralBlockSize {
56 return 0
57 }
58
59 // Initialize the hash tables.
60 const (
61 // Long hash matches.
62 lTableBits = 17
63 maxLTableSize = 1 << lTableBits
64
65 // Short hash matches.
66 sTableBits = 14
67 maxSTableSize = 1 << sTableBits
68 )
69
70 var lTable [maxLTableSize]uint32 // indexed by hash7; holds the last src position stored for that hash
71 var sTable [maxSTableSize]uint32 // indexed by hash4; holds the last src position stored for that hash
72
73 // Bail if we can't compress to at least this.
74 dstLimit := len(src) - len(src)>>5 - 6
75
76 // nextEmit is where in src the next emitLiteral should start from.
77 nextEmit := 0
78
79 // The encoded form must start with a literal, as there are no previous
80 // bytes to copy, so we start looking for hash matches at s == 1.
81 s := 1
82 cv := load64(src, s)
83
84 // We initialize repeat to 0, so we never match on first attempt
85 repeat := 0
86
87 for {
88 candidateL := 0
89 nextS := 0
90 for {
91 // Next src position to check: the step grows by one for every 128 bytes since the last emit.
92 nextS = s + (s-nextEmit)>>7 + 1
93 if nextS > sLimit {
94 goto emitRemainder
95 }
96 hashL := hash7(cv, lTableBits)
97 hashS := hash4(cv, sTableBits)
98 candidateL = int(lTable[hashL])
99 candidateS := int(sTable[hashS])
100 lTable[hashL] = uint32(s) // the candidates were read first, so they always refer to earlier table contents
101 sTable[hashS] = uint32(s)
102
103 valLong := load64(src, candidateL)
104 valShort := load64(src, candidateS)
105
106 // If long matches at least 8 bytes, use that.
107 if cv == valLong {
108 break
109 }
110 if cv == valShort {
111 candidateL = candidateS
112 break
113 }
114
115 // Check repeat at offset checkRep.
116 const checkRep = 1
117 // Minimum length of a repeat. Tested with various values.
118 // While 4-5 offers improvements in some, 6 reduces
119 // regressions significantly.
120 const wantRepeatBytes = 6
121 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
122 if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { // NOTE: the leading constant false disables this whole repeat-check branch
123 base := s + checkRep
124 // Extend back
125 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
126 i--
127 base--
128 }
129 d += emitLiteral(dst[d:], src[nextEmit:base])
130
131 // Extend forward
132 candidate := s - repeat + wantRepeatBytes + checkRep
133 s += wantRepeatBytes + checkRep
134 for s < len(src) {
135 if len(src)-s < 8 {
136 if src[s] == src[candidate] {
137 s++
138 candidate++
139 continue
140 }
141 break
142 }
143 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
144 s += bits.TrailingZeros64(diff) >> 3
145 break
146 }
147 s += 8
148 candidate += 8
149 }
150 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
151 d += emitRepeat(dst[d:], repeat, s-base)
152 nextEmit = s
153 if s >= sLimit {
154 goto emitRemainder
155 }
156 // Index in-between
157 index0 := base + 1
158 index1 := s - 2
159
160 for index0 < index1 {
161 cv0 := load64(src, index0)
162 cv1 := load64(src, index1)
163 lTable[hash7(cv0, lTableBits)] = uint32(index0)
164 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
165
166 lTable[hash7(cv1, lTableBits)] = uint32(index1)
167 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
168 index0 += 2
169 index1 -= 2
170 }
171
172 cv = load64(src, s)
173 continue
174 }
175
176 // Long likely matches 7, so take that.
177 if uint32(cv) == uint32(valLong) {
178 break
179 }
180
181 // Check our short candidate
182 if uint32(cv) == uint32(valShort) {
183 // Try a long candidate at s+1
184 hashL = hash7(cv>>8, lTableBits)
185 candidateL = int(lTable[hashL])
186 lTable[hashL] = uint32(s + 1)
187 if uint32(cv>>8) == load32(src, candidateL) {
188 s++ // the match now starts one byte later
189 break
190 }
191 // Use our short candidate.
192 candidateL = candidateS
193 break
194 }
195
196 cv = load64(src, nextS)
197 s = nextS
198 }
199
200 // Extend backwards, but never before nextEmit or the start of src
201 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
202 candidateL--
203 s--
204 }
205
206 // Bail if we exceed the maximum size.
207 if d+(s-nextEmit) > dstLimit {
208 return 0
209 }
210
211 base := s
212 offset := base - candidateL
213
214 // Extend the 4-byte match as long as possible.
215 s += 4
216 candidateL += 4
217 for s < len(src) {
218 if len(src)-s < 8 { // fewer than 8 bytes left: compare one byte at a time
219 if src[s] == src[candidateL] {
220 s++
221 candidateL++
222 continue
223 }
224 break
225 }
226 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
227 s += bits.TrailingZeros64(diff) >> 3 // number of whole matching low-order bytes
228 break
229 }
230 s += 8
231 candidateL += 8
232 }
233
234 if offset > 65535 && s-base <= 5 && repeat != offset {
235 // Bail if the match is equal or worse to the encoding.
236 s = nextS + 1
237 if s >= sLimit {
238 goto emitRemainder
239 }
240 cv = load64(src, s)
241 continue
242 }
243
244 d += emitLiteral(dst[d:], src[nextEmit:base])
245 if repeat == offset { // same offset as the previous copy: emit a repeat instead of a copy
246 d += emitRepeat(dst[d:], offset, s-base)
247 } else {
248 d += emitCopy(dst[d:], offset, s-base)
249 repeat = offset
250 }
251
252 nextEmit = s
253 if s >= sLimit {
254 goto emitRemainder
255 }
256
257 if d > dstLimit {
258 // Do we have space for more, if not bail.
259 return 0
260 }
261
262 // Index short & long at both ends of the match just emitted
263 index0 := base + 1
264 index1 := s - 2
265
266 cv0 := load64(src, index0)
267 cv1 := load64(src, index1)
268 lTable[hash7(cv0, lTableBits)] = uint32(index0)
269 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) // the short hash is taken one byte later than the long hash
270
271 // lTable could be postponed, but very minor difference.
272 lTable[hash7(cv1, lTableBits)] = uint32(index1)
273 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
274 index0 += 1
275 index1 -= 1
276 cv = load64(src, s)
277
278 // Index large values sparsely in between.
279 // We do two starting from different offsets for speed.
280 index2 := (index0 + index1 + 1) >> 1 // midpoint of the remaining range, rounded up
281 for index2 < index1 {
282 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
283 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
284 index0 += 2
285 index2 += 2
286 }
287 }
288
289emitRemainder:
290 if nextEmit < len(src) {
291 // Bail if we exceed the maximum size.
292 if d+len(src)-nextEmit > dstLimit {
293 return 0
294 }
295 d += emitLiteral(dst[d:], src[nextEmit:]) // everything not covered by a copy goes out as one trailing literal
296 }
297 return d
298}
299
300// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It
301// assumes that the varint-encoded length of the decompressed bytes has already
302// been written. Copies are written with emitCopyNoRepeat, so no repeat operations are emitted.
303// It returns the number of bytes written to dst, or 0 if src is shorter than minNonLiteralBlockSize or the output would exceed len(src) - len(src)>>5 - 6 bytes.
304// It also assumes that:
305//
306// len(dst) >= MaxEncodedLen(len(src)) &&
307// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
308func encodeBlockBetterSnappyGo(dst, src []byte) (d int) {
309 // sLimit is when to stop looking for offset/length copies. The inputMargin
310 // lets us use a fast path for emitLiteral in the main loop, while we are
311 // looking for copies.
312 sLimit := len(src) - inputMargin
313 if len(src) < minNonLiteralBlockSize {
314 return 0
315 }
316
317 // Initialize the hash tables.
318 const (
319 // Long hash matches.
320 lTableBits = 16
321 maxLTableSize = 1 << lTableBits
322
323 // Short hash matches.
324 sTableBits = 14
325 maxSTableSize = 1 << sTableBits
326 )
327
328 var lTable [maxLTableSize]uint32 // indexed by hash7; holds the last src position stored for that hash
329 var sTable [maxSTableSize]uint32 // indexed by hash4; holds the last src position stored for that hash
330
331 // Bail if we can't compress to at least this.
332 dstLimit := len(src) - len(src)>>5 - 6
333
334 // nextEmit is where in src the next emitLiteral should start from.
335 nextEmit := 0
336
337 // The encoded form must start with a literal, as there are no previous
338 // bytes to copy, so we start looking for hash matches at s == 1.
339 s := 1
340 cv := load64(src, s)
341
342 // repeat holds the offset of the last emitted copy (0 until the first copy); here it is only consulted by the short-match bail-out below.
343 repeat := 0
344 const maxSkip = 100
345
346 for {
347 candidateL := 0
348 nextS := 0
349 for {
350 // Next src position to check: the step grows by one for every 128 bytes since the last emit, capped at maxSkip.
351 nextS = (s-nextEmit)>>7 + 1
352 if nextS > maxSkip {
353 nextS = s + maxSkip
354 } else {
355 nextS += s
356 }
357
358 if nextS > sLimit {
359 goto emitRemainder
360 }
361 hashL := hash7(cv, lTableBits)
362 hashS := hash4(cv, sTableBits)
363 candidateL = int(lTable[hashL])
364 candidateS := int(sTable[hashS])
365 lTable[hashL] = uint32(s) // the candidates were read first, so they always refer to earlier table contents
366 sTable[hashS] = uint32(s)
367
368 if uint32(cv) == load32(src, candidateL) { // long candidate matches at least 4 bytes
369 break
370 }
371
372 // Check our short candidate
373 if uint32(cv) == load32(src, candidateS) {
374 // Try a long candidate at s+1
375 hashL = hash7(cv>>8, lTableBits)
376 candidateL = int(lTable[hashL])
377 lTable[hashL] = uint32(s + 1)
378 if uint32(cv>>8) == load32(src, candidateL) {
379 s++ // the match now starts one byte later
380 break
381 }
382 // Use our short candidate.
383 candidateL = candidateS
384 break
385 }
386
387 cv = load64(src, nextS)
388 s = nextS
389 }
390
391 // Extend backwards, but never before nextEmit or the start of src
392 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
393 candidateL--
394 s--
395 }
396
397 // Bail if we exceed the maximum size.
398 if d+(s-nextEmit) > dstLimit {
399 return 0
400 }
401
402 base := s
403 offset := base - candidateL
404
405 // Extend the 4-byte match as long as possible.
406 s += 4
407 candidateL += 4
408 for s < len(src) {
409 if len(src)-s < 8 { // fewer than 8 bytes left: compare one byte at a time
410 if src[s] == src[candidateL] {
411 s++
412 candidateL++
413 continue
414 }
415 break
416 }
417 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
418 s += bits.TrailingZeros64(diff) >> 3 // number of whole matching low-order bytes
419 break
420 }
421 s += 8
422 candidateL += 8
423 }
424
425 if offset > 65535 && s-base <= 5 && repeat != offset {
426 // Bail if the match is equal or worse to the encoding.
427 s = nextS + 1
428 if s >= sLimit {
429 goto emitRemainder
430 }
431 cv = load64(src, s)
432 continue
433 }
434
435 d += emitLiteral(dst[d:], src[nextEmit:base])
436 d += emitCopyNoRepeat(dst[d:], offset, s-base)
437 repeat = offset
438
439 nextEmit = s
440 if s >= sLimit {
441 goto emitRemainder
442 }
443
444 if d > dstLimit {
445 // Do we have space for more, if not bail.
446 return 0
447 }
448
449 // Index short & long at both ends of the match just emitted
450 index0 := base + 1
451 index1 := s - 2
452
453 cv0 := load64(src, index0)
454 cv1 := load64(src, index1)
455 lTable[hash7(cv0, lTableBits)] = uint32(index0)
456 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1) // the short hash is taken one byte later than the long hash
457
458 lTable[hash7(cv1, lTableBits)] = uint32(index1)
459 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
460 index0 += 1
461 index1 -= 1
462 cv = load64(src, s)
463
464 // Index large values sparsely in between.
465 // We do two starting from different offsets for speed.
466 index2 := (index0 + index1 + 1) >> 1 // midpoint of the remaining range, rounded up
467 for index2 < index1 {
468 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
469 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
470 index0 += 2
471 index2 += 2
472 }
473 }
474
475emitRemainder:
476 if nextEmit < len(src) {
477 // Bail if we exceed the maximum size.
478 if d+len(src)-nextEmit > dstLimit {
479 return 0
480 }
481 d += emitLiteral(dst[d:], src[nextEmit:]) // everything not covered by a copy goes out as one trailing literal
482 }
483 return d
484}
485
486// encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. It
487// assumes that the varint-encoded length of the decompressed bytes has already
488// been written.
489//
490// It also assumes that:
491//
492// len(dst) >= MaxEncodedLen(len(src)) &&
493// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
494func encodeBlockBetterDict(dst, src []byte, dict *Dict) (d int) {
495 // sLimit is when to stop looking for offset/length copies. The inputMargin
496 // lets us use a fast path for emitLiteral in the main loop, while we are
497 // looking for copies.
498 // Initialize the hash tables.
499 const (
500 // Long hash matches.
501 lTableBits = 17
502 maxLTableSize = 1 << lTableBits
503
504 // Short hash matches.
505 sTableBits = 14
506 maxSTableSize = 1 << sTableBits
507
508 maxAhead = 8 // maximum bytes ahead without checking sLimit
509
510 debug = false
511 )
512
513 sLimit := len(src) - inputMargin
514 if sLimit > MaxDictSrcOffset-maxAhead {
515 sLimit = MaxDictSrcOffset - maxAhead
516 }
517 if len(src) < minNonLiteralBlockSize {
518 return 0
519 }
520
521 dict.initBetter()
522
523 var lTable [maxLTableSize]uint32
524 var sTable [maxSTableSize]uint32
525
526 // Bail if we can't compress to at least this.
527 dstLimit := len(src) - len(src)>>5 - 6
528
529 // nextEmit is where in src the next emitLiteral should start from.
530 nextEmit := 0
531
532 // Unlike the non-dict encoders, the search starts at s == 0. At that position
533 // only the dict repeat is tried; matching against src itself begins once s != 0.
534 s := 0
535 cv := load64(src, s)
536
537 // Start with the repeat offset taken from the dictionary (len(dict.dict) - dict.repeat).
538 repeat := len(dict.dict) - dict.repeat
539
540 // While in dict
541searchDict:
542 for {
543 candidateL := 0
544 nextS := 0
545 for {
546 // Next src position to check
547 nextS = s + (s-nextEmit)>>7 + 1
548 if nextS > sLimit {
549 break searchDict
550 }
551 hashL := hash7(cv, lTableBits)
552 hashS := hash4(cv, sTableBits)
553 candidateL = int(lTable[hashL])
554 candidateS := int(sTable[hashS])
555 dictL := int(dict.betterTableLong[hashL])
556 dictS := int(dict.betterTableShort[hashS])
557 lTable[hashL] = uint32(s)
558 sTable[hashS] = uint32(s)
559
560 valLong := load64(src, candidateL)
561 valShort := load64(src, candidateS)
562
563 // If long matches at least 8 bytes, use that.
564 if s != 0 {
565 if cv == valLong {
566 goto emitMatch
567 }
568 if cv == valShort {
569 candidateL = candidateS
570 goto emitMatch
571 }
572 }
573
574 // Check dict repeat.
575 if repeat >= s+4 {
576 candidate := len(dict.dict) - repeat + s
577 if candidate > 0 && uint32(cv) == load32(dict.dict, candidate) {
578 // Extend back
579 base := s
580 for i := candidate; base > nextEmit && i > 0 && dict.dict[i-1] == src[base-1]; {
581 i--
582 base--
583 }
584 d += emitLiteral(dst[d:], src[nextEmit:base])
585 if debug && nextEmit != base {
586 fmt.Println("emitted ", base-nextEmit, "literals")
587 }
588 s += 4
589 candidate += 4
590 for candidate < len(dict.dict)-8 && s <= len(src)-8 {
591 if diff := load64(src, s) ^ load64(dict.dict, candidate); diff != 0 {
592 s += bits.TrailingZeros64(diff) >> 3
593 break
594 }
595 s += 8
596 candidate += 8
597 }
598 d += emitRepeat(dst[d:], repeat, s-base)
599 if debug {
600 fmt.Println("emitted dict repeat length", s-base, "offset:", repeat, "s:", s)
601 }
602 nextEmit = s
603 if s >= sLimit {
604 break searchDict
605 }
606 // Index in-between
607 index0 := base + 1
608 index1 := s - 2
609
610 cv = load64(src, s)
611 for index0 < index1 {
612 cv0 := load64(src, index0)
613 cv1 := load64(src, index1)
614 lTable[hash7(cv0, lTableBits)] = uint32(index0)
615 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
616
617 lTable[hash7(cv1, lTableBits)] = uint32(index1)
618 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
619 index0 += 2
620 index1 -= 2
621 }
622 continue
623 }
624 }
625 // Don't try to find match at s==0
626 if s == 0 {
627 cv = load64(src, nextS)
628 s = nextS
629 continue
630 }
631
632 // Long likely matches 7, so take that.
633 if uint32(cv) == uint32(valLong) {
634 goto emitMatch
635 }
636
637 // Long dict...
638 if uint32(cv) == load32(dict.dict, dictL) {
639 candidateL = dictL
640 goto emitDict
641 }
642
643 // Check our short candidate
644 if uint32(cv) == uint32(valShort) {
645 // Try a long candidate at s+1
646 hashL = hash7(cv>>8, lTableBits)
647 candidateL = int(lTable[hashL])
648 lTable[hashL] = uint32(s + 1)
649 if uint32(cv>>8) == load32(src, candidateL) {
650 s++
651 goto emitMatch
652 }
653 // Use our short candidate.
654 candidateL = candidateS
655 goto emitMatch
656 }
657 if uint32(cv) == load32(dict.dict, dictS) {
658 // Try a long candidate at s+1
659 hashL = hash7(cv>>8, lTableBits)
660 candidateL = int(lTable[hashL])
661 lTable[hashL] = uint32(s + 1)
662 if uint32(cv>>8) == load32(src, candidateL) {
663 s++
664 goto emitMatch
665 }
666 candidateL = dictS
667 goto emitDict
668 }
669 cv = load64(src, nextS)
670 s = nextS
671 }
672 emitDict:
673 {
674 if debug {
675 if load32(dict.dict, candidateL) != load32(src, s) {
676 panic("dict emit mismatch")
677 }
678 }
679 // Extend backwards.
680 // The top bytes will be rechecked to get the full match.
681 for candidateL > 0 && s > nextEmit && dict.dict[candidateL-1] == src[s-1] {
682 candidateL--
683 s--
684 }
685
686 // Bail if we exceed the maximum size.
687 if d+(s-nextEmit) > dstLimit {
688 return 0
689 }
690
691 // A 4-byte match has been found. We'll later see if more than 4 bytes
692 // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
693 // them as literal bytes.
694
695 d += emitLiteral(dst[d:], src[nextEmit:s])
696 if debug && nextEmit != s {
697 fmt.Println("emitted ", s-nextEmit, "literals")
698 }
699 {
700 // Invariant: we have a 4-byte match at s, and no need to emit any
701 // literal bytes prior to s.
702 base := s
703 offset := s + (len(dict.dict)) - candidateL
704
705 // Extend the 4-byte match as long as possible.
706 s += 4
707 candidateL += 4
708 for s <= len(src)-8 && len(dict.dict)-candidateL >= 8 {
709 if diff := load64(src, s) ^ load64(dict.dict, candidateL); diff != 0 {
710 s += bits.TrailingZeros64(diff) >> 3
711 break
712 }
713 s += 8
714 candidateL += 8
715 }
716
717 if repeat == offset {
718 if debug {
719 fmt.Println("emitted dict repeat, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
720 }
721 d += emitRepeat(dst[d:], offset, s-base)
722 } else {
723 if debug {
724 fmt.Println("emitted dict copy, length", s-base, "offset:", offset, "s:", s, "dict offset:", candidateL)
725 }
726 // Matches longer than 64 are split.
727 if s <= sLimit || s-base < 8 {
728 d += emitCopy(dst[d:], offset, s-base)
729 } else {
730 // Split to ensure we don't start a copy within next block.
731 d += emitCopy(dst[d:], offset, 4)
732 d += emitRepeat(dst[d:], offset, s-base-4)
733 }
734 repeat = offset
735 }
736 if false {
737 // Validate match.
738 if s <= candidateL {
739 panic("s <= candidate")
740 }
741 a := src[base:s]
742 b := dict.dict[base-repeat : base-repeat+(s-base)]
743 if !bytes.Equal(a, b) {
744 panic("mismatch")
745 }
746 }
747
748 nextEmit = s
749 if s >= sLimit {
750 break searchDict
751 }
752
753 if d > dstLimit {
754 // Do we have space for more, if not bail.
755 return 0
756 }
757
758 // Index short & long
759 index0 := base + 1
760 index1 := s - 2
761
762 cv0 := load64(src, index0)
763 cv1 := load64(src, index1)
764 lTable[hash7(cv0, lTableBits)] = uint32(index0)
765 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
766
767 lTable[hash7(cv1, lTableBits)] = uint32(index1)
768 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
769 index0 += 1
770 index1 -= 1
771 cv = load64(src, s)
772
773 // index every second long in between.
774 for index0 < index1 {
775 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
776 lTable[hash7(load64(src, index1), lTableBits)] = uint32(index1)
777 index0 += 2
778 index1 -= 2
779 }
780 }
781 continue
782 }
783 emitMatch:
784
785 // Extend backwards
786 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
787 candidateL--
788 s--
789 }
790
791 // Bail if we exceed the maximum size.
792 if d+(s-nextEmit) > dstLimit {
793 return 0
794 }
795
796 base := s
797 offset := base - candidateL
798
799 // Extend the 4-byte match as long as possible.
800 s += 4
801 candidateL += 4
802 for s < len(src) {
803 if len(src)-s < 8 {
804 if src[s] == src[candidateL] {
805 s++
806 candidateL++
807 continue
808 }
809 break
810 }
811 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
812 s += bits.TrailingZeros64(diff) >> 3
813 break
814 }
815 s += 8
816 candidateL += 8
817 }
818
819 if offset > 65535 && s-base <= 5 && repeat != offset {
820 // Bail if the match is equal or worse to the encoding.
821 s = nextS + 1
822 if s >= sLimit {
823 goto emitRemainder
824 }
825 cv = load64(src, s)
826 continue
827 }
828
829 d += emitLiteral(dst[d:], src[nextEmit:base])
830 if debug && nextEmit != s {
831 fmt.Println("emitted ", s-nextEmit, "literals")
832 }
833 if repeat == offset {
834 if debug {
835 fmt.Println("emitted match repeat, length", s-base, "offset:", offset, "s:", s)
836 }
837 d += emitRepeat(dst[d:], offset, s-base)
838 } else {
839 if debug {
840 fmt.Println("emitted match copy, length", s-base, "offset:", offset, "s:", s)
841 }
842 d += emitCopy(dst[d:], offset, s-base)
843 repeat = offset
844 }
845
846 nextEmit = s
847 if s >= sLimit {
848 goto emitRemainder
849 }
850
851 if d > dstLimit {
852 // Do we have space for more, if not bail.
853 return 0
854 }
855
856 // Index short & long
857 index0 := base + 1
858 index1 := s - 2
859
860 cv0 := load64(src, index0)
861 cv1 := load64(src, index1)
862 lTable[hash7(cv0, lTableBits)] = uint32(index0)
863 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
864
865 lTable[hash7(cv1, lTableBits)] = uint32(index1)
866 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
867 index0 += 1
868 index1 -= 1
869 cv = load64(src, s)
870
871 // Index large values sparsely in between.
872 // We do two starting from different offsets for speed.
873 index2 := (index0 + index1 + 1) >> 1
874 for index2 < index1 {
875 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
876 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
877 index0 += 2
878 index2 += 2
879 }
880 }
881
882 // Search without dict:
883 if repeat > s {
884 repeat = 0
885 }
886
887 // No more dict
888 sLimit = len(src) - inputMargin
889 if s >= sLimit {
890 goto emitRemainder
891 }
892 cv = load64(src, s)
893 if debug {
894 fmt.Println("now", s, "->", sLimit, "out:", d, "left:", len(src)-s, "nextemit:", nextEmit, "dstLimit:", dstLimit, "s:", s)
895 }
896 for {
897 candidateL := 0
898 nextS := 0
899 for {
900 // Next src position to check
901 nextS = s + (s-nextEmit)>>7 + 1
902 if nextS > sLimit {
903 goto emitRemainder
904 }
905 hashL := hash7(cv, lTableBits)
906 hashS := hash4(cv, sTableBits)
907 candidateL = int(lTable[hashL])
908 candidateS := int(sTable[hashS])
909 lTable[hashL] = uint32(s)
910 sTable[hashS] = uint32(s)
911
912 valLong := load64(src, candidateL)
913 valShort := load64(src, candidateS)
914
915 // If long matches at least 8 bytes, use that.
916 if cv == valLong {
917 break
918 }
919 if cv == valShort {
920 candidateL = candidateS
921 break
922 }
923
924 // Check repeat at offset checkRep.
925 const checkRep = 1
926 // Minimum length of a repeat. Tested with various values.
927 // While 4-5 offers improvements in some, 6 reduces
928 // regressions significantly.
929 const wantRepeatBytes = 6
930 const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep)
931 if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask {
932 base := s + checkRep
933 // Extend back
934 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
935 i--
936 base--
937 }
938 d += emitLiteral(dst[d:], src[nextEmit:base])
939
940 // Extend forward
941 candidate := s - repeat + wantRepeatBytes + checkRep
942 s += wantRepeatBytes + checkRep
943 for s < len(src) {
944 if len(src)-s < 8 {
945 if src[s] == src[candidate] {
946 s++
947 candidate++
948 continue
949 }
950 break
951 }
952 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
953 s += bits.TrailingZeros64(diff) >> 3
954 break
955 }
956 s += 8
957 candidate += 8
958 }
959 // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset.
960 d += emitRepeat(dst[d:], repeat, s-base)
961 nextEmit = s
962 if s >= sLimit {
963 goto emitRemainder
964 }
965 // Index in-between
966 index0 := base + 1
967 index1 := s - 2
968
969 for index0 < index1 {
970 cv0 := load64(src, index0)
971 cv1 := load64(src, index1)
972 lTable[hash7(cv0, lTableBits)] = uint32(index0)
973 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
974
975 lTable[hash7(cv1, lTableBits)] = uint32(index1)
976 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
977 index0 += 2
978 index1 -= 2
979 }
980
981 cv = load64(src, s)
982 continue
983 }
984
985 // Long likely matches 7, so take that.
986 if uint32(cv) == uint32(valLong) {
987 break
988 }
989
990 // Check our short candidate
991 if uint32(cv) == uint32(valShort) {
992 // Try a long candidate at s+1
993 hashL = hash7(cv>>8, lTableBits)
994 candidateL = int(lTable[hashL])
995 lTable[hashL] = uint32(s + 1)
996 if uint32(cv>>8) == load32(src, candidateL) {
997 s++
998 break
999 }
1000 // Use our short candidate.
1001 candidateL = candidateS
1002 break
1003 }
1004
1005 cv = load64(src, nextS)
1006 s = nextS
1007 }
1008
1009 // Extend backwards
1010 for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] {
1011 candidateL--
1012 s--
1013 }
1014
1015 // Bail if we exceed the maximum size.
1016 if d+(s-nextEmit) > dstLimit {
1017 return 0
1018 }
1019
1020 base := s
1021 offset := base - candidateL
1022
1023 // Extend the 4-byte match as long as possible.
1024 s += 4
1025 candidateL += 4
1026 for s < len(src) {
1027 if len(src)-s < 8 {
1028 if src[s] == src[candidateL] {
1029 s++
1030 candidateL++
1031 continue
1032 }
1033 break
1034 }
1035 if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 {
1036 s += bits.TrailingZeros64(diff) >> 3
1037 break
1038 }
1039 s += 8
1040 candidateL += 8
1041 }
1042
1043 if offset > 65535 && s-base <= 5 && repeat != offset {
1044 // Bail if the match is equal or worse to the encoding.
1045 s = nextS + 1
1046 if s >= sLimit {
1047 goto emitRemainder
1048 }
1049 cv = load64(src, s)
1050 continue
1051 }
1052
1053 d += emitLiteral(dst[d:], src[nextEmit:base])
1054 if repeat == offset {
1055 d += emitRepeat(dst[d:], offset, s-base)
1056 } else {
1057 d += emitCopy(dst[d:], offset, s-base)
1058 repeat = offset
1059 }
1060
1061 nextEmit = s
1062 if s >= sLimit {
1063 goto emitRemainder
1064 }
1065
1066 if d > dstLimit {
1067 // Do we have space for more, if not bail.
1068 return 0
1069 }
1070
1071 // Index short & long
1072 index0 := base + 1
1073 index1 := s - 2
1074
1075 cv0 := load64(src, index0)
1076 cv1 := load64(src, index1)
1077 lTable[hash7(cv0, lTableBits)] = uint32(index0)
1078 sTable[hash4(cv0>>8, sTableBits)] = uint32(index0 + 1)
1079
1080 lTable[hash7(cv1, lTableBits)] = uint32(index1)
1081 sTable[hash4(cv1>>8, sTableBits)] = uint32(index1 + 1)
1082 index0 += 1
1083 index1 -= 1
1084 cv = load64(src, s)
1085
1086 // Index large values sparsely in between.
1087 // We do two starting from different offsets for speed.
1088 index2 := (index0 + index1 + 1) >> 1
1089 for index2 < index1 {
1090 lTable[hash7(load64(src, index0), lTableBits)] = uint32(index0)
1091 lTable[hash7(load64(src, index2), lTableBits)] = uint32(index2)
1092 index0 += 2
1093 index2 += 2
1094 }
1095 }
1096
1097emitRemainder:
1098 if nextEmit < len(src) {
1099 // Bail if we exceed the maximum size.
1100 if d+len(src)-nextEmit > dstLimit {
1101 return 0
1102 }
1103 d += emitLiteral(dst[d:], src[nextEmit:])
1104 }
1105 return d
1106}
diff --git a/vendor/github.com/klauspost/compress/s2/encode_go.go b/vendor/github.com/klauspost/compress/s2/encode_go.go
new file mode 100644
index 0000000..6b393c3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encode_go.go
@@ -0,0 +1,729 @@
1//go:build !amd64 || appengine || !gc || noasm
2// +build !amd64 appengine !gc noasm
3
4package s2
5
6import (
7 "bytes"
8 "math/bits"
9)
10
11const hasAmd64Asm = false
12
13// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
14// assumes that the varint-encoded length of the decompressed bytes has already
15// been written.
16//
17// It also assumes that:
18//
19// len(dst) >= MaxEncodedLen(len(src))
20func encodeBlock(dst, src []byte) (d int) {
21 if len(src) < minNonLiteralBlockSize {
22 return 0
23 }
24 return encodeBlockGo(dst, src)
25}
26
27// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
28// assumes that the varint-encoded length of the decompressed bytes has already
29// been written.
30//
31// It also assumes that:
32//
33// len(dst) >= MaxEncodedLen(len(src))
34func encodeBlockBetter(dst, src []byte) (d int) {
35 return encodeBlockBetterGo(dst, src)
36}
37
38// encodeBlockBetter encodes a non-empty src to a guaranteed-large-enough dst. It
39// assumes that the varint-encoded length of the decompressed bytes has already
40// been written.
41//
42// It also assumes that:
43//
44// len(dst) >= MaxEncodedLen(len(src))
45func encodeBlockBetterSnappy(dst, src []byte) (d int) {
46 return encodeBlockBetterSnappyGo(dst, src)
47}
48
49// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
50// assumes that the varint-encoded length of the decompressed bytes has already
51// been written.
52//
53// It also assumes that:
54//
55// len(dst) >= MaxEncodedLen(len(src))
56func encodeBlockSnappy(dst, src []byte) (d int) {
57 if len(src) < minNonLiteralBlockSize {
58 return 0
59 }
60 return encodeBlockSnappyGo(dst, src)
61}
62
63// emitLiteral writes a literal chunk and returns the number of bytes written.
64//
65// It assumes that:
66//
67// dst is long enough to hold the encoded bytes
68// 0 <= len(lit) && len(lit) <= math.MaxUint32
69func emitLiteral(dst, lit []byte) int {
70 if len(lit) == 0 {
71 return 0
72 }
73 const num = 63<<2 | tagLiteral
74 i, n := 0, uint(len(lit)-1)
75 switch {
76 case n < 60:
77 dst[0] = uint8(n)<<2 | tagLiteral
78 i = 1
79 case n < 1<<8:
80 dst[1] = uint8(n)
81 dst[0] = 60<<2 | tagLiteral
82 i = 2
83 case n < 1<<16:
84 dst[2] = uint8(n >> 8)
85 dst[1] = uint8(n)
86 dst[0] = 61<<2 | tagLiteral
87 i = 3
88 case n < 1<<24:
89 dst[3] = uint8(n >> 16)
90 dst[2] = uint8(n >> 8)
91 dst[1] = uint8(n)
92 dst[0] = 62<<2 | tagLiteral
93 i = 4
94 default:
95 dst[4] = uint8(n >> 24)
96 dst[3] = uint8(n >> 16)
97 dst[2] = uint8(n >> 8)
98 dst[1] = uint8(n)
99 dst[0] = 63<<2 | tagLiteral
100 i = 5
101 }
102 return i + copy(dst[i:], lit)
103}
104
105// emitRepeat writes a repeat chunk and returns the number of bytes written.
106// Length must be at least 4 and < 1<<24
107func emitRepeat(dst []byte, offset, length int) int {
108 // Repeat offset, make length cheaper
109 length -= 4
110 if length <= 4 {
111 dst[0] = uint8(length)<<2 | tagCopy1
112 dst[1] = 0
113 return 2
114 }
115 if length < 8 && offset < 2048 {
116 // Encode WITH offset
117 dst[1] = uint8(offset)
118 dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
119 return 2
120 }
121 if length < (1<<8)+4 {
122 length -= 4
123 dst[2] = uint8(length)
124 dst[1] = 0
125 dst[0] = 5<<2 | tagCopy1
126 return 3
127 }
128 if length < (1<<16)+(1<<8) {
129 length -= 1 << 8
130 dst[3] = uint8(length >> 8)
131 dst[2] = uint8(length >> 0)
132 dst[1] = 0
133 dst[0] = 6<<2 | tagCopy1
134 return 4
135 }
136 const maxRepeat = (1 << 24) - 1
137 length -= 1 << 16
138 left := 0
139 if length > maxRepeat {
140 left = length - maxRepeat + 4
141 length = maxRepeat - 4
142 }
143 dst[4] = uint8(length >> 16)
144 dst[3] = uint8(length >> 8)
145 dst[2] = uint8(length >> 0)
146 dst[1] = 0
147 dst[0] = 7<<2 | tagCopy1
148 if left > 0 {
149 return 5 + emitRepeat(dst[5:], offset, left)
150 }
151 return 5
152}
153
154// emitCopy writes a copy chunk and returns the number of bytes written.
155//
156// It assumes that:
157//
158// dst is long enough to hold the encoded bytes
159// 1 <= offset && offset <= math.MaxUint32
160// 4 <= length && length <= 1 << 24
161func emitCopy(dst []byte, offset, length int) int {
162 if offset >= 65536 {
163 i := 0
164 if length > 64 {
165 // Emit a length 64 copy, encoded as 5 bytes.
166 dst[4] = uint8(offset >> 24)
167 dst[3] = uint8(offset >> 16)
168 dst[2] = uint8(offset >> 8)
169 dst[1] = uint8(offset)
170 dst[0] = 63<<2 | tagCopy4
171 length -= 64
172 if length >= 4 {
173 // Emit remaining as repeats
174 return 5 + emitRepeat(dst[5:], offset, length)
175 }
176 i = 5
177 }
178 if length == 0 {
179 return i
180 }
181 // Emit a copy, offset encoded as 4 bytes.
182 dst[i+0] = uint8(length-1)<<2 | tagCopy4
183 dst[i+1] = uint8(offset)
184 dst[i+2] = uint8(offset >> 8)
185 dst[i+3] = uint8(offset >> 16)
186 dst[i+4] = uint8(offset >> 24)
187 return i + 5
188 }
189
190 // Offset no more than 2 bytes.
191 if length > 64 {
192 off := 3
193 if offset < 2048 {
194 // emit 8 bytes as tagCopy1, rest as repeats.
195 dst[1] = uint8(offset)
196 dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
197 length -= 8
198 off = 2
199 } else {
200 // Emit a length 60 copy, encoded as 3 bytes.
201 // Emit remaining as repeat value (minimum 4 bytes).
202 dst[2] = uint8(offset >> 8)
203 dst[1] = uint8(offset)
204 dst[0] = 59<<2 | tagCopy2
205 length -= 60
206 }
207 // Emit remaining as repeats, at least 4 bytes remain.
208 return off + emitRepeat(dst[off:], offset, length)
209 }
210 if length >= 12 || offset >= 2048 {
211 // Emit the remaining copy, encoded as 3 bytes.
212 dst[2] = uint8(offset >> 8)
213 dst[1] = uint8(offset)
214 dst[0] = uint8(length-1)<<2 | tagCopy2
215 return 3
216 }
217 // Emit the remaining copy, encoded as 2 bytes.
218 dst[1] = uint8(offset)
219 dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
220 return 2
221}
222
223// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
224//
225// It assumes that:
226//
227// dst is long enough to hold the encoded bytes
228// 1 <= offset && offset <= math.MaxUint32
229// 4 <= length && length <= 1 << 24
230func emitCopyNoRepeat(dst []byte, offset, length int) int {
231 if offset >= 65536 {
232 i := 0
233 if length > 64 {
234 // Emit a length 64 copy, encoded as 5 bytes.
235 dst[4] = uint8(offset >> 24)
236 dst[3] = uint8(offset >> 16)
237 dst[2] = uint8(offset >> 8)
238 dst[1] = uint8(offset)
239 dst[0] = 63<<2 | tagCopy4
240 length -= 64
241 if length >= 4 {
242 // Emit remaining as repeats
243 return 5 + emitCopyNoRepeat(dst[5:], offset, length)
244 }
245 i = 5
246 }
247 if length == 0 {
248 return i
249 }
250 // Emit a copy, offset encoded as 4 bytes.
251 dst[i+0] = uint8(length-1)<<2 | tagCopy4
252 dst[i+1] = uint8(offset)
253 dst[i+2] = uint8(offset >> 8)
254 dst[i+3] = uint8(offset >> 16)
255 dst[i+4] = uint8(offset >> 24)
256 return i + 5
257 }
258
259 // Offset no more than 2 bytes.
260 if length > 64 {
261 // Emit a length 60 copy, encoded as 3 bytes.
262 // Emit remaining as repeat value (minimum 4 bytes).
263 dst[2] = uint8(offset >> 8)
264 dst[1] = uint8(offset)
265 dst[0] = 59<<2 | tagCopy2
266 length -= 60
267 // Emit remaining as repeats, at least 4 bytes remain.
268 return 3 + emitCopyNoRepeat(dst[3:], offset, length)
269 }
270 if length >= 12 || offset >= 2048 {
271 // Emit the remaining copy, encoded as 3 bytes.
272 dst[2] = uint8(offset >> 8)
273 dst[1] = uint8(offset)
274 dst[0] = uint8(length-1)<<2 | tagCopy2
275 return 3
276 }
277 // Emit the remaining copy, encoded as 2 bytes.
278 dst[1] = uint8(offset)
279 dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
280 return 2
281}
282
283// matchLen returns how many bytes match in a and b
284//
285// It assumes that:
286//
287// len(a) <= len(b)
288func matchLen(a []byte, b []byte) int {
289 b = b[:len(a)]
290 var checked int
291 if len(a) > 4 {
292 // Try 4 bytes first
293 if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
294 return bits.TrailingZeros32(diff) >> 3
295 }
296 // Switch to 8 byte matching.
297 checked = 4
298 a = a[4:]
299 b = b[4:]
300 for len(a) >= 8 {
301 b = b[:len(a)]
302 if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
303 return checked + (bits.TrailingZeros64(diff) >> 3)
304 }
305 checked += 8
306 a = a[8:]
307 b = b[8:]
308 }
309 }
310 b = b[:len(a)]
311 for i := range a {
312 if a[i] != b[i] {
313 return int(i) + checked
314 }
315 }
316 return len(a) + checked
317}
318
// calcBlockSize computes a size d for src without writing any output.
//
// It searches src for matches using a hash table of 1<<13 entries and adds
// up what emitLiteralSize and emitCopyNoRepeatSize report for the literals
// and copies it finds. It returns 0 as soon as the running total exceeds
// dstLimit, which is len(src) - len(src)>>5 - 5.
//
319// input must be > inputMargin
320func calcBlockSize(src []byte) (d int) {
321 // Initialize the hash table.
322 const (
323 tableBits = 13
324 maxTableSize = 1 << tableBits
325 )
326
327 var table [maxTableSize]uint32
328
329 // sLimit is when to stop looking for offset/length copies. The inputMargin
330 // lets us use a fast path for emitLiteral in the main loop, while we are
331 // looking for copies.
332 sLimit := len(src) - inputMargin
333
334 // Bail if we can't compress to at least this.
335 dstLimit := len(src) - len(src)>>5 - 5
336
337 // nextEmit is where in src the next emitLiteral should start from.
338 nextEmit := 0
339
340 // The encoded form must start with a literal, as there are no previous
341 // bytes to copy, so we start looking for hash matches at s == 1.
342 s := 1
343 cv := load64(src, s)
344
345 // We search for a repeat at -1, but don't output repeats when nextEmit == 0
346 repeat := 1
347
348 for {
349 candidate := 0
350 for {
351 // Next src position to check
352 nextS := s + (s-nextEmit)>>6 + 4
353 if nextS > sLimit {
354 goto emitRemainder
355 }
 // Look up positions s and s+1 and store them in the table.
356 hash0 := hash6(cv, tableBits)
357 hash1 := hash6(cv>>8, tableBits)
358 candidate = int(table[hash0])
359 candidate2 := int(table[hash1])
360 table[hash0] = uint32(s)
361 table[hash1] = uint32(s + 1)
362 hash2 := hash6(cv>>16, tableBits)
363
364 // Check repeat at offset checkRep.
365 const checkRep = 1
366 if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
367 base := s + checkRep
368 // Extend back
369 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
370 i--
371 base--
372 }
373 d += emitLiteralSize(src[nextEmit:base])
374
375 // Extend forward
376 candidate := s - repeat + 4 + checkRep
377 s += 4 + checkRep
378 for s <= sLimit {
379 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
380 s += bits.TrailingZeros64(diff) >> 3
381 break
382 }
383 s += 8
384 candidate += 8
385 }
386
387 d += emitCopyNoRepeatSize(repeat, s-base)
388 nextEmit = s
389 if s >= sLimit {
390 goto emitRemainder
391 }
392
393 cv = load64(src, s)
394 continue
395 }
396
 // Try the table candidates for s, s+1 and s+2 in turn.
397 if uint32(cv) == load32(src, candidate) {
398 break
399 }
400 candidate = int(table[hash2])
401 if uint32(cv>>8) == load32(src, candidate2) {
402 table[hash2] = uint32(s + 2)
403 candidate = candidate2
404 s++
405 break
406 }
407 table[hash2] = uint32(s + 2)
408 if uint32(cv>>16) == load32(src, candidate) {
409 s += 2
410 break
411 }
412
413 cv = load64(src, nextS)
414 s = nextS
415 }
416
417 // Extend backwards
418 for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
419 candidate--
420 s--
421 }
422
423 // Bail if we exceed the maximum size.
424 if d+(s-nextEmit) > dstLimit {
425 return 0
426 }
427
428 // A 4-byte match has been found. We'll later see if more than 4 bytes
429 // match. But, prior to the match, src[nextEmit:s] are unmatched. Count
430 // them as literal bytes.
431
432 d += emitLiteralSize(src[nextEmit:s])
433
434 // Account for the copy, and then see if another copy could be our next
435 // move. Repeat until we find no match for the input immediately after
436 // what was consumed by the last copy.
437 //
438 // If we exit this loop normally then a literal has to be counted next,
439 // though we don't yet know how big the literal will be. We handle that
440 // by proceeding to the next iteration of the main loop. We also can
441 // exit this loop via goto if we get close to exhausting the input.
442 for {
443 // Invariant: we have a 4-byte match at s, and no need to emit any
444 // literal bytes prior to s.
445 base := s
 // Remember this offset; the repeat check in the search loop uses it.
446 repeat = base - candidate
447
448 // Extend the 4-byte match as long as possible.
449 s += 4
450 candidate += 4
451 for s <= len(src)-8 {
452 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
453 s += bits.TrailingZeros64(diff) >> 3
454 break
455 }
456 s += 8
457 candidate += 8
458 }
459
460 d += emitCopyNoRepeatSize(repeat, s-base)
461 if false {
462 // Validate match.
463 a := src[base:s]
464 b := src[base-repeat : base-repeat+(s-base)]
465 if !bytes.Equal(a, b) {
466 panic("mismatch")
467 }
468 }
469
470 nextEmit = s
471 if s >= sLimit {
472 goto emitRemainder
473 }
474
475 if d > dstLimit {
476 // Do we have space for more, if not bail.
477 return 0
478 }
479 // Check for an immediate match, otherwise start search at s+1
480 x := load64(src, s-2)
481 m2Hash := hash6(x, tableBits)
482 currHash := hash6(x>>16, tableBits)
483 candidate = int(table[currHash])
484 table[m2Hash] = uint32(s - 2)
485 table[currHash] = uint32(s)
486 if uint32(x>>16) != load32(src, candidate) {
487 cv = load64(src, s+1)
488 s++
489 break
490 }
491 }
492 }
493
494emitRemainder:
 // Everything from nextEmit to the end of src is counted as one literal.
495 if nextEmit < len(src) {
496 // Bail if we exceed the maximum size.
497 if d+len(src)-nextEmit > dstLimit {
498 return 0
499 }
500 d += emitLiteralSize(src[nextEmit:])
501 }
502 return d
503}
504
// calcBlockSizeSmall computes a size d for src without writing any output.
//
// It searches src for matches using a hash table of 1<<9 entries and adds
// up what emitLiteralSize and emitCopyNoRepeatSize report for the literals
// and copies it finds. It returns 0 as soon as the running total exceeds
// dstLimit, which is len(src) - len(src)>>5 - 5.
//
505// length must be > inputMargin.
506func calcBlockSizeSmall(src []byte) (d int) {
507 // Initialize the hash table.
508 const (
509 tableBits = 9
510 maxTableSize = 1 << tableBits
511 )
512
513 var table [maxTableSize]uint32
514
515 // sLimit is when to stop looking for offset/length copies. The inputMargin
516 // lets us use a fast path for emitLiteral in the main loop, while we are
517 // looking for copies.
518 sLimit := len(src) - inputMargin
519
520 // Bail if we can't compress to at least this.
521 dstLimit := len(src) - len(src)>>5 - 5
522
523 // nextEmit is where in src the next emitLiteral should start from.
524 nextEmit := 0
525
526 // The encoded form must start with a literal, as there are no previous
527 // bytes to copy, so we start looking for hash matches at s == 1.
528 s := 1
529 cv := load64(src, s)
530
531 // We search for a repeat at -1, but don't output repeats when nextEmit == 0
532 repeat := 1
533
534 for {
535 candidate := 0
536 for {
537 // Next src position to check
538 nextS := s + (s-nextEmit)>>6 + 4
539 if nextS > sLimit {
540 goto emitRemainder
541 }
 // Look up positions s and s+1 and store them in the table.
542 hash0 := hash6(cv, tableBits)
543 hash1 := hash6(cv>>8, tableBits)
544 candidate = int(table[hash0])
545 candidate2 := int(table[hash1])
546 table[hash0] = uint32(s)
547 table[hash1] = uint32(s + 1)
548 hash2 := hash6(cv>>16, tableBits)
549
550 // Check repeat at offset checkRep.
551 const checkRep = 1
552 if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) {
553 base := s + checkRep
554 // Extend back
555 for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; {
556 i--
557 base--
558 }
559 d += emitLiteralSize(src[nextEmit:base])
560
561 // Extend forward
562 candidate := s - repeat + 4 + checkRep
563 s += 4 + checkRep
564 for s <= sLimit {
565 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
566 s += bits.TrailingZeros64(diff) >> 3
567 break
568 }
569 s += 8
570 candidate += 8
571 }
572
573 d += emitCopyNoRepeatSize(repeat, s-base)
574 nextEmit = s
575 if s >= sLimit {
576 goto emitRemainder
577 }
578
579 cv = load64(src, s)
580 continue
581 }
582
 // Try the table candidates for s, s+1 and s+2 in turn.
583 if uint32(cv) == load32(src, candidate) {
584 break
585 }
586 candidate = int(table[hash2])
587 if uint32(cv>>8) == load32(src, candidate2) {
588 table[hash2] = uint32(s + 2)
589 candidate = candidate2
590 s++
591 break
592 }
593 table[hash2] = uint32(s + 2)
594 if uint32(cv>>16) == load32(src, candidate) {
595 s += 2
596 break
597 }
598
599 cv = load64(src, nextS)
600 s = nextS
601 }
602
603 // Extend backwards
604 for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] {
605 candidate--
606 s--
607 }
608
609 // Bail if we exceed the maximum size.
610 if d+(s-nextEmit) > dstLimit {
611 return 0
612 }
613
614 // A 4-byte match has been found. We'll later see if more than 4 bytes
615 // match. But, prior to the match, src[nextEmit:s] are unmatched. Count
616 // them as literal bytes.
617
618 d += emitLiteralSize(src[nextEmit:s])
619
620 // Account for the copy, and then see if another copy could be our next
621 // move. Repeat until we find no match for the input immediately after
622 // what was consumed by the last copy.
623 //
624 // If we exit this loop normally then a literal has to be counted next,
625 // though we don't yet know how big the literal will be. We handle that
626 // by proceeding to the next iteration of the main loop. We also can
627 // exit this loop via goto if we get close to exhausting the input.
628 for {
629 // Invariant: we have a 4-byte match at s, and no need to emit any
630 // literal bytes prior to s.
631 base := s
 // Remember this offset; the repeat check in the search loop uses it.
632 repeat = base - candidate
633
634 // Extend the 4-byte match as long as possible.
635 s += 4
636 candidate += 4
637 for s <= len(src)-8 {
638 if diff := load64(src, s) ^ load64(src, candidate); diff != 0 {
639 s += bits.TrailingZeros64(diff) >> 3
640 break
641 }
642 s += 8
643 candidate += 8
644 }
645
646 d += emitCopyNoRepeatSize(repeat, s-base)
647 if false {
648 // Validate match.
649 a := src[base:s]
650 b := src[base-repeat : base-repeat+(s-base)]
651 if !bytes.Equal(a, b) {
652 panic("mismatch")
653 }
654 }
655
656 nextEmit = s
657 if s >= sLimit {
658 goto emitRemainder
659 }
660
661 if d > dstLimit {
662 // Do we have space for more, if not bail.
663 return 0
664 }
665 // Check for an immediate match, otherwise start search at s+1
666 x := load64(src, s-2)
667 m2Hash := hash6(x, tableBits)
668 currHash := hash6(x>>16, tableBits)
669 candidate = int(table[currHash])
670 table[m2Hash] = uint32(s - 2)
671 table[currHash] = uint32(s)
672 if uint32(x>>16) != load32(src, candidate) {
673 cv = load64(src, s+1)
674 s++
675 break
676 }
677 }
678 }
679
680emitRemainder:
 // Everything from nextEmit to the end of src is counted as one literal.
681 if nextEmit < len(src) {
682 // Bail if we exceed the maximum size.
683 if d+len(src)-nextEmit > dstLimit {
684 return 0
685 }
686 d += emitLiteralSize(src[nextEmit:])
687 }
688 return d
689}
690
// emitLiteralSize returns the number of bytes a literal chunk holding lit
// occupies: len(lit) plus a header of 1 to 5 bytes chosen by len(lit).
// Nothing is written. An empty lit yields 0.
//
// It assumes that:
//
//	0 <= len(lit) && len(lit) <= math.MaxUint32
func emitLiteralSize(lit []byte) int {
	n := len(lit)
	if n == 0 {
		return 0
	}
	// Header size grows with the number of bytes needed for the length.
	header := 5
	switch {
	case n <= 60:
		header = 1
	case n <= 1<<8:
		header = 2
	case n <= 1<<16:
		header = 3
	case n <= 1<<24:
		header = 4
	}
	return n + header
}
714
// cvtLZ4BlockAsm is a placeholder for builds selected by this file's build
// constraint. It always panics.
func cvtLZ4BlockAsm(dst, src []byte) (uncompressed int, dstUsed int) {
	const msg = "cvtLZ4BlockAsm should be unreachable"
	panic(msg)
}
718
// cvtLZ4BlockSnappyAsm is a placeholder for builds selected by this file's
// build constraint. It always panics.
func cvtLZ4BlockSnappyAsm(dst, src []byte) (uncompressed int, dstUsed int) {
	const msg = "cvtLZ4BlockSnappyAsm should be unreachable"
	panic(msg)
}
722
// cvtLZ4sBlockAsm is a placeholder for builds selected by this file's build
// constraint. It always panics.
func cvtLZ4sBlockAsm(dst, src []byte) (uncompressed int, dstUsed int) {
	const msg = "cvtLZ4sBlockAsm should be unreachable"
	panic(msg)
}
726
// cvtLZ4sBlockSnappyAsm is a placeholder for builds selected by this file's
// build constraint. It always panics.
func cvtLZ4sBlockSnappyAsm(dst, src []byte) (uncompressed int, dstUsed int) {
	const msg = "cvtLZ4sBlockSnappyAsm should be unreachable"
	panic(msg)
}
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
new file mode 100644
index 0000000..297e415
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.go
@@ -0,0 +1,228 @@
1// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
2
3//go:build !appengine && !noasm && gc && !noasm
4
5package s2
6
// _dummy_ has no Go body; encodeblock_amd64.s defines it as a TEXT symbol whose
// only instruction is RET.
7func _dummy_()
8
// The encodeBlockAsm* declarations below have no Go bodies; per the generator
// command in this file's header, their implementations are emitted to
// encodeblock_amd64.s. All variants share one signature and differ in the
// documented maximum input size.

9// encodeBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
10// Maximum input 4294967295 bytes.
11// It assumes that the varint-encoded length of the decompressed bytes has already been written.
// The assembly returns 0 once the current output position plus the pending
// literal bytes plus 5 would reach dst base + len(src) - len(src)>>5 - 9.
// NOTE(review): the successful return value is presumably the number of bytes
// written to dst - confirm against the end of the assembly routine.
12//
13//go:noescape
14func encodeBlockAsm(dst []byte, src []byte) int
15
16// encodeBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
17// Maximum input 4194304 bytes.
18// It assumes that the varint-encoded length of the decompressed bytes has already been written.
19//
20//go:noescape
21func encodeBlockAsm4MB(dst []byte, src []byte) int
22
23// encodeBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
24// Maximum input 16383 bytes.
25// It assumes that the varint-encoded length of the decompressed bytes has already been written.
26//
27//go:noescape
28func encodeBlockAsm12B(dst []byte, src []byte) int
29
30// encodeBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
31// Maximum input 4095 bytes.
32// It assumes that the varint-encoded length of the decompressed bytes has already been written.
33//
34//go:noescape
35func encodeBlockAsm10B(dst []byte, src []byte) int
36
37// encodeBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
38// Maximum input 511 bytes.
39// It assumes that the varint-encoded length of the decompressed bytes has already been written.
40//
41//go:noescape
42func encodeBlockAsm8B(dst []byte, src []byte) int
43
// The encodeBetterBlockAsm* declarations below have no Go bodies; per the
// generator command in this file's header, their implementations are emitted
// to encodeblock_amd64.s. All variants share one signature and differ in the
// documented maximum input size.
// NOTE(review): the int result is presumably the number of bytes written to
// dst - confirm against the assembly.

44// encodeBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
45// Maximum input 4294967295 bytes.
46// It assumes that the varint-encoded length of the decompressed bytes has already been written.
47//
48//go:noescape
49func encodeBetterBlockAsm(dst []byte, src []byte) int
50
51// encodeBetterBlockAsm4MB encodes a non-empty src to a guaranteed-large-enough dst.
52// Maximum input 4194304 bytes.
53// It assumes that the varint-encoded length of the decompressed bytes has already been written.
54//
55//go:noescape
56func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
57
58// encodeBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
59// Maximum input 16383 bytes.
60// It assumes that the varint-encoded length of the decompressed bytes has already been written.
61//
62//go:noescape
63func encodeBetterBlockAsm12B(dst []byte, src []byte) int
64
65// encodeBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
66// Maximum input 4095 bytes.
67// It assumes that the varint-encoded length of the decompressed bytes has already been written.
68//
69//go:noescape
70func encodeBetterBlockAsm10B(dst []byte, src []byte) int
71
72// encodeBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
73// Maximum input 511 bytes.
74// It assumes that the varint-encoded length of the decompressed bytes has already been written.
75//
76//go:noescape
77func encodeBetterBlockAsm8B(dst []byte, src []byte) int
78
// The encodeSnappyBlockAsm* declarations below have no Go bodies; per the
// generator command in this file's header, their implementations are emitted
// to encodeblock_amd64.s. All variants share one signature and differ in the
// documented maximum input size.
// NOTE(review): the int result is presumably the number of bytes written to
// dst - confirm against the assembly.

79// encodeSnappyBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
80// Maximum input 4294967295 bytes.
81// It assumes that the varint-encoded length of the decompressed bytes has already been written.
82//
83//go:noescape
84func encodeSnappyBlockAsm(dst []byte, src []byte) int
85
86// encodeSnappyBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
87// Maximum input 65535 bytes.
88// It assumes that the varint-encoded length of the decompressed bytes has already been written.
89//
90//go:noescape
91func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
92
93// encodeSnappyBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
94// Maximum input 16383 bytes.
95// It assumes that the varint-encoded length of the decompressed bytes has already been written.
96//
97//go:noescape
98func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
99
100// encodeSnappyBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
101// Maximum input 4095 bytes.
102// It assumes that the varint-encoded length of the decompressed bytes has already been written.
103//
104//go:noescape
105func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
106
107// encodeSnappyBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
108// Maximum input 511 bytes.
109// It assumes that the varint-encoded length of the decompressed bytes has already been written.
110//
111//go:noescape
112func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
113
// The encodeSnappyBetterBlockAsm* declarations below have no Go bodies; per
// the generator command in this file's header, their implementations are
// emitted to encodeblock_amd64.s. All variants share one signature and differ
// in the documented maximum input size.
// NOTE(review): the int result is presumably the number of bytes written to
// dst - confirm against the assembly.

114// encodeSnappyBetterBlockAsm encodes a non-empty src to a guaranteed-large-enough dst.
115// Maximum input 4294967295 bytes.
116// It assumes that the varint-encoded length of the decompressed bytes has already been written.
117//
118//go:noescape
119func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
120
121// encodeSnappyBetterBlockAsm64K encodes a non-empty src to a guaranteed-large-enough dst.
122// Maximum input 65535 bytes.
123// It assumes that the varint-encoded length of the decompressed bytes has already been written.
124//
125//go:noescape
126func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
127
128// encodeSnappyBetterBlockAsm12B encodes a non-empty src to a guaranteed-large-enough dst.
129// Maximum input 16383 bytes.
130// It assumes that the varint-encoded length of the decompressed bytes has already been written.
131//
132//go:noescape
133func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
134
135// encodeSnappyBetterBlockAsm10B encodes a non-empty src to a guaranteed-large-enough dst.
136// Maximum input 4095 bytes.
137// It assumes that the varint-encoded length of the decompressed bytes has already been written.
138//
139//go:noescape
140func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
141
142// encodeSnappyBetterBlockAsm8B encodes a non-empty src to a guaranteed-large-enough dst.
143// Maximum input 511 bytes.
144// It assumes that the varint-encoded length of the decompressed bytes has already been written.
145//
146//go:noescape
147func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
148
149// calcBlockSize takes only src and returns an int; it has no dst parameter.
150// Maximum input 4294967295 bytes.
151// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): judging by the name, the result is presumably the size src
// would encode to (nothing can be written, as there is no dst) - confirm
// against the assembly.
152//
153//go:noescape
154func calcBlockSize(src []byte) int
155
156// calcBlockSizeSmall takes only src and returns an int; it has no dst parameter.
157// Maximum input 1024 bytes.
158// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): judging by the name, the result is presumably the size src
// would encode to (nothing can be written, as there is no dst) - confirm
// against the assembly.
159//
160//go:noescape
161func calcBlockSizeSmall(src []byte) int
162
163// emitLiteral writes a literal chunk and returns the number of bytes written.
// Declaration only: the body is emitted to encodeblock_amd64.s.
164//
165// It assumes that:
166//
167// dst is long enough to hold the encoded bytes with margin of 0 bytes
168// 0 <= len(lit) && len(lit) <= math.MaxUint32
169//
170//go:noescape
171func emitLiteral(dst []byte, lit []byte) int
172
173// emitRepeat writes a repeat chunk and returns the number of bytes written.
174// Length must be at least 4 and < 1<<32
// Declaration only: the body is emitted to encodeblock_amd64.s.
175//
176//go:noescape
177func emitRepeat(dst []byte, offset int, length int) int
178
179// emitCopy writes a copy chunk and returns the number of bytes written.
// Declaration only: the body is emitted to encodeblock_amd64.s.
180//
181// It assumes that:
182//
183// dst is long enough to hold the encoded bytes
184// 1 <= offset && offset <= math.MaxUint32
185// 4 <= length && length <= 1 << 24
186//
187//go:noescape
188func emitCopy(dst []byte, offset int, length int) int
189
190// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.
// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): judging by the name it presumably never emits repeat chunks,
// unlike emitCopy - confirm against the assembly.
191//
192// It assumes that:
193//
194// dst is long enough to hold the encoded bytes
195// 1 <= offset && offset <= math.MaxUint32
196// 4 <= length && length <= 1 << 24
197//
198//go:noescape
199func emitCopyNoRepeat(dst []byte, offset int, length int) int
200
201// matchLen returns how many bytes match in a and b
// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): presumably the count is the length of the common prefix of a
// and b - confirm against the assembly.
202//
203// It assumes that:
204//
205// len(a) <= len(b)
206//
207//go:noescape
208func matchLen(a []byte, b []byte) int
209
210// cvtLZ4BlockAsm converts an LZ4 block to S2.
// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): the results are named uncompressed and dstUsed; presumably the
// decoded size of the block and the number of bytes written to dst - confirm
// against the assembly.
211//
212//go:noescape
213func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
214
215// cvtLZ4sBlockAsm converts an LZ4s block to S2.
// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): the results are named uncompressed and dstUsed; presumably the
// decoded size of the block and the number of bytes written to dst - confirm
// against the assembly.
216//
217//go:noescape
218func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
219
220// cvtLZ4BlockSnappyAsm converts an LZ4 block to Snappy.
// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): the results are named uncompressed and dstUsed; presumably the
// decoded size of the block and the number of bytes written to dst - confirm
// against the assembly.
221//
222//go:noescape
223func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
224
225// cvtLZ4sBlockSnappyAsm converts an LZ4s block to Snappy.
// Declaration only: the body is emitted to encodeblock_amd64.s.
// NOTE(review): the results are named uncompressed and dstUsed; presumably the
// decoded size of the block and the number of bytes written to dst - confirm
// against the assembly.
226//
227//go:noescape
228func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
diff --git a/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
new file mode 100644
index 0000000..5f110d1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/encodeblock_amd64.s
@@ -0,0 +1,21169 @@
1// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
2
3//go:build !appengine && !noasm && gc && !noasm
4
5#include "textflag.h"
6
7// func _dummy_()
// _dummy_ does nothing at run time: its only instruction is RET.
// The preprocessor lines placed inside it define GOAMD64_v3 whenever
// GOAMD64_v4 is defined and GOAMD64_v3 is not, so the "#ifdef GOAMD64_v3"
// branches later in this file (which select TZCNTQ instead of BSFQ) are also
// taken for v4 builds.
8TEXT ·_dummy_(SB), $0
9#ifdef GOAMD64_v4
10#ifndef GOAMD64_v3
11#define GOAMD64_v3
12#endif
13#endif
14 RET
15
16// func encodeBlockAsm(dst []byte, src []byte) int
17// Requires: BMI, SSE2
18TEXT ·encodeBlockAsm(SB), $65560-56
19 MOVQ dst_base+0(FP), AX
20 MOVQ $0x00000200, CX
21 LEAQ 24(SP), DX
22 PXOR X0, X0
23
24zero_loop_encodeBlockAsm:
25 MOVOU X0, (DX)
26 MOVOU X0, 16(DX)
27 MOVOU X0, 32(DX)
28 MOVOU X0, 48(DX)
29 MOVOU X0, 64(DX)
30 MOVOU X0, 80(DX)
31 MOVOU X0, 96(DX)
32 MOVOU X0, 112(DX)
33 ADDQ $0x80, DX
34 DECQ CX
35 JNZ zero_loop_encodeBlockAsm
36 MOVL $0x00000000, 12(SP)
37 MOVQ src_len+32(FP), CX
38 LEAQ -9(CX), DX
39 LEAQ -8(CX), BX
40 MOVL BX, 8(SP)
41 SHRQ $0x05, CX
42 SUBL CX, DX
43 LEAQ (AX)(DX*1), DX
44 MOVQ DX, (SP)
45 MOVL $0x00000001, CX
46 MOVL CX, 16(SP)
47 MOVQ src_base+24(FP), DX
48
49search_loop_encodeBlockAsm:
50 MOVL CX, BX
51 SUBL 12(SP), BX
52 SHRL $0x06, BX
53 LEAL 4(CX)(BX*1), BX
54 CMPL BX, 8(SP)
55 JAE emit_remainder_encodeBlockAsm
56 MOVQ (DX)(CX*1), SI
57 MOVL BX, 20(SP)
58 MOVQ $0x0000cf1bbcdcbf9b, R8
59 MOVQ SI, R9
60 MOVQ SI, R10
61 SHRQ $0x08, R10
62 SHLQ $0x10, R9
63 IMULQ R8, R9
64 SHRQ $0x32, R9
65 SHLQ $0x10, R10
66 IMULQ R8, R10
67 SHRQ $0x32, R10
68 MOVL 24(SP)(R9*4), BX
69 MOVL 24(SP)(R10*4), DI
70 MOVL CX, 24(SP)(R9*4)
71 LEAL 1(CX), R9
72 MOVL R9, 24(SP)(R10*4)
73 MOVQ SI, R9
74 SHRQ $0x10, R9
75 SHLQ $0x10, R9
76 IMULQ R8, R9
77 SHRQ $0x32, R9
78 MOVL CX, R8
79 SUBL 16(SP), R8
80 MOVL 1(DX)(R8*1), R10
81 MOVQ SI, R8
82 SHRQ $0x08, R8
83 CMPL R8, R10
84 JNE no_repeat_found_encodeBlockAsm
85 LEAL 1(CX), SI
86 MOVL 12(SP), DI
87 MOVL SI, BX
88 SUBL 16(SP), BX
89 JZ repeat_extend_back_end_encodeBlockAsm
90
91repeat_extend_back_loop_encodeBlockAsm:
92 CMPL SI, DI
93 JBE repeat_extend_back_end_encodeBlockAsm
94 MOVB -1(DX)(BX*1), R8
95 MOVB -1(DX)(SI*1), R9
96 CMPB R8, R9
97 JNE repeat_extend_back_end_encodeBlockAsm
98 LEAL -1(SI), SI
99 DECL BX
100 JNZ repeat_extend_back_loop_encodeBlockAsm
101
102repeat_extend_back_end_encodeBlockAsm:
103 MOVL 12(SP), BX
104 CMPL BX, SI
105 JEQ emit_literal_done_repeat_emit_encodeBlockAsm
106 MOVL SI, R8
107 MOVL SI, 12(SP)
108 LEAQ (DX)(BX*1), R9
109 SUBL BX, R8
110 LEAL -1(R8), BX
111 CMPL BX, $0x3c
112 JB one_byte_repeat_emit_encodeBlockAsm
113 CMPL BX, $0x00000100
114 JB two_bytes_repeat_emit_encodeBlockAsm
115 CMPL BX, $0x00010000
116 JB three_bytes_repeat_emit_encodeBlockAsm
117 CMPL BX, $0x01000000
118 JB four_bytes_repeat_emit_encodeBlockAsm
119 MOVB $0xfc, (AX)
120 MOVL BX, 1(AX)
121 ADDQ $0x05, AX
122 JMP memmove_long_repeat_emit_encodeBlockAsm
123
124four_bytes_repeat_emit_encodeBlockAsm:
125 MOVL BX, R10
126 SHRL $0x10, R10
127 MOVB $0xf8, (AX)
128 MOVW BX, 1(AX)
129 MOVB R10, 3(AX)
130 ADDQ $0x04, AX
131 JMP memmove_long_repeat_emit_encodeBlockAsm
132
133three_bytes_repeat_emit_encodeBlockAsm:
134 MOVB $0xf4, (AX)
135 MOVW BX, 1(AX)
136 ADDQ $0x03, AX
137 JMP memmove_long_repeat_emit_encodeBlockAsm
138
139two_bytes_repeat_emit_encodeBlockAsm:
140 MOVB $0xf0, (AX)
141 MOVB BL, 1(AX)
142 ADDQ $0x02, AX
143 CMPL BX, $0x40
144 JB memmove_repeat_emit_encodeBlockAsm
145 JMP memmove_long_repeat_emit_encodeBlockAsm
146
147one_byte_repeat_emit_encodeBlockAsm:
148 SHLB $0x02, BL
149 MOVB BL, (AX)
150 ADDQ $0x01, AX
151
152memmove_repeat_emit_encodeBlockAsm:
153 LEAQ (AX)(R8*1), BX
154
155 // genMemMoveShort
156 CMPQ R8, $0x08
157 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
158 CMPQ R8, $0x10
159 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
160 CMPQ R8, $0x20
161 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
162 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
163
164emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
165 MOVQ (R9), R10
166 MOVQ R10, (AX)
167 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
168
169emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
170 MOVQ (R9), R10
171 MOVQ -8(R9)(R8*1), R9
172 MOVQ R10, (AX)
173 MOVQ R9, -8(AX)(R8*1)
174 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
175
176emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
177 MOVOU (R9), X0
178 MOVOU -16(R9)(R8*1), X1
179 MOVOU X0, (AX)
180 MOVOU X1, -16(AX)(R8*1)
181 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
182
183emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
184 MOVOU (R9), X0
185 MOVOU 16(R9), X1
186 MOVOU -32(R9)(R8*1), X2
187 MOVOU -16(R9)(R8*1), X3
188 MOVOU X0, (AX)
189 MOVOU X1, 16(AX)
190 MOVOU X2, -32(AX)(R8*1)
191 MOVOU X3, -16(AX)(R8*1)
192
193memmove_end_copy_repeat_emit_encodeBlockAsm:
194 MOVQ BX, AX
195 JMP emit_literal_done_repeat_emit_encodeBlockAsm
196
197memmove_long_repeat_emit_encodeBlockAsm:
198 LEAQ (AX)(R8*1), BX
199
200 // genMemMoveLong
201 MOVOU (R9), X0
202 MOVOU 16(R9), X1
203 MOVOU -32(R9)(R8*1), X2
204 MOVOU -16(R9)(R8*1), X3
205 MOVQ R8, R11
206 SHRQ $0x05, R11
207 MOVQ AX, R10
208 ANDL $0x0000001f, R10
209 MOVQ $0x00000040, R12
210 SUBQ R10, R12
211 DECQ R11
212 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
213 LEAQ -32(R9)(R12*1), R10
214 LEAQ -32(AX)(R12*1), R13
215
216emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
217 MOVOU (R10), X4
218 MOVOU 16(R10), X5
219 MOVOA X4, (R13)
220 MOVOA X5, 16(R13)
221 ADDQ $0x20, R13
222 ADDQ $0x20, R10
223 ADDQ $0x20, R12
224 DECQ R11
225 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
226
227emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
228 MOVOU -32(R9)(R12*1), X4
229 MOVOU -16(R9)(R12*1), X5
230 MOVOA X4, -32(AX)(R12*1)
231 MOVOA X5, -16(AX)(R12*1)
232 ADDQ $0x20, R12
233 CMPQ R8, R12
234 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
235 MOVOU X0, (AX)
236 MOVOU X1, 16(AX)
237 MOVOU X2, -32(AX)(R8*1)
238 MOVOU X3, -16(AX)(R8*1)
239 MOVQ BX, AX
240
241emit_literal_done_repeat_emit_encodeBlockAsm:
242 ADDL $0x05, CX
243 MOVL CX, BX
244 SUBL 16(SP), BX
245 MOVQ src_len+32(FP), R8
246 SUBL CX, R8
247 LEAQ (DX)(CX*1), R9
248 LEAQ (DX)(BX*1), BX
249
250 // matchLen
251 XORL R11, R11
252
253matchlen_loopback_16_repeat_extend_encodeBlockAsm:
254 CMPL R8, $0x10
255 JB matchlen_match8_repeat_extend_encodeBlockAsm
256 MOVQ (R9)(R11*1), R10
257 MOVQ 8(R9)(R11*1), R12
258 XORQ (BX)(R11*1), R10
259 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
260 XORQ 8(BX)(R11*1), R12
261 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm
262 LEAL -16(R8), R8
263 LEAL 16(R11), R11
264 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm
265
266matchlen_bsf_16repeat_extend_encodeBlockAsm:
267#ifdef GOAMD64_v3
268 TZCNTQ R12, R12
269
270#else
271 BSFQ R12, R12
272
273#endif
274 SARQ $0x03, R12
275 LEAL 8(R11)(R12*1), R11
276 JMP repeat_extend_forward_end_encodeBlockAsm
277
278matchlen_match8_repeat_extend_encodeBlockAsm:
279 CMPL R8, $0x08
280 JB matchlen_match4_repeat_extend_encodeBlockAsm
281 MOVQ (R9)(R11*1), R10
282 XORQ (BX)(R11*1), R10
283 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
284 LEAL -8(R8), R8
285 LEAL 8(R11), R11
286 JMP matchlen_match4_repeat_extend_encodeBlockAsm
287
288matchlen_bsf_8_repeat_extend_encodeBlockAsm:
289#ifdef GOAMD64_v3
290 TZCNTQ R10, R10
291
292#else
293 BSFQ R10, R10
294
295#endif
296 SARQ $0x03, R10
297 LEAL (R11)(R10*1), R11
298 JMP repeat_extend_forward_end_encodeBlockAsm
299
300matchlen_match4_repeat_extend_encodeBlockAsm:
301 CMPL R8, $0x04
302 JB matchlen_match2_repeat_extend_encodeBlockAsm
303 MOVL (R9)(R11*1), R10
304 CMPL (BX)(R11*1), R10
305 JNE matchlen_match2_repeat_extend_encodeBlockAsm
306 LEAL -4(R8), R8
307 LEAL 4(R11), R11
308
309matchlen_match2_repeat_extend_encodeBlockAsm:
310 CMPL R8, $0x01
311 JE matchlen_match1_repeat_extend_encodeBlockAsm
312 JB repeat_extend_forward_end_encodeBlockAsm
313 MOVW (R9)(R11*1), R10
314 CMPW (BX)(R11*1), R10
315 JNE matchlen_match1_repeat_extend_encodeBlockAsm
316 LEAL 2(R11), R11
317 SUBL $0x02, R8
318 JZ repeat_extend_forward_end_encodeBlockAsm
319
320matchlen_match1_repeat_extend_encodeBlockAsm:
321 MOVB (R9)(R11*1), R10
322 CMPB (BX)(R11*1), R10
323 JNE repeat_extend_forward_end_encodeBlockAsm
324 LEAL 1(R11), R11
325
326repeat_extend_forward_end_encodeBlockAsm:
327 ADDL R11, CX
328 MOVL CX, BX
329 SUBL SI, BX
330 MOVL 16(SP), SI
331 TESTL DI, DI
332 JZ repeat_as_copy_encodeBlockAsm
333
334 // emitRepeat
335emit_repeat_again_match_repeat_encodeBlockAsm:
336 MOVL BX, DI
337 LEAL -4(BX), BX
338 CMPL DI, $0x08
339 JBE repeat_two_match_repeat_encodeBlockAsm
340 CMPL DI, $0x0c
341 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm
342 CMPL SI, $0x00000800
343 JB repeat_two_offset_match_repeat_encodeBlockAsm
344
345cant_repeat_two_offset_match_repeat_encodeBlockAsm:
346 CMPL BX, $0x00000104
347 JB repeat_three_match_repeat_encodeBlockAsm
348 CMPL BX, $0x00010100
349 JB repeat_four_match_repeat_encodeBlockAsm
350 CMPL BX, $0x0100ffff
351 JB repeat_five_match_repeat_encodeBlockAsm
352 LEAL -16842747(BX), BX
353 MOVL $0xfffb001d, (AX)
354 MOVB $0xff, 4(AX)
355 ADDQ $0x05, AX
356 JMP emit_repeat_again_match_repeat_encodeBlockAsm
357
358repeat_five_match_repeat_encodeBlockAsm:
359 LEAL -65536(BX), BX
360 MOVL BX, SI
361 MOVW $0x001d, (AX)
362 MOVW BX, 2(AX)
363 SARL $0x10, SI
364 MOVB SI, 4(AX)
365 ADDQ $0x05, AX
366 JMP repeat_end_emit_encodeBlockAsm
367
368repeat_four_match_repeat_encodeBlockAsm:
369 LEAL -256(BX), BX
370 MOVW $0x0019, (AX)
371 MOVW BX, 2(AX)
372 ADDQ $0x04, AX
373 JMP repeat_end_emit_encodeBlockAsm
374
375repeat_three_match_repeat_encodeBlockAsm:
376 LEAL -4(BX), BX
377 MOVW $0x0015, (AX)
378 MOVB BL, 2(AX)
379 ADDQ $0x03, AX
380 JMP repeat_end_emit_encodeBlockAsm
381
382repeat_two_match_repeat_encodeBlockAsm:
383 SHLL $0x02, BX
384 ORL $0x01, BX
385 MOVW BX, (AX)
386 ADDQ $0x02, AX
387 JMP repeat_end_emit_encodeBlockAsm
388
389repeat_two_offset_match_repeat_encodeBlockAsm:
390 XORQ DI, DI
391 LEAL 1(DI)(BX*4), BX
392 MOVB SI, 1(AX)
393 SARL $0x08, SI
394 SHLL $0x05, SI
395 ORL SI, BX
396 MOVB BL, (AX)
397 ADDQ $0x02, AX
398 JMP repeat_end_emit_encodeBlockAsm
399
400repeat_as_copy_encodeBlockAsm:
401 // emitCopy
402 CMPL SI, $0x00010000
403 JB two_byte_offset_repeat_as_copy_encodeBlockAsm
404 CMPL BX, $0x40
405 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm
406 MOVB $0xff, (AX)
407 MOVL SI, 1(AX)
408 LEAL -64(BX), BX
409 ADDQ $0x05, AX
410 CMPL BX, $0x04
411 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm
412
413 // emitRepeat
414emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
415 MOVL BX, DI
416 LEAL -4(BX), BX
417 CMPL DI, $0x08
418 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
419 CMPL DI, $0x0c
420 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
421 CMPL SI, $0x00000800
422 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
423
424cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
425 CMPL BX, $0x00000104
426 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
427 CMPL BX, $0x00010100
428 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
429 CMPL BX, $0x0100ffff
430 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
431 LEAL -16842747(BX), BX
432 MOVL $0xfffb001d, (AX)
433 MOVB $0xff, 4(AX)
434 ADDQ $0x05, AX
435 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
436
437repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
438 LEAL -65536(BX), BX
439 MOVL BX, SI
440 MOVW $0x001d, (AX)
441 MOVW BX, 2(AX)
442 SARL $0x10, SI
443 MOVB SI, 4(AX)
444 ADDQ $0x05, AX
445 JMP repeat_end_emit_encodeBlockAsm
446
447repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
448 LEAL -256(BX), BX
449 MOVW $0x0019, (AX)
450 MOVW BX, 2(AX)
451 ADDQ $0x04, AX
452 JMP repeat_end_emit_encodeBlockAsm
453
454repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
455 LEAL -4(BX), BX
456 MOVW $0x0015, (AX)
457 MOVB BL, 2(AX)
458 ADDQ $0x03, AX
459 JMP repeat_end_emit_encodeBlockAsm
460
461repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
462 SHLL $0x02, BX
463 ORL $0x01, BX
464 MOVW BX, (AX)
465 ADDQ $0x02, AX
466 JMP repeat_end_emit_encodeBlockAsm
467
468repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
469 XORQ DI, DI
470 LEAL 1(DI)(BX*4), BX
471 MOVB SI, 1(AX)
472 SARL $0x08, SI
473 SHLL $0x05, SI
474 ORL SI, BX
475 MOVB BL, (AX)
476 ADDQ $0x02, AX
477 JMP repeat_end_emit_encodeBlockAsm
478
479four_bytes_remain_repeat_as_copy_encodeBlockAsm:
480 TESTL BX, BX
481 JZ repeat_end_emit_encodeBlockAsm
482 XORL DI, DI
483 LEAL -1(DI)(BX*4), BX
484 MOVB BL, (AX)
485 MOVL SI, 1(AX)
486 ADDQ $0x05, AX
487 JMP repeat_end_emit_encodeBlockAsm
488
489two_byte_offset_repeat_as_copy_encodeBlockAsm:
490 CMPL BX, $0x40
491 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
492 CMPL SI, $0x00000800
493 JAE long_offset_short_repeat_as_copy_encodeBlockAsm
494 MOVL $0x00000001, DI
495 LEAL 16(DI), DI
496 MOVB SI, 1(AX)
497 MOVL SI, R8
498 SHRL $0x08, R8
499 SHLL $0x05, R8
500 ORL R8, DI
501 MOVB DI, (AX)
502 ADDQ $0x02, AX
503 SUBL $0x08, BX
504
505 // emitRepeat
506 LEAL -4(BX), BX
507 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
508
509emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
510 MOVL BX, DI
511 LEAL -4(BX), BX
512 CMPL DI, $0x08
513 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
514 CMPL DI, $0x0c
515 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
516 CMPL SI, $0x00000800
517 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
518
519cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
520 CMPL BX, $0x00000104
521 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
522 CMPL BX, $0x00010100
523 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
524 CMPL BX, $0x0100ffff
525 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
526 LEAL -16842747(BX), BX
527 MOVL $0xfffb001d, (AX)
528 MOVB $0xff, 4(AX)
529 ADDQ $0x05, AX
530 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
531
532repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
533 LEAL -65536(BX), BX
534 MOVL BX, SI
535 MOVW $0x001d, (AX)
536 MOVW BX, 2(AX)
537 SARL $0x10, SI
538 MOVB SI, 4(AX)
539 ADDQ $0x05, AX
540 JMP repeat_end_emit_encodeBlockAsm
541
542repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
543 LEAL -256(BX), BX
544 MOVW $0x0019, (AX)
545 MOVW BX, 2(AX)
546 ADDQ $0x04, AX
547 JMP repeat_end_emit_encodeBlockAsm
548
549repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
550 LEAL -4(BX), BX
551 MOVW $0x0015, (AX)
552 MOVB BL, 2(AX)
553 ADDQ $0x03, AX
554 JMP repeat_end_emit_encodeBlockAsm
555
556repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
557 SHLL $0x02, BX
558 ORL $0x01, BX
559 MOVW BX, (AX)
560 ADDQ $0x02, AX
561 JMP repeat_end_emit_encodeBlockAsm
562
563repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
564 XORQ DI, DI
565 LEAL 1(DI)(BX*4), BX
566 MOVB SI, 1(AX)
567 SARL $0x08, SI
568 SHLL $0x05, SI
569 ORL SI, BX
570 MOVB BL, (AX)
571 ADDQ $0x02, AX
572 JMP repeat_end_emit_encodeBlockAsm
573
574long_offset_short_repeat_as_copy_encodeBlockAsm:
575 MOVB $0xee, (AX)
576 MOVW SI, 1(AX)
577 LEAL -60(BX), BX
578 ADDQ $0x03, AX
579
580 // emitRepeat
581emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
582 MOVL BX, DI
583 LEAL -4(BX), BX
584 CMPL DI, $0x08
585 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
586 CMPL DI, $0x0c
587 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
588 CMPL SI, $0x00000800
589 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
590
591cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
592 CMPL BX, $0x00000104
593 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
594 CMPL BX, $0x00010100
595 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
596 CMPL BX, $0x0100ffff
597 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
598 LEAL -16842747(BX), BX
599 MOVL $0xfffb001d, (AX)
600 MOVB $0xff, 4(AX)
601 ADDQ $0x05, AX
602 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
603
604repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
605 LEAL -65536(BX), BX
606 MOVL BX, SI
607 MOVW $0x001d, (AX)
608 MOVW BX, 2(AX)
609 SARL $0x10, SI
610 MOVB SI, 4(AX)
611 ADDQ $0x05, AX
612 JMP repeat_end_emit_encodeBlockAsm
613
614repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
615 LEAL -256(BX), BX
616 MOVW $0x0019, (AX)
617 MOVW BX, 2(AX)
618 ADDQ $0x04, AX
619 JMP repeat_end_emit_encodeBlockAsm
620
621repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
622 LEAL -4(BX), BX
623 MOVW $0x0015, (AX)
624 MOVB BL, 2(AX)
625 ADDQ $0x03, AX
626 JMP repeat_end_emit_encodeBlockAsm
627
628repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
629 SHLL $0x02, BX
630 ORL $0x01, BX
631 MOVW BX, (AX)
632 ADDQ $0x02, AX
633 JMP repeat_end_emit_encodeBlockAsm
634
635repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
636 XORQ DI, DI
637 LEAL 1(DI)(BX*4), BX
638 MOVB SI, 1(AX)
639 SARL $0x08, SI
640 SHLL $0x05, SI
641 ORL SI, BX
642 MOVB BL, (AX)
643 ADDQ $0x02, AX
644 JMP repeat_end_emit_encodeBlockAsm
645
646two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
647 MOVL BX, DI
648 SHLL $0x02, DI
649 CMPL BX, $0x0c
650 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
651 CMPL SI, $0x00000800
652 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
653 LEAL -15(DI), DI
654 MOVB SI, 1(AX)
655 SHRL $0x08, SI
656 SHLL $0x05, SI
657 ORL SI, DI
658 MOVB DI, (AX)
659 ADDQ $0x02, AX
660 JMP repeat_end_emit_encodeBlockAsm
661
662emit_copy_three_repeat_as_copy_encodeBlockAsm:
663 LEAL -2(DI), DI
664 MOVB DI, (AX)
665 MOVW SI, 1(AX)
666 ADDQ $0x03, AX
667
668repeat_end_emit_encodeBlockAsm:
669 MOVL CX, 12(SP)
670 JMP search_loop_encodeBlockAsm
671
672no_repeat_found_encodeBlockAsm:
673 CMPL (DX)(BX*1), SI
674 JEQ candidate_match_encodeBlockAsm
675 SHRQ $0x08, SI
676 MOVL 24(SP)(R9*4), BX
677 LEAL 2(CX), R8
678 CMPL (DX)(DI*1), SI
679 JEQ candidate2_match_encodeBlockAsm
680 MOVL R8, 24(SP)(R9*4)
681 SHRQ $0x08, SI
682 CMPL (DX)(BX*1), SI
683 JEQ candidate3_match_encodeBlockAsm
684 MOVL 20(SP), CX
685 JMP search_loop_encodeBlockAsm
686
687candidate3_match_encodeBlockAsm:
688 ADDL $0x02, CX
689 JMP candidate_match_encodeBlockAsm
690
691candidate2_match_encodeBlockAsm:
692 MOVL R8, 24(SP)(R9*4)
693 INCL CX
694 MOVL DI, BX
695
696candidate_match_encodeBlockAsm:
697 MOVL 12(SP), SI
698 TESTL BX, BX
699 JZ match_extend_back_end_encodeBlockAsm
700
701match_extend_back_loop_encodeBlockAsm:
702 CMPL CX, SI
703 JBE match_extend_back_end_encodeBlockAsm
704 MOVB -1(DX)(BX*1), DI
705 MOVB -1(DX)(CX*1), R8
706 CMPB DI, R8
707 JNE match_extend_back_end_encodeBlockAsm
708 LEAL -1(CX), CX
709 DECL BX
710 JZ match_extend_back_end_encodeBlockAsm
711 JMP match_extend_back_loop_encodeBlockAsm
712
713match_extend_back_end_encodeBlockAsm:
714 MOVL CX, SI
715 SUBL 12(SP), SI
716 LEAQ 5(AX)(SI*1), SI
717 CMPQ SI, (SP)
718 JB match_dst_size_check_encodeBlockAsm
719 MOVQ $0x00000000, ret+48(FP)
720 RET
721
722match_dst_size_check_encodeBlockAsm:
723 MOVL CX, SI
724 MOVL 12(SP), DI
725 CMPL DI, SI
726 JEQ emit_literal_done_match_emit_encodeBlockAsm
727 MOVL SI, R8
728 MOVL SI, 12(SP)
729 LEAQ (DX)(DI*1), SI
730 SUBL DI, R8
731 LEAL -1(R8), DI
732 CMPL DI, $0x3c
733 JB one_byte_match_emit_encodeBlockAsm
734 CMPL DI, $0x00000100
735 JB two_bytes_match_emit_encodeBlockAsm
736 CMPL DI, $0x00010000
737 JB three_bytes_match_emit_encodeBlockAsm
738 CMPL DI, $0x01000000
739 JB four_bytes_match_emit_encodeBlockAsm
740 MOVB $0xfc, (AX)
741 MOVL DI, 1(AX)
742 ADDQ $0x05, AX
743 JMP memmove_long_match_emit_encodeBlockAsm
744
745four_bytes_match_emit_encodeBlockAsm:
746 MOVL DI, R9
747 SHRL $0x10, R9
748 MOVB $0xf8, (AX)
749 MOVW DI, 1(AX)
750 MOVB R9, 3(AX)
751 ADDQ $0x04, AX
752 JMP memmove_long_match_emit_encodeBlockAsm
753
754three_bytes_match_emit_encodeBlockAsm:
755 MOVB $0xf4, (AX)
756 MOVW DI, 1(AX)
757 ADDQ $0x03, AX
758 JMP memmove_long_match_emit_encodeBlockAsm
759
760two_bytes_match_emit_encodeBlockAsm:
761 MOVB $0xf0, (AX)
762 MOVB DI, 1(AX)
763 ADDQ $0x02, AX
764 CMPL DI, $0x40
765 JB memmove_match_emit_encodeBlockAsm
766 JMP memmove_long_match_emit_encodeBlockAsm
767
768one_byte_match_emit_encodeBlockAsm:
769 SHLB $0x02, DI
770 MOVB DI, (AX)
771 ADDQ $0x01, AX
772
773memmove_match_emit_encodeBlockAsm:
774 LEAQ (AX)(R8*1), DI
775
776 // genMemMoveShort
777 CMPQ R8, $0x08
778 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
779 CMPQ R8, $0x10
780 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
781 CMPQ R8, $0x20
782 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
783 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
784
785emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
786 MOVQ (SI), R9
787 MOVQ R9, (AX)
788 JMP memmove_end_copy_match_emit_encodeBlockAsm
789
790emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
791 MOVQ (SI), R9
792 MOVQ -8(SI)(R8*1), SI
793 MOVQ R9, (AX)
794 MOVQ SI, -8(AX)(R8*1)
795 JMP memmove_end_copy_match_emit_encodeBlockAsm
796
797emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
798 MOVOU (SI), X0
799 MOVOU -16(SI)(R8*1), X1
800 MOVOU X0, (AX)
801 MOVOU X1, -16(AX)(R8*1)
802 JMP memmove_end_copy_match_emit_encodeBlockAsm
803
804emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
805 MOVOU (SI), X0
806 MOVOU 16(SI), X1
807 MOVOU -32(SI)(R8*1), X2
808 MOVOU -16(SI)(R8*1), X3
809 MOVOU X0, (AX)
810 MOVOU X1, 16(AX)
811 MOVOU X2, -32(AX)(R8*1)
812 MOVOU X3, -16(AX)(R8*1)
813
814memmove_end_copy_match_emit_encodeBlockAsm:
815 MOVQ DI, AX
816 JMP emit_literal_done_match_emit_encodeBlockAsm
817
818memmove_long_match_emit_encodeBlockAsm:
819 LEAQ (AX)(R8*1), DI
820
821 // genMemMoveLong
822 MOVOU (SI), X0
823 MOVOU 16(SI), X1
824 MOVOU -32(SI)(R8*1), X2
825 MOVOU -16(SI)(R8*1), X3
826 MOVQ R8, R10
827 SHRQ $0x05, R10
828 MOVQ AX, R9
829 ANDL $0x0000001f, R9
830 MOVQ $0x00000040, R11
831 SUBQ R9, R11
832 DECQ R10
833 JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
834 LEAQ -32(SI)(R11*1), R9
835 LEAQ -32(AX)(R11*1), R12
836
837emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
838 MOVOU (R9), X4
839 MOVOU 16(R9), X5
840 MOVOA X4, (R12)
841 MOVOA X5, 16(R12)
842 ADDQ $0x20, R12
843 ADDQ $0x20, R9
844 ADDQ $0x20, R11
845 DECQ R10
846 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
847
848emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
849 MOVOU -32(SI)(R11*1), X4
850 MOVOU -16(SI)(R11*1), X5
851 MOVOA X4, -32(AX)(R11*1)
852 MOVOA X5, -16(AX)(R11*1)
853 ADDQ $0x20, R11
854 CMPQ R8, R11
855 JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
856 MOVOU X0, (AX)
857 MOVOU X1, 16(AX)
858 MOVOU X2, -32(AX)(R8*1)
859 MOVOU X3, -16(AX)(R8*1)
860 MOVQ DI, AX
861
862emit_literal_done_match_emit_encodeBlockAsm:
863match_nolit_loop_encodeBlockAsm:
864 MOVL CX, SI
865 SUBL BX, SI
866 MOVL SI, 16(SP)
867 ADDL $0x04, CX
868 ADDL $0x04, BX
869 MOVQ src_len+32(FP), SI
870 SUBL CX, SI
871 LEAQ (DX)(CX*1), DI
872 LEAQ (DX)(BX*1), BX
873
874 // matchLen
875 XORL R9, R9
876
877matchlen_loopback_16_match_nolit_encodeBlockAsm:
878 CMPL SI, $0x10
879 JB matchlen_match8_match_nolit_encodeBlockAsm
880 MOVQ (DI)(R9*1), R8
881 MOVQ 8(DI)(R9*1), R10
882 XORQ (BX)(R9*1), R8
883 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
884 XORQ 8(BX)(R9*1), R10
885 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm
886 LEAL -16(SI), SI
887 LEAL 16(R9), R9
888 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm
889
890matchlen_bsf_16match_nolit_encodeBlockAsm:
891#ifdef GOAMD64_v3
892 TZCNTQ R10, R10
893
894#else
895 BSFQ R10, R10
896
897#endif
898 SARQ $0x03, R10
899 LEAL 8(R9)(R10*1), R9
900 JMP match_nolit_end_encodeBlockAsm
901
902matchlen_match8_match_nolit_encodeBlockAsm:
903 CMPL SI, $0x08
904 JB matchlen_match4_match_nolit_encodeBlockAsm
905 MOVQ (DI)(R9*1), R8
906 XORQ (BX)(R9*1), R8
907 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
908 LEAL -8(SI), SI
909 LEAL 8(R9), R9
910 JMP matchlen_match4_match_nolit_encodeBlockAsm
911
912matchlen_bsf_8_match_nolit_encodeBlockAsm:
913#ifdef GOAMD64_v3
914 TZCNTQ R8, R8
915
916#else
917 BSFQ R8, R8
918
919#endif
920 SARQ $0x03, R8
921 LEAL (R9)(R8*1), R9
922 JMP match_nolit_end_encodeBlockAsm
923
924matchlen_match4_match_nolit_encodeBlockAsm:
925 CMPL SI, $0x04
926 JB matchlen_match2_match_nolit_encodeBlockAsm
927 MOVL (DI)(R9*1), R8
928 CMPL (BX)(R9*1), R8
929 JNE matchlen_match2_match_nolit_encodeBlockAsm
930 LEAL -4(SI), SI
931 LEAL 4(R9), R9
932
933matchlen_match2_match_nolit_encodeBlockAsm:
934 CMPL SI, $0x01
935 JE matchlen_match1_match_nolit_encodeBlockAsm
936 JB match_nolit_end_encodeBlockAsm
937 MOVW (DI)(R9*1), R8
938 CMPW (BX)(R9*1), R8
939 JNE matchlen_match1_match_nolit_encodeBlockAsm
940 LEAL 2(R9), R9
941 SUBL $0x02, SI
942 JZ match_nolit_end_encodeBlockAsm
943
944matchlen_match1_match_nolit_encodeBlockAsm:
945 MOVB (DI)(R9*1), R8
946 CMPB (BX)(R9*1), R8
947 JNE match_nolit_end_encodeBlockAsm
948 LEAL 1(R9), R9
949
950match_nolit_end_encodeBlockAsm:
951 ADDL R9, CX
952 MOVL 16(SP), BX
953 ADDL $0x04, R9
954 MOVL CX, 12(SP)
955
956 // emitCopy
957 CMPL BX, $0x00010000
958 JB two_byte_offset_match_nolit_encodeBlockAsm
959 CMPL R9, $0x40
960 JBE four_bytes_remain_match_nolit_encodeBlockAsm
961 MOVB $0xff, (AX)
962 MOVL BX, 1(AX)
963 LEAL -64(R9), R9
964 ADDQ $0x05, AX
965 CMPL R9, $0x04
966 JB four_bytes_remain_match_nolit_encodeBlockAsm
967
968 // emitRepeat
969emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
970 MOVL R9, SI
971 LEAL -4(R9), R9
972 CMPL SI, $0x08
973 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy
974 CMPL SI, $0x0c
975 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
976 CMPL BX, $0x00000800
977 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
978
979cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
980 CMPL R9, $0x00000104
981 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy
982 CMPL R9, $0x00010100
983 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy
984 CMPL R9, $0x0100ffff
985 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy
986 LEAL -16842747(R9), R9
987 MOVL $0xfffb001d, (AX)
988 MOVB $0xff, 4(AX)
989 ADDQ $0x05, AX
990 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
991
992repeat_five_match_nolit_encodeBlockAsm_emit_copy:
993 LEAL -65536(R9), R9
994 MOVL R9, BX
995 MOVW $0x001d, (AX)
996 MOVW R9, 2(AX)
997 SARL $0x10, BX
998 MOVB BL, 4(AX)
999 ADDQ $0x05, AX
1000 JMP match_nolit_emitcopy_end_encodeBlockAsm
1001
1002repeat_four_match_nolit_encodeBlockAsm_emit_copy:
1003 LEAL -256(R9), R9
1004 MOVW $0x0019, (AX)
1005 MOVW R9, 2(AX)
1006 ADDQ $0x04, AX
1007 JMP match_nolit_emitcopy_end_encodeBlockAsm
1008
1009repeat_three_match_nolit_encodeBlockAsm_emit_copy:
1010 LEAL -4(R9), R9
1011 MOVW $0x0015, (AX)
1012 MOVB R9, 2(AX)
1013 ADDQ $0x03, AX
1014 JMP match_nolit_emitcopy_end_encodeBlockAsm
1015
1016repeat_two_match_nolit_encodeBlockAsm_emit_copy:
1017 SHLL $0x02, R9
1018 ORL $0x01, R9
1019 MOVW R9, (AX)
1020 ADDQ $0x02, AX
1021 JMP match_nolit_emitcopy_end_encodeBlockAsm
1022
1023repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
1024 XORQ SI, SI
1025 LEAL 1(SI)(R9*4), R9
1026 MOVB BL, 1(AX)
1027 SARL $0x08, BX
1028 SHLL $0x05, BX
1029 ORL BX, R9
1030 MOVB R9, (AX)
1031 ADDQ $0x02, AX
1032 JMP match_nolit_emitcopy_end_encodeBlockAsm
1033
1034four_bytes_remain_match_nolit_encodeBlockAsm:
1035 TESTL R9, R9
1036 JZ match_nolit_emitcopy_end_encodeBlockAsm
1037 XORL SI, SI
1038 LEAL -1(SI)(R9*4), R9
1039 MOVB R9, (AX)
1040 MOVL BX, 1(AX)
1041 ADDQ $0x05, AX
1042 JMP match_nolit_emitcopy_end_encodeBlockAsm
1043
1044two_byte_offset_match_nolit_encodeBlockAsm:
1045 CMPL R9, $0x40
1046 JBE two_byte_offset_short_match_nolit_encodeBlockAsm
1047 CMPL BX, $0x00000800
1048 JAE long_offset_short_match_nolit_encodeBlockAsm
1049 MOVL $0x00000001, SI
1050 LEAL 16(SI), SI
1051 MOVB BL, 1(AX)
1052 MOVL BX, DI
1053 SHRL $0x08, DI
1054 SHLL $0x05, DI
1055 ORL DI, SI
1056 MOVB SI, (AX)
1057 ADDQ $0x02, AX
1058 SUBL $0x08, R9
1059
1060 // emitRepeat
1061 LEAL -4(R9), R9
1062 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1063
1064emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1065 MOVL R9, SI
1066 LEAL -4(R9), R9
1067 CMPL SI, $0x08
1068 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
1069 CMPL SI, $0x0c
1070 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1071 CMPL BX, $0x00000800
1072 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1073
1074cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1075 CMPL R9, $0x00000104
1076 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
1077 CMPL R9, $0x00010100
1078 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
1079 CMPL R9, $0x0100ffff
1080 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
1081 LEAL -16842747(R9), R9
1082 MOVL $0xfffb001d, (AX)
1083 MOVB $0xff, 4(AX)
1084 ADDQ $0x05, AX
1085 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
1086
1087repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1088 LEAL -65536(R9), R9
1089 MOVL R9, BX
1090 MOVW $0x001d, (AX)
1091 MOVW R9, 2(AX)
1092 SARL $0x10, BX
1093 MOVB BL, 4(AX)
1094 ADDQ $0x05, AX
1095 JMP match_nolit_emitcopy_end_encodeBlockAsm
1096
1097repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1098 LEAL -256(R9), R9
1099 MOVW $0x0019, (AX)
1100 MOVW R9, 2(AX)
1101 ADDQ $0x04, AX
1102 JMP match_nolit_emitcopy_end_encodeBlockAsm
1103
1104repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1105 LEAL -4(R9), R9
1106 MOVW $0x0015, (AX)
1107 MOVB R9, 2(AX)
1108 ADDQ $0x03, AX
1109 JMP match_nolit_emitcopy_end_encodeBlockAsm
1110
1111repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1112 SHLL $0x02, R9
1113 ORL $0x01, R9
1114 MOVW R9, (AX)
1115 ADDQ $0x02, AX
1116 JMP match_nolit_emitcopy_end_encodeBlockAsm
1117
1118repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1119 XORQ SI, SI
1120 LEAL 1(SI)(R9*4), R9
1121 MOVB BL, 1(AX)
1122 SARL $0x08, BX
1123 SHLL $0x05, BX
1124 ORL BX, R9
1125 MOVB R9, (AX)
1126 ADDQ $0x02, AX
1127 JMP match_nolit_emitcopy_end_encodeBlockAsm
1128
1129long_offset_short_match_nolit_encodeBlockAsm:
1130 MOVB $0xee, (AX)
1131 MOVW BX, 1(AX)
1132 LEAL -60(R9), R9
1133 ADDQ $0x03, AX
1134
1135 // emitRepeat
1136emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
1137 MOVL R9, SI
1138 LEAL -4(R9), R9
1139 CMPL SI, $0x08
1140 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
1141 CMPL SI, $0x0c
1142 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
1143 CMPL BX, $0x00000800
1144 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
1145
1146cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
1147 CMPL R9, $0x00000104
1148 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
1149 CMPL R9, $0x00010100
1150 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
1151 CMPL R9, $0x0100ffff
1152 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
1153 LEAL -16842747(R9), R9
1154 MOVL $0xfffb001d, (AX)
1155 MOVB $0xff, 4(AX)
1156 ADDQ $0x05, AX
1157 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
1158
1159repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
1160 LEAL -65536(R9), R9
1161 MOVL R9, BX
1162 MOVW $0x001d, (AX)
1163 MOVW R9, 2(AX)
1164 SARL $0x10, BX
1165 MOVB BL, 4(AX)
1166 ADDQ $0x05, AX
1167 JMP match_nolit_emitcopy_end_encodeBlockAsm
1168
1169repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
1170 LEAL -256(R9), R9
1171 MOVW $0x0019, (AX)
1172 MOVW R9, 2(AX)
1173 ADDQ $0x04, AX
1174 JMP match_nolit_emitcopy_end_encodeBlockAsm
1175
1176repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
1177 LEAL -4(R9), R9
1178 MOVW $0x0015, (AX)
1179 MOVB R9, 2(AX)
1180 ADDQ $0x03, AX
1181 JMP match_nolit_emitcopy_end_encodeBlockAsm
1182
1183repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
1184 SHLL $0x02, R9
1185 ORL $0x01, R9
1186 MOVW R9, (AX)
1187 ADDQ $0x02, AX
1188 JMP match_nolit_emitcopy_end_encodeBlockAsm
1189
1190repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
1191 XORQ SI, SI
1192 LEAL 1(SI)(R9*4), R9
1193 MOVB BL, 1(AX)
1194 SARL $0x08, BX
1195 SHLL $0x05, BX
1196 ORL BX, R9
1197 MOVB R9, (AX)
1198 ADDQ $0x02, AX
1199 JMP match_nolit_emitcopy_end_encodeBlockAsm
1200
1201two_byte_offset_short_match_nolit_encodeBlockAsm:
1202 MOVL R9, SI
1203 SHLL $0x02, SI
1204 CMPL R9, $0x0c
1205 JAE emit_copy_three_match_nolit_encodeBlockAsm
1206 CMPL BX, $0x00000800
1207 JAE emit_copy_three_match_nolit_encodeBlockAsm
1208 LEAL -15(SI), SI
1209 MOVB BL, 1(AX)
1210 SHRL $0x08, BX
1211 SHLL $0x05, BX
1212 ORL BX, SI
1213 MOVB SI, (AX)
1214 ADDQ $0x02, AX
1215 JMP match_nolit_emitcopy_end_encodeBlockAsm
1216
1217emit_copy_three_match_nolit_encodeBlockAsm:
1218 LEAL -2(SI), SI
1219 MOVB SI, (AX)
1220 MOVW BX, 1(AX)
1221 ADDQ $0x03, AX
1222
1223match_nolit_emitcopy_end_encodeBlockAsm:
1224 CMPL CX, 8(SP)
1225 JAE emit_remainder_encodeBlockAsm
1226 MOVQ -2(DX)(CX*1), SI
1227 CMPQ AX, (SP)
1228 JB match_nolit_dst_ok_encodeBlockAsm
1229 MOVQ $0x00000000, ret+48(FP)
1230 RET
1231
1232match_nolit_dst_ok_encodeBlockAsm:
1233 MOVQ $0x0000cf1bbcdcbf9b, R8
1234 MOVQ SI, DI
1235 SHRQ $0x10, SI
1236 MOVQ SI, BX
1237 SHLQ $0x10, DI
1238 IMULQ R8, DI
1239 SHRQ $0x32, DI
1240 SHLQ $0x10, BX
1241 IMULQ R8, BX
1242 SHRQ $0x32, BX
1243 LEAL -2(CX), R8
1244 LEAQ 24(SP)(BX*4), R9
1245 MOVL (R9), BX
1246 MOVL R8, 24(SP)(DI*4)
1247 MOVL CX, (R9)
1248 CMPL (DX)(BX*1), SI
1249 JEQ match_nolit_loop_encodeBlockAsm
1250 INCL CX
1251 JMP search_loop_encodeBlockAsm
1252
1253emit_remainder_encodeBlockAsm:
1254 MOVQ src_len+32(FP), CX
1255 SUBL 12(SP), CX
1256 LEAQ 5(AX)(CX*1), CX
1257 CMPQ CX, (SP)
1258 JB emit_remainder_ok_encodeBlockAsm
1259 MOVQ $0x00000000, ret+48(FP)
1260 RET
1261
1262emit_remainder_ok_encodeBlockAsm:
1263 MOVQ src_len+32(FP), CX
1264 MOVL 12(SP), BX
1265 CMPL BX, CX
1266 JEQ emit_literal_done_emit_remainder_encodeBlockAsm
1267 MOVL CX, SI
1268 MOVL CX, 12(SP)
1269 LEAQ (DX)(BX*1), CX
1270 SUBL BX, SI
1271 LEAL -1(SI), DX
1272 CMPL DX, $0x3c
1273 JB one_byte_emit_remainder_encodeBlockAsm
1274 CMPL DX, $0x00000100
1275 JB two_bytes_emit_remainder_encodeBlockAsm
1276 CMPL DX, $0x00010000
1277 JB three_bytes_emit_remainder_encodeBlockAsm
1278 CMPL DX, $0x01000000
1279 JB four_bytes_emit_remainder_encodeBlockAsm
1280 MOVB $0xfc, (AX)
1281 MOVL DX, 1(AX)
1282 ADDQ $0x05, AX
1283 JMP memmove_long_emit_remainder_encodeBlockAsm
1284
1285four_bytes_emit_remainder_encodeBlockAsm:
1286 MOVL DX, BX
1287 SHRL $0x10, BX
1288 MOVB $0xf8, (AX)
1289 MOVW DX, 1(AX)
1290 MOVB BL, 3(AX)
1291 ADDQ $0x04, AX
1292 JMP memmove_long_emit_remainder_encodeBlockAsm
1293
1294three_bytes_emit_remainder_encodeBlockAsm:
1295 MOVB $0xf4, (AX)
1296 MOVW DX, 1(AX)
1297 ADDQ $0x03, AX
1298 JMP memmove_long_emit_remainder_encodeBlockAsm
1299
1300two_bytes_emit_remainder_encodeBlockAsm:
1301 MOVB $0xf0, (AX)
1302 MOVB DL, 1(AX)
1303 ADDQ $0x02, AX
1304 CMPL DX, $0x40
1305 JB memmove_emit_remainder_encodeBlockAsm
1306 JMP memmove_long_emit_remainder_encodeBlockAsm
1307
1308one_byte_emit_remainder_encodeBlockAsm:
1309 SHLB $0x02, DL
1310 MOVB DL, (AX)
1311 ADDQ $0x01, AX
1312
1313memmove_emit_remainder_encodeBlockAsm:
1314 LEAQ (AX)(SI*1), DX
1315 MOVL SI, BX
1316
1317 // genMemMoveShort
1318 CMPQ BX, $0x03
1319 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
1320 JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
1321 CMPQ BX, $0x08
1322 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
1323 CMPQ BX, $0x10
1324 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
1325 CMPQ BX, $0x20
1326 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
1327 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
1328
1329emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
1330 MOVB (CX), SI
1331 MOVB -1(CX)(BX*1), CL
1332 MOVB SI, (AX)
1333 MOVB CL, -1(AX)(BX*1)
1334 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1335
1336emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
1337 MOVW (CX), SI
1338 MOVB 2(CX), CL
1339 MOVW SI, (AX)
1340 MOVB CL, 2(AX)
1341 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1342
1343emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
1344 MOVL (CX), SI
1345 MOVL -4(CX)(BX*1), CX
1346 MOVL SI, (AX)
1347 MOVL CX, -4(AX)(BX*1)
1348 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1349
1350emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
1351 MOVQ (CX), SI
1352 MOVQ -8(CX)(BX*1), CX
1353 MOVQ SI, (AX)
1354 MOVQ CX, -8(AX)(BX*1)
1355 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1356
1357emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
1358 MOVOU (CX), X0
1359 MOVOU -16(CX)(BX*1), X1
1360 MOVOU X0, (AX)
1361 MOVOU X1, -16(AX)(BX*1)
1362 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1363
1364emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
1365 MOVOU (CX), X0
1366 MOVOU 16(CX), X1
1367 MOVOU -32(CX)(BX*1), X2
1368 MOVOU -16(CX)(BX*1), X3
1369 MOVOU X0, (AX)
1370 MOVOU X1, 16(AX)
1371 MOVOU X2, -32(AX)(BX*1)
1372 MOVOU X3, -16(AX)(BX*1)
1373
1374memmove_end_copy_emit_remainder_encodeBlockAsm:
1375 MOVQ DX, AX
1376 JMP emit_literal_done_emit_remainder_encodeBlockAsm
1377
1378memmove_long_emit_remainder_encodeBlockAsm:
1379 LEAQ (AX)(SI*1), DX
1380 MOVL SI, BX
1381
1382 // genMemMoveLong
1383 MOVOU (CX), X0
1384 MOVOU 16(CX), X1
1385 MOVOU -32(CX)(BX*1), X2
1386 MOVOU -16(CX)(BX*1), X3
1387 MOVQ BX, DI
1388 SHRQ $0x05, DI
1389 MOVQ AX, SI
1390 ANDL $0x0000001f, SI
1391 MOVQ $0x00000040, R8
1392 SUBQ SI, R8
1393 DECQ DI
1394 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1395 LEAQ -32(CX)(R8*1), SI
1396 LEAQ -32(AX)(R8*1), R9
1397
1398emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
1399 MOVOU (SI), X4
1400 MOVOU 16(SI), X5
1401 MOVOA X4, (R9)
1402 MOVOA X5, 16(R9)
1403 ADDQ $0x20, R9
1404 ADDQ $0x20, SI
1405 ADDQ $0x20, R8
1406 DECQ DI
1407 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
1408
1409emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
1410 MOVOU -32(CX)(R8*1), X4
1411 MOVOU -16(CX)(R8*1), X5
1412 MOVOA X4, -32(AX)(R8*1)
1413 MOVOA X5, -16(AX)(R8*1)
1414 ADDQ $0x20, R8
1415 CMPQ BX, R8
1416 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1417 MOVOU X0, (AX)
1418 MOVOU X1, 16(AX)
1419 MOVOU X2, -32(AX)(BX*1)
1420 MOVOU X3, -16(AX)(BX*1)
1421 MOVQ DX, AX
1422
1423emit_literal_done_emit_remainder_encodeBlockAsm:
1424 MOVQ dst_base+0(FP), CX
1425 SUBQ CX, AX
1426 MOVQ AX, ret+48(FP)
1427 RET
1428
1429// func encodeBlockAsm4MB(dst []byte, src []byte) int
1430// Requires: BMI, SSE2
1431TEXT ·encodeBlockAsm4MB(SB), $65560-56
1432 MOVQ dst_base+0(FP), AX
1433 MOVQ $0x00000200, CX
1434 LEAQ 24(SP), DX
1435 PXOR X0, X0
1436
1437zero_loop_encodeBlockAsm4MB:
1438 MOVOU X0, (DX)
1439 MOVOU X0, 16(DX)
1440 MOVOU X0, 32(DX)
1441 MOVOU X0, 48(DX)
1442 MOVOU X0, 64(DX)
1443 MOVOU X0, 80(DX)
1444 MOVOU X0, 96(DX)
1445 MOVOU X0, 112(DX)
1446 ADDQ $0x80, DX
1447 DECQ CX
1448 JNZ zero_loop_encodeBlockAsm4MB
1449 MOVL $0x00000000, 12(SP)
1450 MOVQ src_len+32(FP), CX
1451 LEAQ -9(CX), DX
1452 LEAQ -8(CX), BX
1453 MOVL BX, 8(SP)
1454 SHRQ $0x05, CX
1455 SUBL CX, DX
1456 LEAQ (AX)(DX*1), DX
1457 MOVQ DX, (SP)
1458 MOVL $0x00000001, CX
1459 MOVL CX, 16(SP)
1460 MOVQ src_base+24(FP), DX
1461
1462search_loop_encodeBlockAsm4MB:
1463 MOVL CX, BX
1464 SUBL 12(SP), BX
1465 SHRL $0x06, BX
1466 LEAL 4(CX)(BX*1), BX
1467 CMPL BX, 8(SP)
1468 JAE emit_remainder_encodeBlockAsm4MB
1469 MOVQ (DX)(CX*1), SI
1470 MOVL BX, 20(SP)
1471 MOVQ $0x0000cf1bbcdcbf9b, R8
1472 MOVQ SI, R9
1473 MOVQ SI, R10
1474 SHRQ $0x08, R10
1475 SHLQ $0x10, R9
1476 IMULQ R8, R9
1477 SHRQ $0x32, R9
1478 SHLQ $0x10, R10
1479 IMULQ R8, R10
1480 SHRQ $0x32, R10
1481 MOVL 24(SP)(R9*4), BX
1482 MOVL 24(SP)(R10*4), DI
1483 MOVL CX, 24(SP)(R9*4)
1484 LEAL 1(CX), R9
1485 MOVL R9, 24(SP)(R10*4)
1486 MOVQ SI, R9
1487 SHRQ $0x10, R9
1488 SHLQ $0x10, R9
1489 IMULQ R8, R9
1490 SHRQ $0x32, R9
1491 MOVL CX, R8
1492 SUBL 16(SP), R8
1493 MOVL 1(DX)(R8*1), R10
1494 MOVQ SI, R8
1495 SHRQ $0x08, R8
1496 CMPL R8, R10
1497 JNE no_repeat_found_encodeBlockAsm4MB
1498 LEAL 1(CX), SI
1499 MOVL 12(SP), DI
1500 MOVL SI, BX
1501 SUBL 16(SP), BX
1502 JZ repeat_extend_back_end_encodeBlockAsm4MB
1503
1504repeat_extend_back_loop_encodeBlockAsm4MB:
1505 CMPL SI, DI
1506 JBE repeat_extend_back_end_encodeBlockAsm4MB
1507 MOVB -1(DX)(BX*1), R8
1508 MOVB -1(DX)(SI*1), R9
1509 CMPB R8, R9
1510 JNE repeat_extend_back_end_encodeBlockAsm4MB
1511 LEAL -1(SI), SI
1512 DECL BX
1513 JNZ repeat_extend_back_loop_encodeBlockAsm4MB
1514
1515repeat_extend_back_end_encodeBlockAsm4MB:
1516 MOVL 12(SP), BX
1517 CMPL BX, SI
1518 JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
1519 MOVL SI, R8
1520 MOVL SI, 12(SP)
1521 LEAQ (DX)(BX*1), R9
1522 SUBL BX, R8
1523 LEAL -1(R8), BX
1524 CMPL BX, $0x3c
1525 JB one_byte_repeat_emit_encodeBlockAsm4MB
1526 CMPL BX, $0x00000100
1527 JB two_bytes_repeat_emit_encodeBlockAsm4MB
1528 CMPL BX, $0x00010000
1529 JB three_bytes_repeat_emit_encodeBlockAsm4MB
1530 MOVL BX, R10
1531 SHRL $0x10, R10
1532 MOVB $0xf8, (AX)
1533 MOVW BX, 1(AX)
1534 MOVB R10, 3(AX)
1535 ADDQ $0x04, AX
1536 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1537
1538three_bytes_repeat_emit_encodeBlockAsm4MB:
1539 MOVB $0xf4, (AX)
1540 MOVW BX, 1(AX)
1541 ADDQ $0x03, AX
1542 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1543
1544two_bytes_repeat_emit_encodeBlockAsm4MB:
1545 MOVB $0xf0, (AX)
1546 MOVB BL, 1(AX)
1547 ADDQ $0x02, AX
1548 CMPL BX, $0x40
1549 JB memmove_repeat_emit_encodeBlockAsm4MB
1550 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1551
1552one_byte_repeat_emit_encodeBlockAsm4MB:
1553 SHLB $0x02, BL
1554 MOVB BL, (AX)
1555 ADDQ $0x01, AX
1556
1557memmove_repeat_emit_encodeBlockAsm4MB:
1558 LEAQ (AX)(R8*1), BX
1559
1560 // genMemMoveShort
1561 CMPQ R8, $0x08
1562 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
1563 CMPQ R8, $0x10
1564 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
1565 CMPQ R8, $0x20
1566 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
1567 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
1568
1569emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
1570 MOVQ (R9), R10
1571 MOVQ R10, (AX)
1572 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1573
1574emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
1575 MOVQ (R9), R10
1576 MOVQ -8(R9)(R8*1), R9
1577 MOVQ R10, (AX)
1578 MOVQ R9, -8(AX)(R8*1)
1579 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1580
1581emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
1582 MOVOU (R9), X0
1583 MOVOU -16(R9)(R8*1), X1
1584 MOVOU X0, (AX)
1585 MOVOU X1, -16(AX)(R8*1)
1586 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1587
1588emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
1589 MOVOU (R9), X0
1590 MOVOU 16(R9), X1
1591 MOVOU -32(R9)(R8*1), X2
1592 MOVOU -16(R9)(R8*1), X3
1593 MOVOU X0, (AX)
1594 MOVOU X1, 16(AX)
1595 MOVOU X2, -32(AX)(R8*1)
1596 MOVOU X3, -16(AX)(R8*1)
1597
1598memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
1599 MOVQ BX, AX
1600 JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
1601
1602memmove_long_repeat_emit_encodeBlockAsm4MB:
1603 LEAQ (AX)(R8*1), BX
1604
1605 // genMemMoveLong
1606 MOVOU (R9), X0
1607 MOVOU 16(R9), X1
1608 MOVOU -32(R9)(R8*1), X2
1609 MOVOU -16(R9)(R8*1), X3
1610 MOVQ R8, R11
1611 SHRQ $0x05, R11
1612 MOVQ AX, R10
1613 ANDL $0x0000001f, R10
1614 MOVQ $0x00000040, R12
1615 SUBQ R10, R12
1616 DECQ R11
1617 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1618 LEAQ -32(R9)(R12*1), R10
1619 LEAQ -32(AX)(R12*1), R13
1620
1621emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
1622 MOVOU (R10), X4
1623 MOVOU 16(R10), X5
1624 MOVOA X4, (R13)
1625 MOVOA X5, 16(R13)
1626 ADDQ $0x20, R13
1627 ADDQ $0x20, R10
1628 ADDQ $0x20, R12
1629 DECQ R11
1630 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
1631
1632emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
1633 MOVOU -32(R9)(R12*1), X4
1634 MOVOU -16(R9)(R12*1), X5
1635 MOVOA X4, -32(AX)(R12*1)
1636 MOVOA X5, -16(AX)(R12*1)
1637 ADDQ $0x20, R12
1638 CMPQ R8, R12
1639 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1640 MOVOU X0, (AX)
1641 MOVOU X1, 16(AX)
1642 MOVOU X2, -32(AX)(R8*1)
1643 MOVOU X3, -16(AX)(R8*1)
1644 MOVQ BX, AX
1645
1646emit_literal_done_repeat_emit_encodeBlockAsm4MB:
1647 ADDL $0x05, CX
1648 MOVL CX, BX
1649 SUBL 16(SP), BX
1650 MOVQ src_len+32(FP), R8
1651 SUBL CX, R8
1652 LEAQ (DX)(CX*1), R9
1653 LEAQ (DX)(BX*1), BX
1654
1655 // matchLen
1656 XORL R11, R11
1657
1658matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB:
1659 CMPL R8, $0x10
1660 JB matchlen_match8_repeat_extend_encodeBlockAsm4MB
1661 MOVQ (R9)(R11*1), R10
1662 MOVQ 8(R9)(R11*1), R12
1663 XORQ (BX)(R11*1), R10
1664 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
1665 XORQ 8(BX)(R11*1), R12
1666 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB
1667 LEAL -16(R8), R8
1668 LEAL 16(R11), R11
1669 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB
1670
1671matchlen_bsf_16repeat_extend_encodeBlockAsm4MB:
1672#ifdef GOAMD64_v3
1673 TZCNTQ R12, R12
1674
1675#else
1676 BSFQ R12, R12
1677
1678#endif
1679 SARQ $0x03, R12
1680 LEAL 8(R11)(R12*1), R11
1681 JMP repeat_extend_forward_end_encodeBlockAsm4MB
1682
1683matchlen_match8_repeat_extend_encodeBlockAsm4MB:
1684 CMPL R8, $0x08
1685 JB matchlen_match4_repeat_extend_encodeBlockAsm4MB
1686 MOVQ (R9)(R11*1), R10
1687 XORQ (BX)(R11*1), R10
1688 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
1689 LEAL -8(R8), R8
1690 LEAL 8(R11), R11
1691 JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB
1692
1693matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB:
1694#ifdef GOAMD64_v3
1695 TZCNTQ R10, R10
1696
1697#else
1698 BSFQ R10, R10
1699
1700#endif
1701 SARQ $0x03, R10
1702 LEAL (R11)(R10*1), R11
1703 JMP repeat_extend_forward_end_encodeBlockAsm4MB
1704
1705matchlen_match4_repeat_extend_encodeBlockAsm4MB:
1706 CMPL R8, $0x04
1707 JB matchlen_match2_repeat_extend_encodeBlockAsm4MB
1708 MOVL (R9)(R11*1), R10
1709 CMPL (BX)(R11*1), R10
1710 JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
1711 LEAL -4(R8), R8
1712 LEAL 4(R11), R11
1713
1714matchlen_match2_repeat_extend_encodeBlockAsm4MB:
1715 CMPL R8, $0x01
1716 JE matchlen_match1_repeat_extend_encodeBlockAsm4MB
1717 JB repeat_extend_forward_end_encodeBlockAsm4MB
1718 MOVW (R9)(R11*1), R10
1719 CMPW (BX)(R11*1), R10
1720 JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
1721 LEAL 2(R11), R11
1722 SUBL $0x02, R8
1723 JZ repeat_extend_forward_end_encodeBlockAsm4MB
1724
1725matchlen_match1_repeat_extend_encodeBlockAsm4MB:
1726 MOVB (R9)(R11*1), R10
1727 CMPB (BX)(R11*1), R10
1728 JNE repeat_extend_forward_end_encodeBlockAsm4MB
1729 LEAL 1(R11), R11
1730
1731repeat_extend_forward_end_encodeBlockAsm4MB:
1732 ADDL R11, CX
1733 MOVL CX, BX
1734 SUBL SI, BX
1735 MOVL 16(SP), SI
1736 TESTL DI, DI
1737 JZ repeat_as_copy_encodeBlockAsm4MB
1738
1739 // emitRepeat
1740 MOVL BX, DI
1741 LEAL -4(BX), BX
1742 CMPL DI, $0x08
1743 JBE repeat_two_match_repeat_encodeBlockAsm4MB
1744 CMPL DI, $0x0c
1745 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
1746 CMPL SI, $0x00000800
1747 JB repeat_two_offset_match_repeat_encodeBlockAsm4MB
1748
1749cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1750 CMPL BX, $0x00000104
1751 JB repeat_three_match_repeat_encodeBlockAsm4MB
1752 CMPL BX, $0x00010100
1753 JB repeat_four_match_repeat_encodeBlockAsm4MB
1754 LEAL -65536(BX), BX
1755 MOVL BX, SI
1756 MOVW $0x001d, (AX)
1757 MOVW BX, 2(AX)
1758 SARL $0x10, SI
1759 MOVB SI, 4(AX)
1760 ADDQ $0x05, AX
1761 JMP repeat_end_emit_encodeBlockAsm4MB
1762
1763repeat_four_match_repeat_encodeBlockAsm4MB:
1764 LEAL -256(BX), BX
1765 MOVW $0x0019, (AX)
1766 MOVW BX, 2(AX)
1767 ADDQ $0x04, AX
1768 JMP repeat_end_emit_encodeBlockAsm4MB
1769
1770repeat_three_match_repeat_encodeBlockAsm4MB:
1771 LEAL -4(BX), BX
1772 MOVW $0x0015, (AX)
1773 MOVB BL, 2(AX)
1774 ADDQ $0x03, AX
1775 JMP repeat_end_emit_encodeBlockAsm4MB
1776
1777repeat_two_match_repeat_encodeBlockAsm4MB:
1778 SHLL $0x02, BX
1779 ORL $0x01, BX
1780 MOVW BX, (AX)
1781 ADDQ $0x02, AX
1782 JMP repeat_end_emit_encodeBlockAsm4MB
1783
1784repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1785 XORQ DI, DI
1786 LEAL 1(DI)(BX*4), BX
1787 MOVB SI, 1(AX)
1788 SARL $0x08, SI
1789 SHLL $0x05, SI
1790 ORL SI, BX
1791 MOVB BL, (AX)
1792 ADDQ $0x02, AX
1793 JMP repeat_end_emit_encodeBlockAsm4MB
1794
1795repeat_as_copy_encodeBlockAsm4MB:
1796 // emitCopy
1797 CMPL SI, $0x00010000
1798 JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
1799 CMPL BX, $0x40
1800 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1801 MOVB $0xff, (AX)
1802 MOVL SI, 1(AX)
1803 LEAL -64(BX), BX
1804 ADDQ $0x05, AX
1805 CMPL BX, $0x04
1806 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1807
1808 // emitRepeat
1809 MOVL BX, DI
1810 LEAL -4(BX), BX
1811 CMPL DI, $0x08
1812 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1813 CMPL DI, $0x0c
1814 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1815 CMPL SI, $0x00000800
1816 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1817
1818cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1819 CMPL BX, $0x00000104
1820 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1821 CMPL BX, $0x00010100
1822 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1823 LEAL -65536(BX), BX
1824 MOVL BX, SI
1825 MOVW $0x001d, (AX)
1826 MOVW BX, 2(AX)
1827 SARL $0x10, SI
1828 MOVB SI, 4(AX)
1829 ADDQ $0x05, AX
1830 JMP repeat_end_emit_encodeBlockAsm4MB
1831
1832repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1833 LEAL -256(BX), BX
1834 MOVW $0x0019, (AX)
1835 MOVW BX, 2(AX)
1836 ADDQ $0x04, AX
1837 JMP repeat_end_emit_encodeBlockAsm4MB
1838
1839repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1840 LEAL -4(BX), BX
1841 MOVW $0x0015, (AX)
1842 MOVB BL, 2(AX)
1843 ADDQ $0x03, AX
1844 JMP repeat_end_emit_encodeBlockAsm4MB
1845
1846repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1847 SHLL $0x02, BX
1848 ORL $0x01, BX
1849 MOVW BX, (AX)
1850 ADDQ $0x02, AX
1851 JMP repeat_end_emit_encodeBlockAsm4MB
1852
1853repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1854 XORQ DI, DI
1855 LEAL 1(DI)(BX*4), BX
1856 MOVB SI, 1(AX)
1857 SARL $0x08, SI
1858 SHLL $0x05, SI
1859 ORL SI, BX
1860 MOVB BL, (AX)
1861 ADDQ $0x02, AX
1862 JMP repeat_end_emit_encodeBlockAsm4MB
1863
1864four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
1865 TESTL BX, BX
1866 JZ repeat_end_emit_encodeBlockAsm4MB
1867 XORL DI, DI
1868 LEAL -1(DI)(BX*4), BX
1869 MOVB BL, (AX)
1870 MOVL SI, 1(AX)
1871 ADDQ $0x05, AX
1872 JMP repeat_end_emit_encodeBlockAsm4MB
1873
1874two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
1875 CMPL BX, $0x40
1876 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
1877 CMPL SI, $0x00000800
1878 JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
1879 MOVL $0x00000001, DI
1880 LEAL 16(DI), DI
1881 MOVB SI, 1(AX)
1882 SHRL $0x08, SI
1883 SHLL $0x05, SI
1884 ORL SI, DI
1885 MOVB DI, (AX)
1886 ADDQ $0x02, AX
1887 SUBL $0x08, BX
1888
1889 // emitRepeat
1890 LEAL -4(BX), BX
1891 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1892 MOVL BX, DI
1893 LEAL -4(BX), BX
1894 CMPL DI, $0x08
1895 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1896 CMPL DI, $0x0c
1897 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1898 CMPL SI, $0x00000800
1899 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1900
1901cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1902 CMPL BX, $0x00000104
1903 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1904 CMPL BX, $0x00010100
1905 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1906 LEAL -65536(BX), BX
1907 MOVL BX, SI
1908 MOVW $0x001d, (AX)
1909 MOVW BX, 2(AX)
1910 SARL $0x10, SI
1911 MOVB SI, 4(AX)
1912 ADDQ $0x05, AX
1913 JMP repeat_end_emit_encodeBlockAsm4MB
1914
1915repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1916 LEAL -256(BX), BX
1917 MOVW $0x0019, (AX)
1918 MOVW BX, 2(AX)
1919 ADDQ $0x04, AX
1920 JMP repeat_end_emit_encodeBlockAsm4MB
1921
1922repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1923 LEAL -4(BX), BX
1924 MOVW $0x0015, (AX)
1925 MOVB BL, 2(AX)
1926 ADDQ $0x03, AX
1927 JMP repeat_end_emit_encodeBlockAsm4MB
1928
1929repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1930 SHLL $0x02, BX
1931 ORL $0x01, BX
1932 MOVW BX, (AX)
1933 ADDQ $0x02, AX
1934 JMP repeat_end_emit_encodeBlockAsm4MB
1935
1936repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1937 XORQ DI, DI
1938 LEAL 1(DI)(BX*4), BX
1939 MOVB SI, 1(AX)
1940 SARL $0x08, SI
1941 SHLL $0x05, SI
1942 ORL SI, BX
1943 MOVB BL, (AX)
1944 ADDQ $0x02, AX
1945 JMP repeat_end_emit_encodeBlockAsm4MB
1946
1947long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
1948 MOVB $0xee, (AX)
1949 MOVW SI, 1(AX)
1950 LEAL -60(BX), BX
1951 ADDQ $0x03, AX
1952
1953 // emitRepeat
1954 MOVL BX, DI
1955 LEAL -4(BX), BX
1956 CMPL DI, $0x08
1957 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1958 CMPL DI, $0x0c
1959 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1960 CMPL SI, $0x00000800
1961 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1962
1963cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1964 CMPL BX, $0x00000104
1965 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1966 CMPL BX, $0x00010100
1967 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1968 LEAL -65536(BX), BX
1969 MOVL BX, SI
1970 MOVW $0x001d, (AX)
1971 MOVW BX, 2(AX)
1972 SARL $0x10, SI
1973 MOVB SI, 4(AX)
1974 ADDQ $0x05, AX
1975 JMP repeat_end_emit_encodeBlockAsm4MB
1976
1977repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1978 LEAL -256(BX), BX
1979 MOVW $0x0019, (AX)
1980 MOVW BX, 2(AX)
1981 ADDQ $0x04, AX
1982 JMP repeat_end_emit_encodeBlockAsm4MB
1983
1984repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1985 LEAL -4(BX), BX
1986 MOVW $0x0015, (AX)
1987 MOVB BL, 2(AX)
1988 ADDQ $0x03, AX
1989 JMP repeat_end_emit_encodeBlockAsm4MB
1990
1991repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1992 SHLL $0x02, BX
1993 ORL $0x01, BX
1994 MOVW BX, (AX)
1995 ADDQ $0x02, AX
1996 JMP repeat_end_emit_encodeBlockAsm4MB
1997
1998repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1999 XORQ DI, DI
2000 LEAL 1(DI)(BX*4), BX
2001 MOVB SI, 1(AX)
2002 SARL $0x08, SI
2003 SHLL $0x05, SI
2004 ORL SI, BX
2005 MOVB BL, (AX)
2006 ADDQ $0x02, AX
2007 JMP repeat_end_emit_encodeBlockAsm4MB
2008
2009two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
2010 MOVL BX, DI
2011 SHLL $0x02, DI
2012 CMPL BX, $0x0c
2013 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
2014 CMPL SI, $0x00000800
2015 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
2016 LEAL -15(DI), DI
2017 MOVB SI, 1(AX)
2018 SHRL $0x08, SI
2019 SHLL $0x05, SI
2020 ORL SI, DI
2021 MOVB DI, (AX)
2022 ADDQ $0x02, AX
2023 JMP repeat_end_emit_encodeBlockAsm4MB
2024
2025emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
2026 LEAL -2(DI), DI
2027 MOVB DI, (AX)
2028 MOVW SI, 1(AX)
2029 ADDQ $0x03, AX
2030
2031repeat_end_emit_encodeBlockAsm4MB:
2032 MOVL CX, 12(SP)
2033 JMP search_loop_encodeBlockAsm4MB
2034
2035no_repeat_found_encodeBlockAsm4MB:
2036 CMPL (DX)(BX*1), SI
2037 JEQ candidate_match_encodeBlockAsm4MB
2038 SHRQ $0x08, SI
2039 MOVL 24(SP)(R9*4), BX
2040 LEAL 2(CX), R8
2041 CMPL (DX)(DI*1), SI
2042 JEQ candidate2_match_encodeBlockAsm4MB
2043 MOVL R8, 24(SP)(R9*4)
2044 SHRQ $0x08, SI
2045 CMPL (DX)(BX*1), SI
2046 JEQ candidate3_match_encodeBlockAsm4MB
2047 MOVL 20(SP), CX
2048 JMP search_loop_encodeBlockAsm4MB
2049
2050candidate3_match_encodeBlockAsm4MB:
2051 ADDL $0x02, CX
2052 JMP candidate_match_encodeBlockAsm4MB
2053
2054candidate2_match_encodeBlockAsm4MB:
2055 MOVL R8, 24(SP)(R9*4)
2056 INCL CX
2057 MOVL DI, BX
2058
2059candidate_match_encodeBlockAsm4MB:
2060 MOVL 12(SP), SI
2061 TESTL BX, BX
2062 JZ match_extend_back_end_encodeBlockAsm4MB
2063
2064match_extend_back_loop_encodeBlockAsm4MB:
2065 CMPL CX, SI
2066 JBE match_extend_back_end_encodeBlockAsm4MB
2067 MOVB -1(DX)(BX*1), DI
2068 MOVB -1(DX)(CX*1), R8
2069 CMPB DI, R8
2070 JNE match_extend_back_end_encodeBlockAsm4MB
2071 LEAL -1(CX), CX
2072 DECL BX
2073 JZ match_extend_back_end_encodeBlockAsm4MB
2074 JMP match_extend_back_loop_encodeBlockAsm4MB
2075
2076match_extend_back_end_encodeBlockAsm4MB:
2077 MOVL CX, SI
2078 SUBL 12(SP), SI
2079 LEAQ 4(AX)(SI*1), SI
2080 CMPQ SI, (SP)
2081 JB match_dst_size_check_encodeBlockAsm4MB
2082 MOVQ $0x00000000, ret+48(FP)
2083 RET
2084
2085match_dst_size_check_encodeBlockAsm4MB:
2086 MOVL CX, SI
2087 MOVL 12(SP), DI
2088 CMPL DI, SI
2089 JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
2090 MOVL SI, R8
2091 MOVL SI, 12(SP)
2092 LEAQ (DX)(DI*1), SI
2093 SUBL DI, R8
2094 LEAL -1(R8), DI
2095 CMPL DI, $0x3c
2096 JB one_byte_match_emit_encodeBlockAsm4MB
2097 CMPL DI, $0x00000100
2098 JB two_bytes_match_emit_encodeBlockAsm4MB
2099 CMPL DI, $0x00010000
2100 JB three_bytes_match_emit_encodeBlockAsm4MB
2101 MOVL DI, R9
2102 SHRL $0x10, R9
2103 MOVB $0xf8, (AX)
2104 MOVW DI, 1(AX)
2105 MOVB R9, 3(AX)
2106 ADDQ $0x04, AX
2107 JMP memmove_long_match_emit_encodeBlockAsm4MB
2108
2109three_bytes_match_emit_encodeBlockAsm4MB:
2110 MOVB $0xf4, (AX)
2111 MOVW DI, 1(AX)
2112 ADDQ $0x03, AX
2113 JMP memmove_long_match_emit_encodeBlockAsm4MB
2114
2115two_bytes_match_emit_encodeBlockAsm4MB:
2116 MOVB $0xf0, (AX)
2117 MOVB DI, 1(AX)
2118 ADDQ $0x02, AX
2119 CMPL DI, $0x40
2120 JB memmove_match_emit_encodeBlockAsm4MB
2121 JMP memmove_long_match_emit_encodeBlockAsm4MB
2122
2123one_byte_match_emit_encodeBlockAsm4MB:
2124 SHLB $0x02, DI
2125 MOVB DI, (AX)
2126 ADDQ $0x01, AX
2127
2128memmove_match_emit_encodeBlockAsm4MB:
2129 LEAQ (AX)(R8*1), DI
2130
2131 // genMemMoveShort
2132 CMPQ R8, $0x08
2133 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
2134 CMPQ R8, $0x10
2135 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
2136 CMPQ R8, $0x20
2137 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
2138 JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
2139
2140emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
2141 MOVQ (SI), R9
2142 MOVQ R9, (AX)
2143 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2144
2145emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
2146 MOVQ (SI), R9
2147 MOVQ -8(SI)(R8*1), SI
2148 MOVQ R9, (AX)
2149 MOVQ SI, -8(AX)(R8*1)
2150 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2151
2152emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
2153 MOVOU (SI), X0
2154 MOVOU -16(SI)(R8*1), X1
2155 MOVOU X0, (AX)
2156 MOVOU X1, -16(AX)(R8*1)
2157 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2158
2159emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
2160 MOVOU (SI), X0
2161 MOVOU 16(SI), X1
2162 MOVOU -32(SI)(R8*1), X2
2163 MOVOU -16(SI)(R8*1), X3
2164 MOVOU X0, (AX)
2165 MOVOU X1, 16(AX)
2166 MOVOU X2, -32(AX)(R8*1)
2167 MOVOU X3, -16(AX)(R8*1)
2168
2169memmove_end_copy_match_emit_encodeBlockAsm4MB:
2170 MOVQ DI, AX
2171 JMP emit_literal_done_match_emit_encodeBlockAsm4MB
2172
2173memmove_long_match_emit_encodeBlockAsm4MB:
2174 LEAQ (AX)(R8*1), DI
2175
2176 // genMemMoveLong
2177 MOVOU (SI), X0
2178 MOVOU 16(SI), X1
2179 MOVOU -32(SI)(R8*1), X2
2180 MOVOU -16(SI)(R8*1), X3
2181 MOVQ R8, R10
2182 SHRQ $0x05, R10
2183 MOVQ AX, R9
2184 ANDL $0x0000001f, R9
2185 MOVQ $0x00000040, R11
2186 SUBQ R9, R11
2187 DECQ R10
2188 JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
2189 LEAQ -32(SI)(R11*1), R9
2190 LEAQ -32(AX)(R11*1), R12
2191
2192emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
2193 MOVOU (R9), X4
2194 MOVOU 16(R9), X5
2195 MOVOA X4, (R12)
2196 MOVOA X5, 16(R12)
2197 ADDQ $0x20, R12
2198 ADDQ $0x20, R9
2199 ADDQ $0x20, R11
2200 DECQ R10
2201 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
2202
2203emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
2204 MOVOU -32(SI)(R11*1), X4
2205 MOVOU -16(SI)(R11*1), X5
2206 MOVOA X4, -32(AX)(R11*1)
2207 MOVOA X5, -16(AX)(R11*1)
2208 ADDQ $0x20, R11
2209 CMPQ R8, R11
2210 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
2211 MOVOU X0, (AX)
2212 MOVOU X1, 16(AX)
2213 MOVOU X2, -32(AX)(R8*1)
2214 MOVOU X3, -16(AX)(R8*1)
2215 MOVQ DI, AX
2216
2217emit_literal_done_match_emit_encodeBlockAsm4MB:
2218match_nolit_loop_encodeBlockAsm4MB:
2219 MOVL CX, SI
2220 SUBL BX, SI
2221 MOVL SI, 16(SP)
2222 ADDL $0x04, CX
2223 ADDL $0x04, BX
2224 MOVQ src_len+32(FP), SI
2225 SUBL CX, SI
2226 LEAQ (DX)(CX*1), DI
2227 LEAQ (DX)(BX*1), BX
2228
2229 // matchLen
2230 XORL R9, R9
2231
2232matchlen_loopback_16_match_nolit_encodeBlockAsm4MB:
2233 CMPL SI, $0x10
2234 JB matchlen_match8_match_nolit_encodeBlockAsm4MB
2235 MOVQ (DI)(R9*1), R8
2236 MOVQ 8(DI)(R9*1), R10
2237 XORQ (BX)(R9*1), R8
2238 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
2239 XORQ 8(BX)(R9*1), R10
2240 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB
2241 LEAL -16(SI), SI
2242 LEAL 16(R9), R9
2243 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB
2244
2245matchlen_bsf_16match_nolit_encodeBlockAsm4MB:
2246#ifdef GOAMD64_v3
2247 TZCNTQ R10, R10
2248
2249#else
2250 BSFQ R10, R10
2251
2252#endif
2253 SARQ $0x03, R10
2254 LEAL 8(R9)(R10*1), R9
2255 JMP match_nolit_end_encodeBlockAsm4MB
2256
2257matchlen_match8_match_nolit_encodeBlockAsm4MB:
2258 CMPL SI, $0x08
2259 JB matchlen_match4_match_nolit_encodeBlockAsm4MB
2260 MOVQ (DI)(R9*1), R8
2261 XORQ (BX)(R9*1), R8
2262 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
2263 LEAL -8(SI), SI
2264 LEAL 8(R9), R9
2265 JMP matchlen_match4_match_nolit_encodeBlockAsm4MB
2266
2267matchlen_bsf_8_match_nolit_encodeBlockAsm4MB:
2268#ifdef GOAMD64_v3
2269 TZCNTQ R8, R8
2270
2271#else
2272 BSFQ R8, R8
2273
2274#endif
2275 SARQ $0x03, R8
2276 LEAL (R9)(R8*1), R9
2277 JMP match_nolit_end_encodeBlockAsm4MB
2278
2279matchlen_match4_match_nolit_encodeBlockAsm4MB:
2280 CMPL SI, $0x04
2281 JB matchlen_match2_match_nolit_encodeBlockAsm4MB
2282 MOVL (DI)(R9*1), R8
2283 CMPL (BX)(R9*1), R8
2284 JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
2285 LEAL -4(SI), SI
2286 LEAL 4(R9), R9
2287
2288matchlen_match2_match_nolit_encodeBlockAsm4MB:
2289 CMPL SI, $0x01
2290 JE matchlen_match1_match_nolit_encodeBlockAsm4MB
2291 JB match_nolit_end_encodeBlockAsm4MB
2292 MOVW (DI)(R9*1), R8
2293 CMPW (BX)(R9*1), R8
2294 JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
2295 LEAL 2(R9), R9
2296 SUBL $0x02, SI
2297 JZ match_nolit_end_encodeBlockAsm4MB
2298
2299matchlen_match1_match_nolit_encodeBlockAsm4MB:
2300 MOVB (DI)(R9*1), R8
2301 CMPB (BX)(R9*1), R8
2302 JNE match_nolit_end_encodeBlockAsm4MB
2303 LEAL 1(R9), R9
2304
2305match_nolit_end_encodeBlockAsm4MB:
2306 ADDL R9, CX
2307 MOVL 16(SP), BX
2308 ADDL $0x04, R9
2309 MOVL CX, 12(SP)
2310
2311 // emitCopy
2312 CMPL BX, $0x00010000
2313 JB two_byte_offset_match_nolit_encodeBlockAsm4MB
2314 CMPL R9, $0x40
2315 JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB
2316 MOVB $0xff, (AX)
2317 MOVL BX, 1(AX)
2318 LEAL -64(R9), R9
2319 ADDQ $0x05, AX
2320 CMPL R9, $0x04
2321 JB four_bytes_remain_match_nolit_encodeBlockAsm4MB
2322
2323 // emitRepeat
2324 MOVL R9, SI
2325 LEAL -4(R9), R9
2326 CMPL SI, $0x08
2327 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
2328 CMPL SI, $0x0c
2329 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2330 CMPL BX, $0x00000800
2331 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2332
2333cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2334 CMPL R9, $0x00000104
2335 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
2336 CMPL R9, $0x00010100
2337 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
2338 LEAL -65536(R9), R9
2339 MOVL R9, BX
2340 MOVW $0x001d, (AX)
2341 MOVW R9, 2(AX)
2342 SARL $0x10, BX
2343 MOVB BL, 4(AX)
2344 ADDQ $0x05, AX
2345 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2346
2347repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
2348 LEAL -256(R9), R9
2349 MOVW $0x0019, (AX)
2350 MOVW R9, 2(AX)
2351 ADDQ $0x04, AX
2352 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2353
2354repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
2355 LEAL -4(R9), R9
2356 MOVW $0x0015, (AX)
2357 MOVB R9, 2(AX)
2358 ADDQ $0x03, AX
2359 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2360
2361repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
2362 SHLL $0x02, R9
2363 ORL $0x01, R9
2364 MOVW R9, (AX)
2365 ADDQ $0x02, AX
2366 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2367
2368repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2369 XORQ SI, SI
2370 LEAL 1(SI)(R9*4), R9
2371 MOVB BL, 1(AX)
2372 SARL $0x08, BX
2373 SHLL $0x05, BX
2374 ORL BX, R9
2375 MOVB R9, (AX)
2376 ADDQ $0x02, AX
2377 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2378
2379four_bytes_remain_match_nolit_encodeBlockAsm4MB:
2380 TESTL R9, R9
2381 JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
2382 XORL SI, SI
2383 LEAL -1(SI)(R9*4), R9
2384 MOVB R9, (AX)
2385 MOVL BX, 1(AX)
2386 ADDQ $0x05, AX
2387 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2388
2389two_byte_offset_match_nolit_encodeBlockAsm4MB:
2390 CMPL R9, $0x40
2391 JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
2392 CMPL BX, $0x00000800
2393 JAE long_offset_short_match_nolit_encodeBlockAsm4MB
2394 MOVL $0x00000001, SI
2395 LEAL 16(SI), SI
2396 MOVB BL, 1(AX)
2397 SHRL $0x08, BX
2398 SHLL $0x05, BX
2399 ORL BX, SI
2400 MOVB SI, (AX)
2401 ADDQ $0x02, AX
2402 SUBL $0x08, R9
2403
2404 // emitRepeat
2405 LEAL -4(R9), R9
2406 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2407 MOVL R9, SI
2408 LEAL -4(R9), R9
2409 CMPL SI, $0x08
2410 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2411 CMPL SI, $0x0c
2412 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2413 CMPL BX, $0x00000800
2414 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2415
2416cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2417 CMPL R9, $0x00000104
2418 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2419 CMPL R9, $0x00010100
2420 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2421 LEAL -65536(R9), R9
2422 MOVL R9, BX
2423 MOVW $0x001d, (AX)
2424 MOVW R9, 2(AX)
2425 SARL $0x10, BX
2426 MOVB BL, 4(AX)
2427 ADDQ $0x05, AX
2428 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2429
2430repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2431 LEAL -256(R9), R9
2432 MOVW $0x0019, (AX)
2433 MOVW R9, 2(AX)
2434 ADDQ $0x04, AX
2435 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2436
2437repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2438 LEAL -4(R9), R9
2439 MOVW $0x0015, (AX)
2440 MOVB R9, 2(AX)
2441 ADDQ $0x03, AX
2442 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2443
2444repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2445 SHLL $0x02, R9
2446 ORL $0x01, R9
2447 MOVW R9, (AX)
2448 ADDQ $0x02, AX
2449 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2450
2451repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2452 XORQ SI, SI
2453 LEAL 1(SI)(R9*4), R9
2454 MOVB BL, 1(AX)
2455 SARL $0x08, BX
2456 SHLL $0x05, BX
2457 ORL BX, R9
2458 MOVB R9, (AX)
2459 ADDQ $0x02, AX
2460 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2461
2462long_offset_short_match_nolit_encodeBlockAsm4MB:
2463 MOVB $0xee, (AX)
2464 MOVW BX, 1(AX)
2465 LEAL -60(R9), R9
2466 ADDQ $0x03, AX
2467
2468 // emitRepeat
2469 MOVL R9, SI
2470 LEAL -4(R9), R9
2471 CMPL SI, $0x08
2472 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
2473 CMPL SI, $0x0c
2474 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2475 CMPL BX, $0x00000800
2476 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2477
2478cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2479 CMPL R9, $0x00000104
2480 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
2481 CMPL R9, $0x00010100
2482 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
2483 LEAL -65536(R9), R9
2484 MOVL R9, BX
2485 MOVW $0x001d, (AX)
2486 MOVW R9, 2(AX)
2487 SARL $0x10, BX
2488 MOVB BL, 4(AX)
2489 ADDQ $0x05, AX
2490 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2491
2492repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2493 LEAL -256(R9), R9
2494 MOVW $0x0019, (AX)
2495 MOVW R9, 2(AX)
2496 ADDQ $0x04, AX
2497 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2498
2499repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2500 LEAL -4(R9), R9
2501 MOVW $0x0015, (AX)
2502 MOVB R9, 2(AX)
2503 ADDQ $0x03, AX
2504 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2505
2506repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2507 SHLL $0x02, R9
2508 ORL $0x01, R9
2509 MOVW R9, (AX)
2510 ADDQ $0x02, AX
2511 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2512
2513repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2514 XORQ SI, SI
2515 LEAL 1(SI)(R9*4), R9
2516 MOVB BL, 1(AX)
2517 SARL $0x08, BX
2518 SHLL $0x05, BX
2519 ORL BX, R9
2520 MOVB R9, (AX)
2521 ADDQ $0x02, AX
2522 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2523
2524two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
2525 MOVL R9, SI
2526 SHLL $0x02, SI
2527 CMPL R9, $0x0c
2528 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
2529 CMPL BX, $0x00000800
2530 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
2531 LEAL -15(SI), SI
2532 MOVB BL, 1(AX)
2533 SHRL $0x08, BX
2534 SHLL $0x05, BX
2535 ORL BX, SI
2536 MOVB SI, (AX)
2537 ADDQ $0x02, AX
2538 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2539
2540emit_copy_three_match_nolit_encodeBlockAsm4MB:
2541 LEAL -2(SI), SI
2542 MOVB SI, (AX)
2543 MOVW BX, 1(AX)
2544 ADDQ $0x03, AX
2545
2546match_nolit_emitcopy_end_encodeBlockAsm4MB:
2547 CMPL CX, 8(SP)
2548 JAE emit_remainder_encodeBlockAsm4MB
2549 MOVQ -2(DX)(CX*1), SI
2550 CMPQ AX, (SP)
2551 JB match_nolit_dst_ok_encodeBlockAsm4MB
2552 MOVQ $0x00000000, ret+48(FP)
2553 RET
2554
2555match_nolit_dst_ok_encodeBlockAsm4MB:
2556 MOVQ $0x0000cf1bbcdcbf9b, R8
2557 MOVQ SI, DI
2558 SHRQ $0x10, SI
2559 MOVQ SI, BX
2560 SHLQ $0x10, DI
2561 IMULQ R8, DI
2562 SHRQ $0x32, DI
2563 SHLQ $0x10, BX
2564 IMULQ R8, BX
2565 SHRQ $0x32, BX
2566 LEAL -2(CX), R8
2567 LEAQ 24(SP)(BX*4), R9
2568 MOVL (R9), BX
2569 MOVL R8, 24(SP)(DI*4)
2570 MOVL CX, (R9)
2571 CMPL (DX)(BX*1), SI
2572 JEQ match_nolit_loop_encodeBlockAsm4MB
2573 INCL CX
2574 JMP search_loop_encodeBlockAsm4MB
2575
2576emit_remainder_encodeBlockAsm4MB:
2577 MOVQ src_len+32(FP), CX
2578 SUBL 12(SP), CX
2579 LEAQ 4(AX)(CX*1), CX
2580 CMPQ CX, (SP)
2581 JB emit_remainder_ok_encodeBlockAsm4MB
2582 MOVQ $0x00000000, ret+48(FP)
2583 RET
2584
2585emit_remainder_ok_encodeBlockAsm4MB:
2586 MOVQ src_len+32(FP), CX
2587 MOVL 12(SP), BX
2588 CMPL BX, CX
2589 JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
2590 MOVL CX, SI
2591 MOVL CX, 12(SP)
2592 LEAQ (DX)(BX*1), CX
2593 SUBL BX, SI
2594 LEAL -1(SI), DX
2595 CMPL DX, $0x3c
2596 JB one_byte_emit_remainder_encodeBlockAsm4MB
2597 CMPL DX, $0x00000100
2598 JB two_bytes_emit_remainder_encodeBlockAsm4MB
2599 CMPL DX, $0x00010000
2600 JB three_bytes_emit_remainder_encodeBlockAsm4MB
2601 MOVL DX, BX
2602 SHRL $0x10, BX
2603 MOVB $0xf8, (AX)
2604 MOVW DX, 1(AX)
2605 MOVB BL, 3(AX)
2606 ADDQ $0x04, AX
2607 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2608
2609three_bytes_emit_remainder_encodeBlockAsm4MB:
2610 MOVB $0xf4, (AX)
2611 MOVW DX, 1(AX)
2612 ADDQ $0x03, AX
2613 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2614
2615two_bytes_emit_remainder_encodeBlockAsm4MB:
2616 MOVB $0xf0, (AX)
2617 MOVB DL, 1(AX)
2618 ADDQ $0x02, AX
2619 CMPL DX, $0x40
2620 JB memmove_emit_remainder_encodeBlockAsm4MB
2621 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2622
2623one_byte_emit_remainder_encodeBlockAsm4MB:
2624 SHLB $0x02, DL
2625 MOVB DL, (AX)
2626 ADDQ $0x01, AX
2627
2628memmove_emit_remainder_encodeBlockAsm4MB:
2629 LEAQ (AX)(SI*1), DX
2630 MOVL SI, BX
2631
2632 // genMemMoveShort
2633 CMPQ BX, $0x03
2634 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
2635 JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
2636 CMPQ BX, $0x08
2637 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
2638 CMPQ BX, $0x10
2639 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
2640 CMPQ BX, $0x20
2641 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
2642 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
2643
2644emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
2645 MOVB (CX), SI
2646 MOVB -1(CX)(BX*1), CL
2647 MOVB SI, (AX)
2648 MOVB CL, -1(AX)(BX*1)
2649 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2650
2651emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
2652 MOVW (CX), SI
2653 MOVB 2(CX), CL
2654 MOVW SI, (AX)
2655 MOVB CL, 2(AX)
2656 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2657
2658emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
2659 MOVL (CX), SI
2660 MOVL -4(CX)(BX*1), CX
2661 MOVL SI, (AX)
2662 MOVL CX, -4(AX)(BX*1)
2663 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2664
2665emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
2666 MOVQ (CX), SI
2667 MOVQ -8(CX)(BX*1), CX
2668 MOVQ SI, (AX)
2669 MOVQ CX, -8(AX)(BX*1)
2670 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2671
2672emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
2673 MOVOU (CX), X0
2674 MOVOU -16(CX)(BX*1), X1
2675 MOVOU X0, (AX)
2676 MOVOU X1, -16(AX)(BX*1)
2677 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2678
2679emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
2680 MOVOU (CX), X0
2681 MOVOU 16(CX), X1
2682 MOVOU -32(CX)(BX*1), X2
2683 MOVOU -16(CX)(BX*1), X3
2684 MOVOU X0, (AX)
2685 MOVOU X1, 16(AX)
2686 MOVOU X2, -32(AX)(BX*1)
2687 MOVOU X3, -16(AX)(BX*1)
2688
2689memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
2690 MOVQ DX, AX
2691 JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB
2692
2693memmove_long_emit_remainder_encodeBlockAsm4MB:
2694 LEAQ (AX)(SI*1), DX
2695 MOVL SI, BX
2696
2697 // genMemMoveLong
2698 MOVOU (CX), X0
2699 MOVOU 16(CX), X1
2700 MOVOU -32(CX)(BX*1), X2
2701 MOVOU -16(CX)(BX*1), X3
2702 MOVQ BX, DI
2703 SHRQ $0x05, DI
2704 MOVQ AX, SI
2705 ANDL $0x0000001f, SI
2706 MOVQ $0x00000040, R8
2707 SUBQ SI, R8
2708 DECQ DI
2709 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
2710 LEAQ -32(CX)(R8*1), SI
2711 LEAQ -32(AX)(R8*1), R9
2712
2713emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
2714 MOVOU (SI), X4
2715 MOVOU 16(SI), X5
2716 MOVOA X4, (R9)
2717 MOVOA X5, 16(R9)
2718 ADDQ $0x20, R9
2719 ADDQ $0x20, SI
2720 ADDQ $0x20, R8
2721 DECQ DI
2722 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
2723
2724emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
2725 MOVOU -32(CX)(R8*1), X4
2726 MOVOU -16(CX)(R8*1), X5
2727 MOVOA X4, -32(AX)(R8*1)
2728 MOVOA X5, -16(AX)(R8*1)
2729 ADDQ $0x20, R8
2730 CMPQ BX, R8
2731 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
2732 MOVOU X0, (AX)
2733 MOVOU X1, 16(AX)
2734 MOVOU X2, -32(AX)(BX*1)
2735 MOVOU X3, -16(AX)(BX*1)
2736 MOVQ DX, AX
2737
2738emit_literal_done_emit_remainder_encodeBlockAsm4MB:
2739 MOVQ dst_base+0(FP), CX
2740 SUBQ CX, AX
2741 MOVQ AX, ret+48(FP)
2742 RET
2743
2744// func encodeBlockAsm12B(dst []byte, src []byte) int
2745// Requires: BMI, SSE2
2746TEXT ·encodeBlockAsm12B(SB), $16408-56
2747 MOVQ dst_base+0(FP), AX
2748 MOVQ $0x00000080, CX
2749 LEAQ 24(SP), DX
2750 PXOR X0, X0
2751
2752zero_loop_encodeBlockAsm12B:
2753 MOVOU X0, (DX)
2754 MOVOU X0, 16(DX)
2755 MOVOU X0, 32(DX)
2756 MOVOU X0, 48(DX)
2757 MOVOU X0, 64(DX)
2758 MOVOU X0, 80(DX)
2759 MOVOU X0, 96(DX)
2760 MOVOU X0, 112(DX)
2761 ADDQ $0x80, DX
2762 DECQ CX
2763 JNZ zero_loop_encodeBlockAsm12B
2764 MOVL $0x00000000, 12(SP)
2765 MOVQ src_len+32(FP), CX
2766 LEAQ -9(CX), DX
2767 LEAQ -8(CX), BX
2768 MOVL BX, 8(SP)
2769 SHRQ $0x05, CX
2770 SUBL CX, DX
2771 LEAQ (AX)(DX*1), DX
2772 MOVQ DX, (SP)
2773 MOVL $0x00000001, CX
2774 MOVL CX, 16(SP)
2775 MOVQ src_base+24(FP), DX
2776
2777search_loop_encodeBlockAsm12B:
2778 MOVL CX, BX
2779 SUBL 12(SP), BX
2780 SHRL $0x05, BX
2781 LEAL 4(CX)(BX*1), BX
2782 CMPL BX, 8(SP)
2783 JAE emit_remainder_encodeBlockAsm12B
2784 MOVQ (DX)(CX*1), SI
2785 MOVL BX, 20(SP)
2786 MOVQ $0x000000cf1bbcdcbb, R8
2787 MOVQ SI, R9
2788 MOVQ SI, R10
2789 SHRQ $0x08, R10
2790 SHLQ $0x18, R9
2791 IMULQ R8, R9
2792 SHRQ $0x34, R9
2793 SHLQ $0x18, R10
2794 IMULQ R8, R10
2795 SHRQ $0x34, R10
2796 MOVL 24(SP)(R9*4), BX
2797 MOVL 24(SP)(R10*4), DI
2798 MOVL CX, 24(SP)(R9*4)
2799 LEAL 1(CX), R9
2800 MOVL R9, 24(SP)(R10*4)
2801 MOVQ SI, R9
2802 SHRQ $0x10, R9
2803 SHLQ $0x18, R9
2804 IMULQ R8, R9
2805 SHRQ $0x34, R9
2806 MOVL CX, R8
2807 SUBL 16(SP), R8
2808 MOVL 1(DX)(R8*1), R10
2809 MOVQ SI, R8
2810 SHRQ $0x08, R8
2811 CMPL R8, R10
2812 JNE no_repeat_found_encodeBlockAsm12B
2813 LEAL 1(CX), SI
2814 MOVL 12(SP), DI
2815 MOVL SI, BX
2816 SUBL 16(SP), BX
2817 JZ repeat_extend_back_end_encodeBlockAsm12B
2818
2819repeat_extend_back_loop_encodeBlockAsm12B:
2820 CMPL SI, DI
2821 JBE repeat_extend_back_end_encodeBlockAsm12B
2822 MOVB -1(DX)(BX*1), R8
2823 MOVB -1(DX)(SI*1), R9
2824 CMPB R8, R9
2825 JNE repeat_extend_back_end_encodeBlockAsm12B
2826 LEAL -1(SI), SI
2827 DECL BX
2828 JNZ repeat_extend_back_loop_encodeBlockAsm12B
2829
2830repeat_extend_back_end_encodeBlockAsm12B:
2831 MOVL 12(SP), BX
2832 CMPL BX, SI
2833 JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
2834 MOVL SI, R8
2835 MOVL SI, 12(SP)
2836 LEAQ (DX)(BX*1), R9
2837 SUBL BX, R8
2838 LEAL -1(R8), BX
2839 CMPL BX, $0x3c
2840 JB one_byte_repeat_emit_encodeBlockAsm12B
2841 CMPL BX, $0x00000100
2842 JB two_bytes_repeat_emit_encodeBlockAsm12B
2843 JB three_bytes_repeat_emit_encodeBlockAsm12B
2844
2845three_bytes_repeat_emit_encodeBlockAsm12B:
2846 MOVB $0xf4, (AX)
2847 MOVW BX, 1(AX)
2848 ADDQ $0x03, AX
2849 JMP memmove_long_repeat_emit_encodeBlockAsm12B
2850
2851two_bytes_repeat_emit_encodeBlockAsm12B:
2852 MOVB $0xf0, (AX)
2853 MOVB BL, 1(AX)
2854 ADDQ $0x02, AX
2855 CMPL BX, $0x40
2856 JB memmove_repeat_emit_encodeBlockAsm12B
2857 JMP memmove_long_repeat_emit_encodeBlockAsm12B
2858
2859one_byte_repeat_emit_encodeBlockAsm12B:
2860 SHLB $0x02, BL
2861 MOVB BL, (AX)
2862 ADDQ $0x01, AX
2863
2864memmove_repeat_emit_encodeBlockAsm12B:
2865 LEAQ (AX)(R8*1), BX
2866
2867 // genMemMoveShort
2868 CMPQ R8, $0x08
2869 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
2870 CMPQ R8, $0x10
2871 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
2872 CMPQ R8, $0x20
2873 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
2874 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
2875
2876emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
2877 MOVQ (R9), R10
2878 MOVQ R10, (AX)
2879 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2880
2881emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
2882 MOVQ (R9), R10
2883 MOVQ -8(R9)(R8*1), R9
2884 MOVQ R10, (AX)
2885 MOVQ R9, -8(AX)(R8*1)
2886 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2887
2888emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
2889 MOVOU (R9), X0
2890 MOVOU -16(R9)(R8*1), X1
2891 MOVOU X0, (AX)
2892 MOVOU X1, -16(AX)(R8*1)
2893 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2894
2895emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
2896 MOVOU (R9), X0
2897 MOVOU 16(R9), X1
2898 MOVOU -32(R9)(R8*1), X2
2899 MOVOU -16(R9)(R8*1), X3
2900 MOVOU X0, (AX)
2901 MOVOU X1, 16(AX)
2902 MOVOU X2, -32(AX)(R8*1)
2903 MOVOU X3, -16(AX)(R8*1)
2904
2905memmove_end_copy_repeat_emit_encodeBlockAsm12B:
2906 MOVQ BX, AX
2907 JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
2908
2909memmove_long_repeat_emit_encodeBlockAsm12B:
2910 LEAQ (AX)(R8*1), BX
2911
2912 // genMemMoveLong
2913 MOVOU (R9), X0
2914 MOVOU 16(R9), X1
2915 MOVOU -32(R9)(R8*1), X2
2916 MOVOU -16(R9)(R8*1), X3
2917 MOVQ R8, R11
2918 SHRQ $0x05, R11
2919 MOVQ AX, R10
2920 ANDL $0x0000001f, R10
2921 MOVQ $0x00000040, R12
2922 SUBQ R10, R12
2923 DECQ R11
2924 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
2925 LEAQ -32(R9)(R12*1), R10
2926 LEAQ -32(AX)(R12*1), R13
2927
2928emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
2929 MOVOU (R10), X4
2930 MOVOU 16(R10), X5
2931 MOVOA X4, (R13)
2932 MOVOA X5, 16(R13)
2933 ADDQ $0x20, R13
2934 ADDQ $0x20, R10
2935 ADDQ $0x20, R12
2936 DECQ R11
2937 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
2938
2939emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
2940 MOVOU -32(R9)(R12*1), X4
2941 MOVOU -16(R9)(R12*1), X5
2942 MOVOA X4, -32(AX)(R12*1)
2943 MOVOA X5, -16(AX)(R12*1)
2944 ADDQ $0x20, R12
2945 CMPQ R8, R12
2946 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
2947 MOVOU X0, (AX)
2948 MOVOU X1, 16(AX)
2949 MOVOU X2, -32(AX)(R8*1)
2950 MOVOU X3, -16(AX)(R8*1)
2951 MOVQ BX, AX
2952
2953emit_literal_done_repeat_emit_encodeBlockAsm12B:
2954 ADDL $0x05, CX
2955 MOVL CX, BX
2956 SUBL 16(SP), BX
2957 MOVQ src_len+32(FP), R8
2958 SUBL CX, R8
2959 LEAQ (DX)(CX*1), R9
2960 LEAQ (DX)(BX*1), BX
2961
2962 // matchLen
2963 XORL R11, R11
2964
2965matchlen_loopback_16_repeat_extend_encodeBlockAsm12B:
2966 CMPL R8, $0x10
2967 JB matchlen_match8_repeat_extend_encodeBlockAsm12B
2968 MOVQ (R9)(R11*1), R10
2969 MOVQ 8(R9)(R11*1), R12
2970 XORQ (BX)(R11*1), R10
2971 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
2972 XORQ 8(BX)(R11*1), R12
2973 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B
2974 LEAL -16(R8), R8
2975 LEAL 16(R11), R11
2976 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B
2977
2978matchlen_bsf_16repeat_extend_encodeBlockAsm12B:
2979#ifdef GOAMD64_v3
2980 TZCNTQ R12, R12
2981
2982#else
2983 BSFQ R12, R12
2984
2985#endif
2986 SARQ $0x03, R12
2987 LEAL 8(R11)(R12*1), R11
2988 JMP repeat_extend_forward_end_encodeBlockAsm12B
2989
2990matchlen_match8_repeat_extend_encodeBlockAsm12B:
2991 CMPL R8, $0x08
2992 JB matchlen_match4_repeat_extend_encodeBlockAsm12B
2993 MOVQ (R9)(R11*1), R10
2994 XORQ (BX)(R11*1), R10
2995 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
2996 LEAL -8(R8), R8
2997 LEAL 8(R11), R11
2998 JMP matchlen_match4_repeat_extend_encodeBlockAsm12B
2999
3000matchlen_bsf_8_repeat_extend_encodeBlockAsm12B:
3001#ifdef GOAMD64_v3
3002 TZCNTQ R10, R10
3003
3004#else
3005 BSFQ R10, R10
3006
3007#endif
3008 SARQ $0x03, R10
3009 LEAL (R11)(R10*1), R11
3010 JMP repeat_extend_forward_end_encodeBlockAsm12B
3011
3012matchlen_match4_repeat_extend_encodeBlockAsm12B:
3013 CMPL R8, $0x04
3014 JB matchlen_match2_repeat_extend_encodeBlockAsm12B
3015 MOVL (R9)(R11*1), R10
3016 CMPL (BX)(R11*1), R10
3017 JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
3018 LEAL -4(R8), R8
3019 LEAL 4(R11), R11
3020
3021matchlen_match2_repeat_extend_encodeBlockAsm12B:
3022 CMPL R8, $0x01
3023 JE matchlen_match1_repeat_extend_encodeBlockAsm12B
3024 JB repeat_extend_forward_end_encodeBlockAsm12B
3025 MOVW (R9)(R11*1), R10
3026 CMPW (BX)(R11*1), R10
3027 JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
3028 LEAL 2(R11), R11
3029 SUBL $0x02, R8
3030 JZ repeat_extend_forward_end_encodeBlockAsm12B
3031
3032matchlen_match1_repeat_extend_encodeBlockAsm12B:
3033 MOVB (R9)(R11*1), R10
3034 CMPB (BX)(R11*1), R10
3035 JNE repeat_extend_forward_end_encodeBlockAsm12B
3036 LEAL 1(R11), R11
3037
3038repeat_extend_forward_end_encodeBlockAsm12B:
3039 ADDL R11, CX
3040 MOVL CX, BX
3041 SUBL SI, BX
3042 MOVL 16(SP), SI
3043 TESTL DI, DI
3044 JZ repeat_as_copy_encodeBlockAsm12B
3045
3046 // emitRepeat
3047 MOVL BX, DI
3048 LEAL -4(BX), BX
3049 CMPL DI, $0x08
3050 JBE repeat_two_match_repeat_encodeBlockAsm12B
3051 CMPL DI, $0x0c
3052 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
3053 CMPL SI, $0x00000800
3054 JB repeat_two_offset_match_repeat_encodeBlockAsm12B
3055
3056cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
3057 CMPL BX, $0x00000104
3058 JB repeat_three_match_repeat_encodeBlockAsm12B
3059 LEAL -256(BX), BX
3060 MOVW $0x0019, (AX)
3061 MOVW BX, 2(AX)
3062 ADDQ $0x04, AX
3063 JMP repeat_end_emit_encodeBlockAsm12B
3064
3065repeat_three_match_repeat_encodeBlockAsm12B:
3066 LEAL -4(BX), BX
3067 MOVW $0x0015, (AX)
3068 MOVB BL, 2(AX)
3069 ADDQ $0x03, AX
3070 JMP repeat_end_emit_encodeBlockAsm12B
3071
3072repeat_two_match_repeat_encodeBlockAsm12B:
3073 SHLL $0x02, BX
3074 ORL $0x01, BX
3075 MOVW BX, (AX)
3076 ADDQ $0x02, AX
3077 JMP repeat_end_emit_encodeBlockAsm12B
3078
3079repeat_two_offset_match_repeat_encodeBlockAsm12B:
3080 XORQ DI, DI
3081 LEAL 1(DI)(BX*4), BX
3082 MOVB SI, 1(AX)
3083 SARL $0x08, SI
3084 SHLL $0x05, SI
3085 ORL SI, BX
3086 MOVB BL, (AX)
3087 ADDQ $0x02, AX
3088 JMP repeat_end_emit_encodeBlockAsm12B
3089
3090repeat_as_copy_encodeBlockAsm12B:
3091 // emitCopy
3092 CMPL BX, $0x40
3093 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
3094 CMPL SI, $0x00000800
3095 JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
3096 MOVL $0x00000001, DI
3097 LEAL 16(DI), DI
3098 MOVB SI, 1(AX)
3099 SHRL $0x08, SI
3100 SHLL $0x05, SI
3101 ORL SI, DI
3102 MOVB DI, (AX)
3103 ADDQ $0x02, AX
3104 SUBL $0x08, BX
3105
3106 // emitRepeat
3107 LEAL -4(BX), BX
3108 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3109 MOVL BX, DI
3110 LEAL -4(BX), BX
3111 CMPL DI, $0x08
3112 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3113 CMPL DI, $0x0c
3114 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3115 CMPL SI, $0x00000800
3116 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3117
3118cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3119 CMPL BX, $0x00000104
3120 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3121 LEAL -256(BX), BX
3122 MOVW $0x0019, (AX)
3123 MOVW BX, 2(AX)
3124 ADDQ $0x04, AX
3125 JMP repeat_end_emit_encodeBlockAsm12B
3126
3127repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3128 LEAL -4(BX), BX
3129 MOVW $0x0015, (AX)
3130 MOVB BL, 2(AX)
3131 ADDQ $0x03, AX
3132 JMP repeat_end_emit_encodeBlockAsm12B
3133
3134repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3135 SHLL $0x02, BX
3136 ORL $0x01, BX
3137 MOVW BX, (AX)
3138 ADDQ $0x02, AX
3139 JMP repeat_end_emit_encodeBlockAsm12B
3140
3141repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3142 XORQ DI, DI
3143 LEAL 1(DI)(BX*4), BX
3144 MOVB SI, 1(AX)
3145 SARL $0x08, SI
3146 SHLL $0x05, SI
3147 ORL SI, BX
3148 MOVB BL, (AX)
3149 ADDQ $0x02, AX
3150 JMP repeat_end_emit_encodeBlockAsm12B
3151
3152long_offset_short_repeat_as_copy_encodeBlockAsm12B:
3153 MOVB $0xee, (AX)
3154 MOVW SI, 1(AX)
3155 LEAL -60(BX), BX
3156 ADDQ $0x03, AX
3157
3158 // emitRepeat
3159 MOVL BX, DI
3160 LEAL -4(BX), BX
3161 CMPL DI, $0x08
3162 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3163 CMPL DI, $0x0c
3164 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3165 CMPL SI, $0x00000800
3166 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3167
3168cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3169 CMPL BX, $0x00000104
3170 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3171 LEAL -256(BX), BX
3172 MOVW $0x0019, (AX)
3173 MOVW BX, 2(AX)
3174 ADDQ $0x04, AX
3175 JMP repeat_end_emit_encodeBlockAsm12B
3176
3177repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3178 LEAL -4(BX), BX
3179 MOVW $0x0015, (AX)
3180 MOVB BL, 2(AX)
3181 ADDQ $0x03, AX
3182 JMP repeat_end_emit_encodeBlockAsm12B
3183
3184repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3185 SHLL $0x02, BX
3186 ORL $0x01, BX
3187 MOVW BX, (AX)
3188 ADDQ $0x02, AX
3189 JMP repeat_end_emit_encodeBlockAsm12B
3190
3191repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3192 XORQ DI, DI
3193 LEAL 1(DI)(BX*4), BX
3194 MOVB SI, 1(AX)
3195 SARL $0x08, SI
3196 SHLL $0x05, SI
3197 ORL SI, BX
3198 MOVB BL, (AX)
3199 ADDQ $0x02, AX
3200 JMP repeat_end_emit_encodeBlockAsm12B
3201
3202two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
3203 MOVL BX, DI
3204 SHLL $0x02, DI
3205 CMPL BX, $0x0c
3206 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
3207 CMPL SI, $0x00000800
3208 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
3209 LEAL -15(DI), DI
3210 MOVB SI, 1(AX)
3211 SHRL $0x08, SI
3212 SHLL $0x05, SI
3213 ORL SI, DI
3214 MOVB DI, (AX)
3215 ADDQ $0x02, AX
3216 JMP repeat_end_emit_encodeBlockAsm12B
3217
3218emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
3219 LEAL -2(DI), DI
3220 MOVB DI, (AX)
3221 MOVW SI, 1(AX)
3222 ADDQ $0x03, AX
3223
3224repeat_end_emit_encodeBlockAsm12B:
3225 MOVL CX, 12(SP)
3226 JMP search_loop_encodeBlockAsm12B
3227
3228no_repeat_found_encodeBlockAsm12B:
3229 CMPL (DX)(BX*1), SI
3230 JEQ candidate_match_encodeBlockAsm12B
3231 SHRQ $0x08, SI
3232 MOVL 24(SP)(R9*4), BX
3233 LEAL 2(CX), R8
3234 CMPL (DX)(DI*1), SI
3235 JEQ candidate2_match_encodeBlockAsm12B
3236 MOVL R8, 24(SP)(R9*4)
3237 SHRQ $0x08, SI
3238 CMPL (DX)(BX*1), SI
3239 JEQ candidate3_match_encodeBlockAsm12B
3240 MOVL 20(SP), CX
3241 JMP search_loop_encodeBlockAsm12B
3242
3243candidate3_match_encodeBlockAsm12B:
3244 ADDL $0x02, CX
3245 JMP candidate_match_encodeBlockAsm12B
3246
3247candidate2_match_encodeBlockAsm12B:
3248 MOVL R8, 24(SP)(R9*4)
3249 INCL CX
3250 MOVL DI, BX
3251
3252candidate_match_encodeBlockAsm12B:
3253 MOVL 12(SP), SI
3254 TESTL BX, BX
3255 JZ match_extend_back_end_encodeBlockAsm12B
3256
3257match_extend_back_loop_encodeBlockAsm12B:
3258 CMPL CX, SI
3259 JBE match_extend_back_end_encodeBlockAsm12B
3260 MOVB -1(DX)(BX*1), DI
3261 MOVB -1(DX)(CX*1), R8
3262 CMPB DI, R8
3263 JNE match_extend_back_end_encodeBlockAsm12B
3264 LEAL -1(CX), CX
3265 DECL BX
3266 JZ match_extend_back_end_encodeBlockAsm12B
3267 JMP match_extend_back_loop_encodeBlockAsm12B
3268
3269match_extend_back_end_encodeBlockAsm12B:
3270 MOVL CX, SI
3271 SUBL 12(SP), SI
3272 LEAQ 3(AX)(SI*1), SI
3273 CMPQ SI, (SP)
3274 JB match_dst_size_check_encodeBlockAsm12B
3275 MOVQ $0x00000000, ret+48(FP)
3276 RET
3277
3278match_dst_size_check_encodeBlockAsm12B:
3279 MOVL CX, SI
3280 MOVL 12(SP), DI
3281 CMPL DI, SI
3282 JEQ emit_literal_done_match_emit_encodeBlockAsm12B
3283 MOVL SI, R8
3284 MOVL SI, 12(SP)
3285 LEAQ (DX)(DI*1), SI
3286 SUBL DI, R8
3287 LEAL -1(R8), DI
3288 CMPL DI, $0x3c
3289 JB one_byte_match_emit_encodeBlockAsm12B
3290 CMPL DI, $0x00000100
3291 JB two_bytes_match_emit_encodeBlockAsm12B
3292 JB three_bytes_match_emit_encodeBlockAsm12B
3293
3294three_bytes_match_emit_encodeBlockAsm12B:
3295 MOVB $0xf4, (AX)
3296 MOVW DI, 1(AX)
3297 ADDQ $0x03, AX
3298 JMP memmove_long_match_emit_encodeBlockAsm12B
3299
3300two_bytes_match_emit_encodeBlockAsm12B:
3301 MOVB $0xf0, (AX)
3302 MOVB DI, 1(AX)
3303 ADDQ $0x02, AX
3304 CMPL DI, $0x40
3305 JB memmove_match_emit_encodeBlockAsm12B
3306 JMP memmove_long_match_emit_encodeBlockAsm12B
3307
3308one_byte_match_emit_encodeBlockAsm12B:
3309 SHLB $0x02, DI
3310 MOVB DI, (AX)
3311 ADDQ $0x01, AX
3312
3313memmove_match_emit_encodeBlockAsm12B:
3314 LEAQ (AX)(R8*1), DI
3315
3316 // genMemMoveShort
3317 CMPQ R8, $0x08
3318 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
3319 CMPQ R8, $0x10
3320 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
3321 CMPQ R8, $0x20
3322 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
3323 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
3324
3325emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
3326 MOVQ (SI), R9
3327 MOVQ R9, (AX)
3328 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3329
3330emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
3331 MOVQ (SI), R9
3332 MOVQ -8(SI)(R8*1), SI
3333 MOVQ R9, (AX)
3334 MOVQ SI, -8(AX)(R8*1)
3335 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3336
3337emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
3338 MOVOU (SI), X0
3339 MOVOU -16(SI)(R8*1), X1
3340 MOVOU X0, (AX)
3341 MOVOU X1, -16(AX)(R8*1)
3342 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3343
3344emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
3345 MOVOU (SI), X0
3346 MOVOU 16(SI), X1
3347 MOVOU -32(SI)(R8*1), X2
3348 MOVOU -16(SI)(R8*1), X3
3349 MOVOU X0, (AX)
3350 MOVOU X1, 16(AX)
3351 MOVOU X2, -32(AX)(R8*1)
3352 MOVOU X3, -16(AX)(R8*1)
3353
3354memmove_end_copy_match_emit_encodeBlockAsm12B:
3355 MOVQ DI, AX
3356 JMP emit_literal_done_match_emit_encodeBlockAsm12B
3357
3358memmove_long_match_emit_encodeBlockAsm12B:
3359 LEAQ (AX)(R8*1), DI
3360
3361 // genMemMoveLong
3362 MOVOU (SI), X0
3363 MOVOU 16(SI), X1
3364 MOVOU -32(SI)(R8*1), X2
3365 MOVOU -16(SI)(R8*1), X3
3366 MOVQ R8, R10
3367 SHRQ $0x05, R10
3368 MOVQ AX, R9
3369 ANDL $0x0000001f, R9
3370 MOVQ $0x00000040, R11
3371 SUBQ R9, R11
3372 DECQ R10
3373 JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
3374 LEAQ -32(SI)(R11*1), R9
3375 LEAQ -32(AX)(R11*1), R12
3376
3377emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
3378 MOVOU (R9), X4
3379 MOVOU 16(R9), X5
3380 MOVOA X4, (R12)
3381 MOVOA X5, 16(R12)
3382 ADDQ $0x20, R12
3383 ADDQ $0x20, R9
3384 ADDQ $0x20, R11
3385 DECQ R10
3386 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
3387
3388emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
3389 MOVOU -32(SI)(R11*1), X4
3390 MOVOU -16(SI)(R11*1), X5
3391 MOVOA X4, -32(AX)(R11*1)
3392 MOVOA X5, -16(AX)(R11*1)
3393 ADDQ $0x20, R11
3394 CMPQ R8, R11
3395 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
3396 MOVOU X0, (AX)
3397 MOVOU X1, 16(AX)
3398 MOVOU X2, -32(AX)(R8*1)
3399 MOVOU X3, -16(AX)(R8*1)
3400 MOVQ DI, AX
3401
3402emit_literal_done_match_emit_encodeBlockAsm12B:
3403match_nolit_loop_encodeBlockAsm12B:
3404 MOVL CX, SI
3405 SUBL BX, SI
3406 MOVL SI, 16(SP)
3407 ADDL $0x04, CX
3408 ADDL $0x04, BX
3409 MOVQ src_len+32(FP), SI
3410 SUBL CX, SI
3411 LEAQ (DX)(CX*1), DI
3412 LEAQ (DX)(BX*1), BX
3413
3414 // matchLen
3415 XORL R9, R9
3416
3417matchlen_loopback_16_match_nolit_encodeBlockAsm12B:
3418 CMPL SI, $0x10
3419 JB matchlen_match8_match_nolit_encodeBlockAsm12B
3420 MOVQ (DI)(R9*1), R8
3421 MOVQ 8(DI)(R9*1), R10
3422 XORQ (BX)(R9*1), R8
3423 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
3424 XORQ 8(BX)(R9*1), R10
3425 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B
3426 LEAL -16(SI), SI
3427 LEAL 16(R9), R9
3428 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B
3429
3430matchlen_bsf_16match_nolit_encodeBlockAsm12B:
3431#ifdef GOAMD64_v3
3432 TZCNTQ R10, R10
3433
3434#else
3435 BSFQ R10, R10
3436
3437#endif
3438 SARQ $0x03, R10
3439 LEAL 8(R9)(R10*1), R9
3440 JMP match_nolit_end_encodeBlockAsm12B
3441
3442matchlen_match8_match_nolit_encodeBlockAsm12B:
3443 CMPL SI, $0x08
3444 JB matchlen_match4_match_nolit_encodeBlockAsm12B
3445 MOVQ (DI)(R9*1), R8
3446 XORQ (BX)(R9*1), R8
3447 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
3448 LEAL -8(SI), SI
3449 LEAL 8(R9), R9
3450 JMP matchlen_match4_match_nolit_encodeBlockAsm12B
3451
3452matchlen_bsf_8_match_nolit_encodeBlockAsm12B:
3453#ifdef GOAMD64_v3
3454 TZCNTQ R8, R8
3455
3456#else
3457 BSFQ R8, R8
3458
3459#endif
3460 SARQ $0x03, R8
3461 LEAL (R9)(R8*1), R9
3462 JMP match_nolit_end_encodeBlockAsm12B
3463
3464matchlen_match4_match_nolit_encodeBlockAsm12B:
3465 CMPL SI, $0x04
3466 JB matchlen_match2_match_nolit_encodeBlockAsm12B
3467 MOVL (DI)(R9*1), R8
3468 CMPL (BX)(R9*1), R8
3469 JNE matchlen_match2_match_nolit_encodeBlockAsm12B
3470 LEAL -4(SI), SI
3471 LEAL 4(R9), R9
3472
3473matchlen_match2_match_nolit_encodeBlockAsm12B:
3474 CMPL SI, $0x01
3475 JE matchlen_match1_match_nolit_encodeBlockAsm12B
3476 JB match_nolit_end_encodeBlockAsm12B
3477 MOVW (DI)(R9*1), R8
3478 CMPW (BX)(R9*1), R8
3479 JNE matchlen_match1_match_nolit_encodeBlockAsm12B
3480 LEAL 2(R9), R9
3481 SUBL $0x02, SI
3482 JZ match_nolit_end_encodeBlockAsm12B
3483
3484matchlen_match1_match_nolit_encodeBlockAsm12B:
3485 MOVB (DI)(R9*1), R8
3486 CMPB (BX)(R9*1), R8
3487 JNE match_nolit_end_encodeBlockAsm12B
3488 LEAL 1(R9), R9
3489
3490match_nolit_end_encodeBlockAsm12B:
3491 ADDL R9, CX
3492 MOVL 16(SP), BX
3493 ADDL $0x04, R9
3494 MOVL CX, 12(SP)
3495
3496 // emitCopy
3497 CMPL R9, $0x40
3498 JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B
3499 CMPL BX, $0x00000800
3500 JAE long_offset_short_match_nolit_encodeBlockAsm12B
3501 MOVL $0x00000001, SI
3502 LEAL 16(SI), SI
3503 MOVB BL, 1(AX)
3504 SHRL $0x08, BX
3505 SHLL $0x05, BX
3506 ORL BX, SI
3507 MOVB SI, (AX)
3508 ADDQ $0x02, AX
3509 SUBL $0x08, R9
3510
3511 // emitRepeat
3512 LEAL -4(R9), R9
3513 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3514 MOVL R9, SI
3515 LEAL -4(R9), R9
3516 CMPL SI, $0x08
3517 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3518 CMPL SI, $0x0c
3519 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3520 CMPL BX, $0x00000800
3521 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3522
3523cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3524 CMPL R9, $0x00000104
3525 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3526 LEAL -256(R9), R9
3527 MOVW $0x0019, (AX)
3528 MOVW R9, 2(AX)
3529 ADDQ $0x04, AX
3530 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3531
3532repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3533 LEAL -4(R9), R9
3534 MOVW $0x0015, (AX)
3535 MOVB R9, 2(AX)
3536 ADDQ $0x03, AX
3537 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3538
3539repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3540 SHLL $0x02, R9
3541 ORL $0x01, R9
3542 MOVW R9, (AX)
3543 ADDQ $0x02, AX
3544 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3545
3546repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3547 XORQ SI, SI
3548 LEAL 1(SI)(R9*4), R9
3549 MOVB BL, 1(AX)
3550 SARL $0x08, BX
3551 SHLL $0x05, BX
3552 ORL BX, R9
3553 MOVB R9, (AX)
3554 ADDQ $0x02, AX
3555 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3556
3557long_offset_short_match_nolit_encodeBlockAsm12B:
3558 MOVB $0xee, (AX)
3559 MOVW BX, 1(AX)
3560 LEAL -60(R9), R9
3561 ADDQ $0x03, AX
3562
3563 // emitRepeat
3564 MOVL R9, SI
3565 LEAL -4(R9), R9
3566 CMPL SI, $0x08
3567 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
3568 CMPL SI, $0x0c
3569 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
3570 CMPL BX, $0x00000800
3571 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
3572
3573cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
3574 CMPL R9, $0x00000104
3575 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
3576 LEAL -256(R9), R9
3577 MOVW $0x0019, (AX)
3578 MOVW R9, 2(AX)
3579 ADDQ $0x04, AX
3580 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3581
3582repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
3583 LEAL -4(R9), R9
3584 MOVW $0x0015, (AX)
3585 MOVB R9, 2(AX)
3586 ADDQ $0x03, AX
3587 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3588
3589repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
3590 SHLL $0x02, R9
3591 ORL $0x01, R9
3592 MOVW R9, (AX)
3593 ADDQ $0x02, AX
3594 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3595
3596repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
3597 XORQ SI, SI
3598 LEAL 1(SI)(R9*4), R9
3599 MOVB BL, 1(AX)
3600 SARL $0x08, BX
3601 SHLL $0x05, BX
3602 ORL BX, R9
3603 MOVB R9, (AX)
3604 ADDQ $0x02, AX
3605 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3606
3607two_byte_offset_short_match_nolit_encodeBlockAsm12B:
3608 MOVL R9, SI
3609 SHLL $0x02, SI
3610 CMPL R9, $0x0c
3611 JAE emit_copy_three_match_nolit_encodeBlockAsm12B
3612 CMPL BX, $0x00000800
3613 JAE emit_copy_three_match_nolit_encodeBlockAsm12B
3614 LEAL -15(SI), SI
3615 MOVB BL, 1(AX)
3616 SHRL $0x08, BX
3617 SHLL $0x05, BX
3618 ORL BX, SI
3619 MOVB SI, (AX)
3620 ADDQ $0x02, AX
3621 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3622
3623emit_copy_three_match_nolit_encodeBlockAsm12B:
3624 LEAL -2(SI), SI
3625 MOVB SI, (AX)
3626 MOVW BX, 1(AX)
3627 ADDQ $0x03, AX
3628
3629match_nolit_emitcopy_end_encodeBlockAsm12B:
3630 CMPL CX, 8(SP)
3631 JAE emit_remainder_encodeBlockAsm12B
3632 MOVQ -2(DX)(CX*1), SI
3633 CMPQ AX, (SP)
3634 JB match_nolit_dst_ok_encodeBlockAsm12B
3635 MOVQ $0x00000000, ret+48(FP)
3636 RET
3637
3638match_nolit_dst_ok_encodeBlockAsm12B:
3639 MOVQ $0x000000cf1bbcdcbb, R8
3640 MOVQ SI, DI
3641 SHRQ $0x10, SI
3642 MOVQ SI, BX
3643 SHLQ $0x18, DI
3644 IMULQ R8, DI
3645 SHRQ $0x34, DI
3646 SHLQ $0x18, BX
3647 IMULQ R8, BX
3648 SHRQ $0x34, BX
3649 LEAL -2(CX), R8
3650 LEAQ 24(SP)(BX*4), R9
3651 MOVL (R9), BX
3652 MOVL R8, 24(SP)(DI*4)
3653 MOVL CX, (R9)
3654 CMPL (DX)(BX*1), SI
3655 JEQ match_nolit_loop_encodeBlockAsm12B
3656 INCL CX
3657 JMP search_loop_encodeBlockAsm12B
3658
3659emit_remainder_encodeBlockAsm12B:
3660 MOVQ src_len+32(FP), CX
3661 SUBL 12(SP), CX
3662 LEAQ 3(AX)(CX*1), CX
3663 CMPQ CX, (SP)
3664 JB emit_remainder_ok_encodeBlockAsm12B
3665 MOVQ $0x00000000, ret+48(FP)
3666 RET
3667
3668emit_remainder_ok_encodeBlockAsm12B:
3669 MOVQ src_len+32(FP), CX
3670 MOVL 12(SP), BX
3671 CMPL BX, CX
3672 JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
3673 MOVL CX, SI
3674 MOVL CX, 12(SP)
3675 LEAQ (DX)(BX*1), CX
3676 SUBL BX, SI
3677 LEAL -1(SI), DX
3678 CMPL DX, $0x3c
3679 JB one_byte_emit_remainder_encodeBlockAsm12B
3680 CMPL DX, $0x00000100
3681 JB two_bytes_emit_remainder_encodeBlockAsm12B
3682 JB three_bytes_emit_remainder_encodeBlockAsm12B
3683
3684three_bytes_emit_remainder_encodeBlockAsm12B:
3685 MOVB $0xf4, (AX)
3686 MOVW DX, 1(AX)
3687 ADDQ $0x03, AX
3688 JMP memmove_long_emit_remainder_encodeBlockAsm12B
3689
3690two_bytes_emit_remainder_encodeBlockAsm12B:
3691 MOVB $0xf0, (AX)
3692 MOVB DL, 1(AX)
3693 ADDQ $0x02, AX
3694 CMPL DX, $0x40
3695 JB memmove_emit_remainder_encodeBlockAsm12B
3696 JMP memmove_long_emit_remainder_encodeBlockAsm12B
3697
3698one_byte_emit_remainder_encodeBlockAsm12B:
3699 SHLB $0x02, DL
3700 MOVB DL, (AX)
3701 ADDQ $0x01, AX
3702
3703memmove_emit_remainder_encodeBlockAsm12B:
3704 LEAQ (AX)(SI*1), DX
3705 MOVL SI, BX
3706
3707 // genMemMoveShort
3708 CMPQ BX, $0x03
3709 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
3710 JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
3711 CMPQ BX, $0x08
3712 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
3713 CMPQ BX, $0x10
3714 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
3715 CMPQ BX, $0x20
3716 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
3717 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
3718
3719emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
3720 MOVB (CX), SI
3721 MOVB -1(CX)(BX*1), CL
3722 MOVB SI, (AX)
3723 MOVB CL, -1(AX)(BX*1)
3724 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3725
3726emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
3727 MOVW (CX), SI
3728 MOVB 2(CX), CL
3729 MOVW SI, (AX)
3730 MOVB CL, 2(AX)
3731 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3732
3733emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
3734 MOVL (CX), SI
3735 MOVL -4(CX)(BX*1), CX
3736 MOVL SI, (AX)
3737 MOVL CX, -4(AX)(BX*1)
3738 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3739
3740emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
3741 MOVQ (CX), SI
3742 MOVQ -8(CX)(BX*1), CX
3743 MOVQ SI, (AX)
3744 MOVQ CX, -8(AX)(BX*1)
3745 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3746
3747emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
3748 MOVOU (CX), X0
3749 MOVOU -16(CX)(BX*1), X1
3750 MOVOU X0, (AX)
3751 MOVOU X1, -16(AX)(BX*1)
3752 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3753
3754emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
3755 MOVOU (CX), X0
3756 MOVOU 16(CX), X1
3757 MOVOU -32(CX)(BX*1), X2
3758 MOVOU -16(CX)(BX*1), X3
3759 MOVOU X0, (AX)
3760 MOVOU X1, 16(AX)
3761 MOVOU X2, -32(AX)(BX*1)
3762 MOVOU X3, -16(AX)(BX*1)
3763
3764memmove_end_copy_emit_remainder_encodeBlockAsm12B:
3765 MOVQ DX, AX
3766 JMP emit_literal_done_emit_remainder_encodeBlockAsm12B
3767
3768memmove_long_emit_remainder_encodeBlockAsm12B:
3769 LEAQ (AX)(SI*1), DX
3770 MOVL SI, BX
3771
3772 // genMemMoveLong
3773 MOVOU (CX), X0
3774 MOVOU 16(CX), X1
3775 MOVOU -32(CX)(BX*1), X2
3776 MOVOU -16(CX)(BX*1), X3
3777 MOVQ BX, DI
3778 SHRQ $0x05, DI
3779 MOVQ AX, SI
3780 ANDL $0x0000001f, SI
3781 MOVQ $0x00000040, R8
3782 SUBQ SI, R8
3783 DECQ DI
3784 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
3785 LEAQ -32(CX)(R8*1), SI
3786 LEAQ -32(AX)(R8*1), R9
3787
3788emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
3789 MOVOU (SI), X4
3790 MOVOU 16(SI), X5
3791 MOVOA X4, (R9)
3792 MOVOA X5, 16(R9)
3793 ADDQ $0x20, R9
3794 ADDQ $0x20, SI
3795 ADDQ $0x20, R8
3796 DECQ DI
3797 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
3798
3799emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
3800 MOVOU -32(CX)(R8*1), X4
3801 MOVOU -16(CX)(R8*1), X5
3802 MOVOA X4, -32(AX)(R8*1)
3803 MOVOA X5, -16(AX)(R8*1)
3804 ADDQ $0x20, R8
3805 CMPQ BX, R8
3806 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
3807 MOVOU X0, (AX)
3808 MOVOU X1, 16(AX)
3809 MOVOU X2, -32(AX)(BX*1)
3810 MOVOU X3, -16(AX)(BX*1)
3811 MOVQ DX, AX
3812
3813emit_literal_done_emit_remainder_encodeBlockAsm12B:
3814 MOVQ dst_base+0(FP), CX
3815 SUBQ CX, AX
3816 MOVQ AX, ret+48(FP)
3817 RET
3818
3819// func encodeBlockAsm10B(dst []byte, src []byte) int
3820// Requires: BMI, SSE2
3821TEXT ·encodeBlockAsm10B(SB), $4120-56
3822 MOVQ dst_base+0(FP), AX
3823 MOVQ $0x00000020, CX
3824 LEAQ 24(SP), DX
3825 PXOR X0, X0
3826
3827zero_loop_encodeBlockAsm10B:
3828 MOVOU X0, (DX)
3829 MOVOU X0, 16(DX)
3830 MOVOU X0, 32(DX)
3831 MOVOU X0, 48(DX)
3832 MOVOU X0, 64(DX)
3833 MOVOU X0, 80(DX)
3834 MOVOU X0, 96(DX)
3835 MOVOU X0, 112(DX)
3836 ADDQ $0x80, DX
3837 DECQ CX
3838 JNZ zero_loop_encodeBlockAsm10B
3839 MOVL $0x00000000, 12(SP)
3840 MOVQ src_len+32(FP), CX
3841 LEAQ -9(CX), DX
3842 LEAQ -8(CX), BX
3843 MOVL BX, 8(SP)
3844 SHRQ $0x05, CX
3845 SUBL CX, DX
3846 LEAQ (AX)(DX*1), DX
3847 MOVQ DX, (SP)
3848 MOVL $0x00000001, CX
3849 MOVL CX, 16(SP)
3850 MOVQ src_base+24(FP), DX
3851
3852search_loop_encodeBlockAsm10B:
3853 MOVL CX, BX
3854 SUBL 12(SP), BX
3855 SHRL $0x05, BX
3856 LEAL 4(CX)(BX*1), BX
3857 CMPL BX, 8(SP)
3858 JAE emit_remainder_encodeBlockAsm10B
3859 MOVQ (DX)(CX*1), SI
3860 MOVL BX, 20(SP)
3861 MOVQ $0x9e3779b1, R8
3862 MOVQ SI, R9
3863 MOVQ SI, R10
3864 SHRQ $0x08, R10
3865 SHLQ $0x20, R9
3866 IMULQ R8, R9
3867 SHRQ $0x36, R9
3868 SHLQ $0x20, R10
3869 IMULQ R8, R10
3870 SHRQ $0x36, R10
3871 MOVL 24(SP)(R9*4), BX
3872 MOVL 24(SP)(R10*4), DI
3873 MOVL CX, 24(SP)(R9*4)
3874 LEAL 1(CX), R9
3875 MOVL R9, 24(SP)(R10*4)
3876 MOVQ SI, R9
3877 SHRQ $0x10, R9
3878 SHLQ $0x20, R9
3879 IMULQ R8, R9
3880 SHRQ $0x36, R9
3881 MOVL CX, R8
3882 SUBL 16(SP), R8
3883 MOVL 1(DX)(R8*1), R10
3884 MOVQ SI, R8
3885 SHRQ $0x08, R8
3886 CMPL R8, R10
3887 JNE no_repeat_found_encodeBlockAsm10B
3888 LEAL 1(CX), SI
3889 MOVL 12(SP), DI
3890 MOVL SI, BX
3891 SUBL 16(SP), BX
3892 JZ repeat_extend_back_end_encodeBlockAsm10B
3893
3894repeat_extend_back_loop_encodeBlockAsm10B:
3895 CMPL SI, DI
3896 JBE repeat_extend_back_end_encodeBlockAsm10B
3897 MOVB -1(DX)(BX*1), R8
3898 MOVB -1(DX)(SI*1), R9
3899 CMPB R8, R9
3900 JNE repeat_extend_back_end_encodeBlockAsm10B
3901 LEAL -1(SI), SI
3902 DECL BX
3903 JNZ repeat_extend_back_loop_encodeBlockAsm10B
3904
3905repeat_extend_back_end_encodeBlockAsm10B:
3906 MOVL 12(SP), BX
3907 CMPL BX, SI
3908 JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
3909 MOVL SI, R8
3910 MOVL SI, 12(SP)
3911 LEAQ (DX)(BX*1), R9
3912 SUBL BX, R8
3913 LEAL -1(R8), BX
3914 CMPL BX, $0x3c
3915 JB one_byte_repeat_emit_encodeBlockAsm10B
3916 CMPL BX, $0x00000100
3917 JB two_bytes_repeat_emit_encodeBlockAsm10B
3918 JB three_bytes_repeat_emit_encodeBlockAsm10B
3919
3920three_bytes_repeat_emit_encodeBlockAsm10B:
3921 MOVB $0xf4, (AX)
3922 MOVW BX, 1(AX)
3923 ADDQ $0x03, AX
3924 JMP memmove_long_repeat_emit_encodeBlockAsm10B
3925
3926two_bytes_repeat_emit_encodeBlockAsm10B:
3927 MOVB $0xf0, (AX)
3928 MOVB BL, 1(AX)
3929 ADDQ $0x02, AX
3930 CMPL BX, $0x40
3931 JB memmove_repeat_emit_encodeBlockAsm10B
3932 JMP memmove_long_repeat_emit_encodeBlockAsm10B
3933
3934one_byte_repeat_emit_encodeBlockAsm10B:
3935 SHLB $0x02, BL
3936 MOVB BL, (AX)
3937 ADDQ $0x01, AX
3938
3939memmove_repeat_emit_encodeBlockAsm10B:
3940 LEAQ (AX)(R8*1), BX
3941
3942 // genMemMoveShort
3943 CMPQ R8, $0x08
3944 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
3945 CMPQ R8, $0x10
3946 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
3947 CMPQ R8, $0x20
3948 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
3949 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
3950
3951emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
3952 MOVQ (R9), R10
3953 MOVQ R10, (AX)
3954 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
3955
3956emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
3957 MOVQ (R9), R10
3958 MOVQ -8(R9)(R8*1), R9
3959 MOVQ R10, (AX)
3960 MOVQ R9, -8(AX)(R8*1)
3961 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
3962
3963emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
3964 MOVOU (R9), X0
3965 MOVOU -16(R9)(R8*1), X1
3966 MOVOU X0, (AX)
3967 MOVOU X1, -16(AX)(R8*1)
3968 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
3969
3970emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
3971 MOVOU (R9), X0
3972 MOVOU 16(R9), X1
3973 MOVOU -32(R9)(R8*1), X2
3974 MOVOU -16(R9)(R8*1), X3
3975 MOVOU X0, (AX)
3976 MOVOU X1, 16(AX)
3977 MOVOU X2, -32(AX)(R8*1)
3978 MOVOU X3, -16(AX)(R8*1)
3979
3980memmove_end_copy_repeat_emit_encodeBlockAsm10B:
3981 MOVQ BX, AX
3982 JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
3983
3984memmove_long_repeat_emit_encodeBlockAsm10B:
3985 LEAQ (AX)(R8*1), BX
3986
3987 // genMemMoveLong
3988 MOVOU (R9), X0
3989 MOVOU 16(R9), X1
3990 MOVOU -32(R9)(R8*1), X2
3991 MOVOU -16(R9)(R8*1), X3
3992 MOVQ R8, R11
3993 SHRQ $0x05, R11
3994 MOVQ AX, R10
3995 ANDL $0x0000001f, R10
3996 MOVQ $0x00000040, R12
3997 SUBQ R10, R12
3998 DECQ R11
3999 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4000 LEAQ -32(R9)(R12*1), R10
4001 LEAQ -32(AX)(R12*1), R13
4002
4003emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
4004 MOVOU (R10), X4
4005 MOVOU 16(R10), X5
4006 MOVOA X4, (R13)
4007 MOVOA X5, 16(R13)
4008 ADDQ $0x20, R13
4009 ADDQ $0x20, R10
4010 ADDQ $0x20, R12
4011 DECQ R11
4012 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
4013
4014emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
4015 MOVOU -32(R9)(R12*1), X4
4016 MOVOU -16(R9)(R12*1), X5
4017 MOVOA X4, -32(AX)(R12*1)
4018 MOVOA X5, -16(AX)(R12*1)
4019 ADDQ $0x20, R12
4020 CMPQ R8, R12
4021 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4022 MOVOU X0, (AX)
4023 MOVOU X1, 16(AX)
4024 MOVOU X2, -32(AX)(R8*1)
4025 MOVOU X3, -16(AX)(R8*1)
4026 MOVQ BX, AX
4027
4028emit_literal_done_repeat_emit_encodeBlockAsm10B:
4029 ADDL $0x05, CX
4030 MOVL CX, BX
4031 SUBL 16(SP), BX
4032 MOVQ src_len+32(FP), R8
4033 SUBL CX, R8
4034 LEAQ (DX)(CX*1), R9
4035 LEAQ (DX)(BX*1), BX
4036
4037 // matchLen
4038 XORL R11, R11
4039
4040matchlen_loopback_16_repeat_extend_encodeBlockAsm10B:
4041 CMPL R8, $0x10
4042 JB matchlen_match8_repeat_extend_encodeBlockAsm10B
4043 MOVQ (R9)(R11*1), R10
4044 MOVQ 8(R9)(R11*1), R12
4045 XORQ (BX)(R11*1), R10
4046 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
4047 XORQ 8(BX)(R11*1), R12
4048 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B
4049 LEAL -16(R8), R8
4050 LEAL 16(R11), R11
4051 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B
4052
4053matchlen_bsf_16repeat_extend_encodeBlockAsm10B:
4054#ifdef GOAMD64_v3
4055 TZCNTQ R12, R12
4056
4057#else
4058 BSFQ R12, R12
4059
4060#endif
4061 SARQ $0x03, R12
4062 LEAL 8(R11)(R12*1), R11
4063 JMP repeat_extend_forward_end_encodeBlockAsm10B
4064
4065matchlen_match8_repeat_extend_encodeBlockAsm10B:
4066 CMPL R8, $0x08
4067 JB matchlen_match4_repeat_extend_encodeBlockAsm10B
4068 MOVQ (R9)(R11*1), R10
4069 XORQ (BX)(R11*1), R10
4070 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
4071 LEAL -8(R8), R8
4072 LEAL 8(R11), R11
4073 JMP matchlen_match4_repeat_extend_encodeBlockAsm10B
4074
4075matchlen_bsf_8_repeat_extend_encodeBlockAsm10B:
4076#ifdef GOAMD64_v3
4077 TZCNTQ R10, R10
4078
4079#else
4080 BSFQ R10, R10
4081
4082#endif
4083 SARQ $0x03, R10
4084 LEAL (R11)(R10*1), R11
4085 JMP repeat_extend_forward_end_encodeBlockAsm10B
4086
4087matchlen_match4_repeat_extend_encodeBlockAsm10B:
4088 CMPL R8, $0x04
4089 JB matchlen_match2_repeat_extend_encodeBlockAsm10B
4090 MOVL (R9)(R11*1), R10
4091 CMPL (BX)(R11*1), R10
4092 JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
4093 LEAL -4(R8), R8
4094 LEAL 4(R11), R11
4095
4096matchlen_match2_repeat_extend_encodeBlockAsm10B:
4097 CMPL R8, $0x01
4098 JE matchlen_match1_repeat_extend_encodeBlockAsm10B
4099 JB repeat_extend_forward_end_encodeBlockAsm10B
4100 MOVW (R9)(R11*1), R10
4101 CMPW (BX)(R11*1), R10
4102 JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
4103 LEAL 2(R11), R11
4104 SUBL $0x02, R8
4105 JZ repeat_extend_forward_end_encodeBlockAsm10B
4106
4107matchlen_match1_repeat_extend_encodeBlockAsm10B:
4108 MOVB (R9)(R11*1), R10
4109 CMPB (BX)(R11*1), R10
4110 JNE repeat_extend_forward_end_encodeBlockAsm10B
4111 LEAL 1(R11), R11
4112
4113repeat_extend_forward_end_encodeBlockAsm10B:
4114 ADDL R11, CX
4115 MOVL CX, BX
4116 SUBL SI, BX
4117 MOVL 16(SP), SI
4118 TESTL DI, DI
4119 JZ repeat_as_copy_encodeBlockAsm10B
4120
4121 // emitRepeat
4122 MOVL BX, DI
4123 LEAL -4(BX), BX
4124 CMPL DI, $0x08
4125 JBE repeat_two_match_repeat_encodeBlockAsm10B
4126 CMPL DI, $0x0c
4127 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
4128 CMPL SI, $0x00000800
4129 JB repeat_two_offset_match_repeat_encodeBlockAsm10B
4130
4131cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
4132 CMPL BX, $0x00000104
4133 JB repeat_three_match_repeat_encodeBlockAsm10B
4134 LEAL -256(BX), BX
4135 MOVW $0x0019, (AX)
4136 MOVW BX, 2(AX)
4137 ADDQ $0x04, AX
4138 JMP repeat_end_emit_encodeBlockAsm10B
4139
4140repeat_three_match_repeat_encodeBlockAsm10B:
4141 LEAL -4(BX), BX
4142 MOVW $0x0015, (AX)
4143 MOVB BL, 2(AX)
4144 ADDQ $0x03, AX
4145 JMP repeat_end_emit_encodeBlockAsm10B
4146
4147repeat_two_match_repeat_encodeBlockAsm10B:
4148 SHLL $0x02, BX
4149 ORL $0x01, BX
4150 MOVW BX, (AX)
4151 ADDQ $0x02, AX
4152 JMP repeat_end_emit_encodeBlockAsm10B
4153
4154repeat_two_offset_match_repeat_encodeBlockAsm10B:
4155 XORQ DI, DI
4156 LEAL 1(DI)(BX*4), BX
4157 MOVB SI, 1(AX)
4158 SARL $0x08, SI
4159 SHLL $0x05, SI
4160 ORL SI, BX
4161 MOVB BL, (AX)
4162 ADDQ $0x02, AX
4163 JMP repeat_end_emit_encodeBlockAsm10B
4164
4165repeat_as_copy_encodeBlockAsm10B:
4166 // emitCopy
4167 CMPL BX, $0x40
4168 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
4169 CMPL SI, $0x00000800
4170 JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
4171 MOVL $0x00000001, DI
4172 LEAL 16(DI), DI
4173 MOVB SI, 1(AX)
4174 SHRL $0x08, SI
4175 SHLL $0x05, SI
4176 ORL SI, DI
4177 MOVB DI, (AX)
4178 ADDQ $0x02, AX
4179 SUBL $0x08, BX
4180
4181 // emitRepeat
4182 LEAL -4(BX), BX
4183 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4184 MOVL BX, DI
4185 LEAL -4(BX), BX
4186 CMPL DI, $0x08
4187 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4188 CMPL DI, $0x0c
4189 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4190 CMPL SI, $0x00000800
4191 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4192
4193cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4194 CMPL BX, $0x00000104
4195 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4196 LEAL -256(BX), BX
4197 MOVW $0x0019, (AX)
4198 MOVW BX, 2(AX)
4199 ADDQ $0x04, AX
4200 JMP repeat_end_emit_encodeBlockAsm10B
4201
4202repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4203 LEAL -4(BX), BX
4204 MOVW $0x0015, (AX)
4205 MOVB BL, 2(AX)
4206 ADDQ $0x03, AX
4207 JMP repeat_end_emit_encodeBlockAsm10B
4208
4209repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4210 SHLL $0x02, BX
4211 ORL $0x01, BX
4212 MOVW BX, (AX)
4213 ADDQ $0x02, AX
4214 JMP repeat_end_emit_encodeBlockAsm10B
4215
4216repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4217 XORQ DI, DI
4218 LEAL 1(DI)(BX*4), BX
4219 MOVB SI, 1(AX)
4220 SARL $0x08, SI
4221 SHLL $0x05, SI
4222 ORL SI, BX
4223 MOVB BL, (AX)
4224 ADDQ $0x02, AX
4225 JMP repeat_end_emit_encodeBlockAsm10B
4226
4227long_offset_short_repeat_as_copy_encodeBlockAsm10B:
4228 MOVB $0xee, (AX)
4229 MOVW SI, 1(AX)
4230 LEAL -60(BX), BX
4231 ADDQ $0x03, AX
4232
4233 // emitRepeat
4234 MOVL BX, DI
4235 LEAL -4(BX), BX
4236 CMPL DI, $0x08
4237 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4238 CMPL DI, $0x0c
4239 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4240 CMPL SI, $0x00000800
4241 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4242
4243cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4244 CMPL BX, $0x00000104
4245 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4246 LEAL -256(BX), BX
4247 MOVW $0x0019, (AX)
4248 MOVW BX, 2(AX)
4249 ADDQ $0x04, AX
4250 JMP repeat_end_emit_encodeBlockAsm10B
4251
4252repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4253 LEAL -4(BX), BX
4254 MOVW $0x0015, (AX)
4255 MOVB BL, 2(AX)
4256 ADDQ $0x03, AX
4257 JMP repeat_end_emit_encodeBlockAsm10B
4258
4259repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4260 SHLL $0x02, BX
4261 ORL $0x01, BX
4262 MOVW BX, (AX)
4263 ADDQ $0x02, AX
4264 JMP repeat_end_emit_encodeBlockAsm10B
4265
4266repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4267 XORQ DI, DI
4268 LEAL 1(DI)(BX*4), BX
4269 MOVB SI, 1(AX)
4270 SARL $0x08, SI
4271 SHLL $0x05, SI
4272 ORL SI, BX
4273 MOVB BL, (AX)
4274 ADDQ $0x02, AX
4275 JMP repeat_end_emit_encodeBlockAsm10B
4276
4277two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
4278 MOVL BX, DI
4279 SHLL $0x02, DI
4280 CMPL BX, $0x0c
4281 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
4282 CMPL SI, $0x00000800
4283 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
4284 LEAL -15(DI), DI
4285 MOVB SI, 1(AX)
4286 SHRL $0x08, SI
4287 SHLL $0x05, SI
4288 ORL SI, DI
4289 MOVB DI, (AX)
4290 ADDQ $0x02, AX
4291 JMP repeat_end_emit_encodeBlockAsm10B
4292
4293emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
4294 LEAL -2(DI), DI
4295 MOVB DI, (AX)
4296 MOVW SI, 1(AX)
4297 ADDQ $0x03, AX
4298
4299repeat_end_emit_encodeBlockAsm10B:
4300 MOVL CX, 12(SP)
4301 JMP search_loop_encodeBlockAsm10B
4302
4303no_repeat_found_encodeBlockAsm10B:
4304 CMPL (DX)(BX*1), SI
4305 JEQ candidate_match_encodeBlockAsm10B
4306 SHRQ $0x08, SI
4307 MOVL 24(SP)(R9*4), BX
4308 LEAL 2(CX), R8
4309 CMPL (DX)(DI*1), SI
4310 JEQ candidate2_match_encodeBlockAsm10B
4311 MOVL R8, 24(SP)(R9*4)
4312 SHRQ $0x08, SI
4313 CMPL (DX)(BX*1), SI
4314 JEQ candidate3_match_encodeBlockAsm10B
4315 MOVL 20(SP), CX
4316 JMP search_loop_encodeBlockAsm10B
4317
4318candidate3_match_encodeBlockAsm10B:
4319 ADDL $0x02, CX
4320 JMP candidate_match_encodeBlockAsm10B
4321
4322candidate2_match_encodeBlockAsm10B:
4323 MOVL R8, 24(SP)(R9*4)
4324 INCL CX
4325 MOVL DI, BX
4326
4327candidate_match_encodeBlockAsm10B:
4328 MOVL 12(SP), SI
4329 TESTL BX, BX
4330 JZ match_extend_back_end_encodeBlockAsm10B
4331
4332match_extend_back_loop_encodeBlockAsm10B:
4333 CMPL CX, SI
4334 JBE match_extend_back_end_encodeBlockAsm10B
4335 MOVB -1(DX)(BX*1), DI
4336 MOVB -1(DX)(CX*1), R8
4337 CMPB DI, R8
4338 JNE match_extend_back_end_encodeBlockAsm10B
4339 LEAL -1(CX), CX
4340 DECL BX
4341 JZ match_extend_back_end_encodeBlockAsm10B
4342 JMP match_extend_back_loop_encodeBlockAsm10B
4343
4344match_extend_back_end_encodeBlockAsm10B:
4345 MOVL CX, SI
4346 SUBL 12(SP), SI
4347 LEAQ 3(AX)(SI*1), SI
4348 CMPQ SI, (SP)
4349 JB match_dst_size_check_encodeBlockAsm10B
4350 MOVQ $0x00000000, ret+48(FP)
4351 RET
4352
4353match_dst_size_check_encodeBlockAsm10B:
4354 MOVL CX, SI
4355 MOVL 12(SP), DI
4356 CMPL DI, SI
4357 JEQ emit_literal_done_match_emit_encodeBlockAsm10B
4358 MOVL SI, R8
4359 MOVL SI, 12(SP)
4360 LEAQ (DX)(DI*1), SI
4361 SUBL DI, R8
4362 LEAL -1(R8), DI
4363 CMPL DI, $0x3c
4364 JB one_byte_match_emit_encodeBlockAsm10B
4365 CMPL DI, $0x00000100
4366 JB two_bytes_match_emit_encodeBlockAsm10B
4367 JB three_bytes_match_emit_encodeBlockAsm10B
4368
4369three_bytes_match_emit_encodeBlockAsm10B:
4370 MOVB $0xf4, (AX)
4371 MOVW DI, 1(AX)
4372 ADDQ $0x03, AX
4373 JMP memmove_long_match_emit_encodeBlockAsm10B
4374
4375two_bytes_match_emit_encodeBlockAsm10B:
4376 MOVB $0xf0, (AX)
4377 MOVB DI, 1(AX)
4378 ADDQ $0x02, AX
4379 CMPL DI, $0x40
4380 JB memmove_match_emit_encodeBlockAsm10B
4381 JMP memmove_long_match_emit_encodeBlockAsm10B
4382
4383one_byte_match_emit_encodeBlockAsm10B:
4384 SHLB $0x02, DI
4385 MOVB DI, (AX)
4386 ADDQ $0x01, AX
4387
4388memmove_match_emit_encodeBlockAsm10B:
4389 LEAQ (AX)(R8*1), DI
4390
4391 // genMemMoveShort
4392 CMPQ R8, $0x08
4393 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
4394 CMPQ R8, $0x10
4395 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
4396 CMPQ R8, $0x20
4397 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
4398 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
4399
4400emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
4401 MOVQ (SI), R9
4402 MOVQ R9, (AX)
4403 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4404
4405emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
4406 MOVQ (SI), R9
4407 MOVQ -8(SI)(R8*1), SI
4408 MOVQ R9, (AX)
4409 MOVQ SI, -8(AX)(R8*1)
4410 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4411
4412emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
4413 MOVOU (SI), X0
4414 MOVOU -16(SI)(R8*1), X1
4415 MOVOU X0, (AX)
4416 MOVOU X1, -16(AX)(R8*1)
4417 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4418
4419emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
4420 MOVOU (SI), X0
4421 MOVOU 16(SI), X1
4422 MOVOU -32(SI)(R8*1), X2
4423 MOVOU -16(SI)(R8*1), X3
4424 MOVOU X0, (AX)
4425 MOVOU X1, 16(AX)
4426 MOVOU X2, -32(AX)(R8*1)
4427 MOVOU X3, -16(AX)(R8*1)
4428
4429memmove_end_copy_match_emit_encodeBlockAsm10B:
4430 MOVQ DI, AX
4431 JMP emit_literal_done_match_emit_encodeBlockAsm10B
4432
4433memmove_long_match_emit_encodeBlockAsm10B:
4434 LEAQ (AX)(R8*1), DI
4435
4436 // genMemMoveLong
4437 MOVOU (SI), X0
4438 MOVOU 16(SI), X1
4439 MOVOU -32(SI)(R8*1), X2
4440 MOVOU -16(SI)(R8*1), X3
4441 MOVQ R8, R10
4442 SHRQ $0x05, R10
4443 MOVQ AX, R9
4444 ANDL $0x0000001f, R9
4445 MOVQ $0x00000040, R11
4446 SUBQ R9, R11
4447 DECQ R10
4448 JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4449 LEAQ -32(SI)(R11*1), R9
4450 LEAQ -32(AX)(R11*1), R12
4451
4452emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
4453 MOVOU (R9), X4
4454 MOVOU 16(R9), X5
4455 MOVOA X4, (R12)
4456 MOVOA X5, 16(R12)
4457 ADDQ $0x20, R12
4458 ADDQ $0x20, R9
4459 ADDQ $0x20, R11
4460 DECQ R10
4461 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
4462
4463emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
4464 MOVOU -32(SI)(R11*1), X4
4465 MOVOU -16(SI)(R11*1), X5
4466 MOVOA X4, -32(AX)(R11*1)
4467 MOVOA X5, -16(AX)(R11*1)
4468 ADDQ $0x20, R11
4469 CMPQ R8, R11
4470 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4471 MOVOU X0, (AX)
4472 MOVOU X1, 16(AX)
4473 MOVOU X2, -32(AX)(R8*1)
4474 MOVOU X3, -16(AX)(R8*1)
4475 MOVQ DI, AX
4476
4477emit_literal_done_match_emit_encodeBlockAsm10B:
4478match_nolit_loop_encodeBlockAsm10B:
4479 MOVL CX, SI
4480 SUBL BX, SI
4481 MOVL SI, 16(SP)
4482 ADDL $0x04, CX
4483 ADDL $0x04, BX
4484 MOVQ src_len+32(FP), SI
4485 SUBL CX, SI
4486 LEAQ (DX)(CX*1), DI
4487 LEAQ (DX)(BX*1), BX
4488
4489 // matchLen
4490 XORL R9, R9
4491
4492matchlen_loopback_16_match_nolit_encodeBlockAsm10B:
4493 CMPL SI, $0x10
4494 JB matchlen_match8_match_nolit_encodeBlockAsm10B
4495 MOVQ (DI)(R9*1), R8
4496 MOVQ 8(DI)(R9*1), R10
4497 XORQ (BX)(R9*1), R8
4498 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
4499 XORQ 8(BX)(R9*1), R10
4500 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B
4501 LEAL -16(SI), SI
4502 LEAL 16(R9), R9
4503 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B
4504
4505matchlen_bsf_16match_nolit_encodeBlockAsm10B:
4506#ifdef GOAMD64_v3
4507 TZCNTQ R10, R10
4508
4509#else
4510 BSFQ R10, R10
4511
4512#endif
4513 SARQ $0x03, R10
4514 LEAL 8(R9)(R10*1), R9
4515 JMP match_nolit_end_encodeBlockAsm10B
4516
4517matchlen_match8_match_nolit_encodeBlockAsm10B:
4518 CMPL SI, $0x08
4519 JB matchlen_match4_match_nolit_encodeBlockAsm10B
4520 MOVQ (DI)(R9*1), R8
4521 XORQ (BX)(R9*1), R8
4522 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
4523 LEAL -8(SI), SI
4524 LEAL 8(R9), R9
4525 JMP matchlen_match4_match_nolit_encodeBlockAsm10B
4526
4527matchlen_bsf_8_match_nolit_encodeBlockAsm10B:
4528#ifdef GOAMD64_v3
4529 TZCNTQ R8, R8
4530
4531#else
4532 BSFQ R8, R8
4533
4534#endif
4535 SARQ $0x03, R8
4536 LEAL (R9)(R8*1), R9
4537 JMP match_nolit_end_encodeBlockAsm10B
4538
4539matchlen_match4_match_nolit_encodeBlockAsm10B:
4540 CMPL SI, $0x04
4541 JB matchlen_match2_match_nolit_encodeBlockAsm10B
4542 MOVL (DI)(R9*1), R8
4543 CMPL (BX)(R9*1), R8
4544 JNE matchlen_match2_match_nolit_encodeBlockAsm10B
4545 LEAL -4(SI), SI
4546 LEAL 4(R9), R9
4547
4548matchlen_match2_match_nolit_encodeBlockAsm10B:
4549 CMPL SI, $0x01
4550 JE matchlen_match1_match_nolit_encodeBlockAsm10B
4551 JB match_nolit_end_encodeBlockAsm10B
4552 MOVW (DI)(R9*1), R8
4553 CMPW (BX)(R9*1), R8
4554 JNE matchlen_match1_match_nolit_encodeBlockAsm10B
4555 LEAL 2(R9), R9
4556 SUBL $0x02, SI
4557 JZ match_nolit_end_encodeBlockAsm10B
4558
4559matchlen_match1_match_nolit_encodeBlockAsm10B:
4560 MOVB (DI)(R9*1), R8
4561 CMPB (BX)(R9*1), R8
4562 JNE match_nolit_end_encodeBlockAsm10B
4563 LEAL 1(R9), R9
4564
4565match_nolit_end_encodeBlockAsm10B:
4566 ADDL R9, CX
4567 MOVL 16(SP), BX
4568 ADDL $0x04, R9
4569 MOVL CX, 12(SP)
4570
4571 // emitCopy
4572 CMPL R9, $0x40
4573 JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B
4574 CMPL BX, $0x00000800
4575 JAE long_offset_short_match_nolit_encodeBlockAsm10B
4576 MOVL $0x00000001, SI
4577 LEAL 16(SI), SI
4578 MOVB BL, 1(AX)
4579 SHRL $0x08, BX
4580 SHLL $0x05, BX
4581 ORL BX, SI
4582 MOVB SI, (AX)
4583 ADDQ $0x02, AX
4584 SUBL $0x08, R9
4585
4586 // emitRepeat
4587 LEAL -4(R9), R9
4588 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4589 MOVL R9, SI
4590 LEAL -4(R9), R9
4591 CMPL SI, $0x08
4592 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4593 CMPL SI, $0x0c
4594 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4595 CMPL BX, $0x00000800
4596 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4597
4598cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4599 CMPL R9, $0x00000104
4600 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4601 LEAL -256(R9), R9
4602 MOVW $0x0019, (AX)
4603 MOVW R9, 2(AX)
4604 ADDQ $0x04, AX
4605 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4606
4607repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4608 LEAL -4(R9), R9
4609 MOVW $0x0015, (AX)
4610 MOVB R9, 2(AX)
4611 ADDQ $0x03, AX
4612 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4613
4614repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4615 SHLL $0x02, R9
4616 ORL $0x01, R9
4617 MOVW R9, (AX)
4618 ADDQ $0x02, AX
4619 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4620
4621repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4622 XORQ SI, SI
4623 LEAL 1(SI)(R9*4), R9
4624 MOVB BL, 1(AX)
4625 SARL $0x08, BX
4626 SHLL $0x05, BX
4627 ORL BX, R9
4628 MOVB R9, (AX)
4629 ADDQ $0x02, AX
4630 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4631
4632long_offset_short_match_nolit_encodeBlockAsm10B:
4633 MOVB $0xee, (AX)
4634 MOVW BX, 1(AX)
4635 LEAL -60(R9), R9
4636 ADDQ $0x03, AX
4637
4638 // emitRepeat
4639 MOVL R9, SI
4640 LEAL -4(R9), R9
4641 CMPL SI, $0x08
4642 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
4643 CMPL SI, $0x0c
4644 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
4645 CMPL BX, $0x00000800
4646 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
4647
4648cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
4649 CMPL R9, $0x00000104
4650 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
4651 LEAL -256(R9), R9
4652 MOVW $0x0019, (AX)
4653 MOVW R9, 2(AX)
4654 ADDQ $0x04, AX
4655 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4656
4657repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
4658 LEAL -4(R9), R9
4659 MOVW $0x0015, (AX)
4660 MOVB R9, 2(AX)
4661 ADDQ $0x03, AX
4662 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4663
4664repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
4665 SHLL $0x02, R9
4666 ORL $0x01, R9
4667 MOVW R9, (AX)
4668 ADDQ $0x02, AX
4669 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4670
4671repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
4672 XORQ SI, SI
4673 LEAL 1(SI)(R9*4), R9
4674 MOVB BL, 1(AX)
4675 SARL $0x08, BX
4676 SHLL $0x05, BX
4677 ORL BX, R9
4678 MOVB R9, (AX)
4679 ADDQ $0x02, AX
4680 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4681
4682two_byte_offset_short_match_nolit_encodeBlockAsm10B:
4683 MOVL R9, SI
4684 SHLL $0x02, SI
4685 CMPL R9, $0x0c
4686 JAE emit_copy_three_match_nolit_encodeBlockAsm10B
4687 CMPL BX, $0x00000800
4688 JAE emit_copy_three_match_nolit_encodeBlockAsm10B
4689 LEAL -15(SI), SI
4690 MOVB BL, 1(AX)
4691 SHRL $0x08, BX
4692 SHLL $0x05, BX
4693 ORL BX, SI
4694 MOVB SI, (AX)
4695 ADDQ $0x02, AX
4696 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4697
4698emit_copy_three_match_nolit_encodeBlockAsm10B:
4699 LEAL -2(SI), SI
4700 MOVB SI, (AX)
4701 MOVW BX, 1(AX)
4702 ADDQ $0x03, AX
4703
4704match_nolit_emitcopy_end_encodeBlockAsm10B:
4705 CMPL CX, 8(SP)
4706 JAE emit_remainder_encodeBlockAsm10B
4707 MOVQ -2(DX)(CX*1), SI
4708 CMPQ AX, (SP)
4709 JB match_nolit_dst_ok_encodeBlockAsm10B
4710 MOVQ $0x00000000, ret+48(FP)
4711 RET
4712
4713match_nolit_dst_ok_encodeBlockAsm10B:
4714 MOVQ $0x9e3779b1, R8
4715 MOVQ SI, DI
4716 SHRQ $0x10, SI
4717 MOVQ SI, BX
4718 SHLQ $0x20, DI
4719 IMULQ R8, DI
4720 SHRQ $0x36, DI
4721 SHLQ $0x20, BX
4722 IMULQ R8, BX
4723 SHRQ $0x36, BX
4724 LEAL -2(CX), R8
4725 LEAQ 24(SP)(BX*4), R9
4726 MOVL (R9), BX
4727 MOVL R8, 24(SP)(DI*4)
4728 MOVL CX, (R9)
4729 CMPL (DX)(BX*1), SI
4730 JEQ match_nolit_loop_encodeBlockAsm10B
4731 INCL CX
4732 JMP search_loop_encodeBlockAsm10B
4733
4734emit_remainder_encodeBlockAsm10B:
4735 MOVQ src_len+32(FP), CX
4736 SUBL 12(SP), CX
4737 LEAQ 3(AX)(CX*1), CX
4738 CMPQ CX, (SP)
4739 JB emit_remainder_ok_encodeBlockAsm10B
4740 MOVQ $0x00000000, ret+48(FP)
4741 RET
4742
4743emit_remainder_ok_encodeBlockAsm10B:
4744 MOVQ src_len+32(FP), CX
4745 MOVL 12(SP), BX
4746 CMPL BX, CX
4747 JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
4748 MOVL CX, SI
4749 MOVL CX, 12(SP)
4750 LEAQ (DX)(BX*1), CX
4751 SUBL BX, SI
4752 LEAL -1(SI), DX
4753 CMPL DX, $0x3c
4754 JB one_byte_emit_remainder_encodeBlockAsm10B
4755 CMPL DX, $0x00000100
4756 JB two_bytes_emit_remainder_encodeBlockAsm10B
4757 JB three_bytes_emit_remainder_encodeBlockAsm10B
4758
4759three_bytes_emit_remainder_encodeBlockAsm10B:
4760 MOVB $0xf4, (AX)
4761 MOVW DX, 1(AX)
4762 ADDQ $0x03, AX
4763 JMP memmove_long_emit_remainder_encodeBlockAsm10B
4764
4765two_bytes_emit_remainder_encodeBlockAsm10B:
4766 MOVB $0xf0, (AX)
4767 MOVB DL, 1(AX)
4768 ADDQ $0x02, AX
4769 CMPL DX, $0x40
4770 JB memmove_emit_remainder_encodeBlockAsm10B
4771 JMP memmove_long_emit_remainder_encodeBlockAsm10B
4772
4773one_byte_emit_remainder_encodeBlockAsm10B:
4774 SHLB $0x02, DL
4775 MOVB DL, (AX)
4776 ADDQ $0x01, AX
4777
4778memmove_emit_remainder_encodeBlockAsm10B:
4779 LEAQ (AX)(SI*1), DX
4780 MOVL SI, BX
4781
4782 // genMemMoveShort
4783 CMPQ BX, $0x03
4784 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
4785 JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
4786 CMPQ BX, $0x08
4787 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
4788 CMPQ BX, $0x10
4789 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
4790 CMPQ BX, $0x20
4791 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
4792 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
4793
4794emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
4795 MOVB (CX), SI
4796 MOVB -1(CX)(BX*1), CL
4797 MOVB SI, (AX)
4798 MOVB CL, -1(AX)(BX*1)
4799 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4800
4801emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
4802 MOVW (CX), SI
4803 MOVB 2(CX), CL
4804 MOVW SI, (AX)
4805 MOVB CL, 2(AX)
4806 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4807
4808emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
4809 MOVL (CX), SI
4810 MOVL -4(CX)(BX*1), CX
4811 MOVL SI, (AX)
4812 MOVL CX, -4(AX)(BX*1)
4813 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4814
4815emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
4816 MOVQ (CX), SI
4817 MOVQ -8(CX)(BX*1), CX
4818 MOVQ SI, (AX)
4819 MOVQ CX, -8(AX)(BX*1)
4820 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4821
4822emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
4823 MOVOU (CX), X0
4824 MOVOU -16(CX)(BX*1), X1
4825 MOVOU X0, (AX)
4826 MOVOU X1, -16(AX)(BX*1)
4827 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4828
4829emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
4830 MOVOU (CX), X0
4831 MOVOU 16(CX), X1
4832 MOVOU -32(CX)(BX*1), X2
4833 MOVOU -16(CX)(BX*1), X3
4834 MOVOU X0, (AX)
4835 MOVOU X1, 16(AX)
4836 MOVOU X2, -32(AX)(BX*1)
4837 MOVOU X3, -16(AX)(BX*1)
4838
4839memmove_end_copy_emit_remainder_encodeBlockAsm10B:
4840 MOVQ DX, AX
4841 JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
4842
4843memmove_long_emit_remainder_encodeBlockAsm10B:
4844 LEAQ (AX)(SI*1), DX
4845 MOVL SI, BX
4846
4847 // genMemMoveLong
4848 MOVOU (CX), X0
4849 MOVOU 16(CX), X1
4850 MOVOU -32(CX)(BX*1), X2
4851 MOVOU -16(CX)(BX*1), X3
4852 MOVQ BX, DI
4853 SHRQ $0x05, DI
4854 MOVQ AX, SI
4855 ANDL $0x0000001f, SI
4856 MOVQ $0x00000040, R8
4857 SUBQ SI, R8
4858 DECQ DI
4859 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4860 LEAQ -32(CX)(R8*1), SI
4861 LEAQ -32(AX)(R8*1), R9
4862
4863emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
4864 MOVOU (SI), X4
4865 MOVOU 16(SI), X5
4866 MOVOA X4, (R9)
4867 MOVOA X5, 16(R9)
4868 ADDQ $0x20, R9
4869 ADDQ $0x20, SI
4870 ADDQ $0x20, R8
4871 DECQ DI
4872 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
4873
4874emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
4875 MOVOU -32(CX)(R8*1), X4
4876 MOVOU -16(CX)(R8*1), X5
4877 MOVOA X4, -32(AX)(R8*1)
4878 MOVOA X5, -16(AX)(R8*1)
4879 ADDQ $0x20, R8
4880 CMPQ BX, R8
4881 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4882 MOVOU X0, (AX)
4883 MOVOU X1, 16(AX)
4884 MOVOU X2, -32(AX)(BX*1)
4885 MOVOU X3, -16(AX)(BX*1)
4886 MOVQ DX, AX
4887
4888emit_literal_done_emit_remainder_encodeBlockAsm10B:
4889 MOVQ dst_base+0(FP), CX
4890 SUBQ CX, AX
4891 MOVQ AX, ret+48(FP)
4892 RET
4893
4894// func encodeBlockAsm8B(dst []byte, src []byte) int
4895// Requires: BMI, SSE2
4896TEXT ·encodeBlockAsm8B(SB), $1048-56
4897 MOVQ dst_base+0(FP), AX
4898 MOVQ $0x00000008, CX
4899 LEAQ 24(SP), DX
4900 PXOR X0, X0
4901
4902zero_loop_encodeBlockAsm8B:
4903 MOVOU X0, (DX)
4904 MOVOU X0, 16(DX)
4905 MOVOU X0, 32(DX)
4906 MOVOU X0, 48(DX)
4907 MOVOU X0, 64(DX)
4908 MOVOU X0, 80(DX)
4909 MOVOU X0, 96(DX)
4910 MOVOU X0, 112(DX)
4911 ADDQ $0x80, DX
4912 DECQ CX
4913 JNZ zero_loop_encodeBlockAsm8B
4914 MOVL $0x00000000, 12(SP)
4915 MOVQ src_len+32(FP), CX
4916 LEAQ -9(CX), DX
4917 LEAQ -8(CX), BX
4918 MOVL BX, 8(SP)
4919 SHRQ $0x05, CX
4920 SUBL CX, DX
4921 LEAQ (AX)(DX*1), DX
4922 MOVQ DX, (SP)
4923 MOVL $0x00000001, CX
4924 MOVL CX, 16(SP)
4925 MOVQ src_base+24(FP), DX
4926
4927search_loop_encodeBlockAsm8B:
4928 MOVL CX, BX
4929 SUBL 12(SP), BX
4930 SHRL $0x04, BX
4931 LEAL 4(CX)(BX*1), BX
4932 CMPL BX, 8(SP)
4933 JAE emit_remainder_encodeBlockAsm8B
4934 MOVQ (DX)(CX*1), SI
4935 MOVL BX, 20(SP)
4936 MOVQ $0x9e3779b1, R8
4937 MOVQ SI, R9
4938 MOVQ SI, R10
4939 SHRQ $0x08, R10
4940 SHLQ $0x20, R9
4941 IMULQ R8, R9
4942 SHRQ $0x38, R9
4943 SHLQ $0x20, R10
4944 IMULQ R8, R10
4945 SHRQ $0x38, R10
4946 MOVL 24(SP)(R9*4), BX
4947 MOVL 24(SP)(R10*4), DI
4948 MOVL CX, 24(SP)(R9*4)
4949 LEAL 1(CX), R9
4950 MOVL R9, 24(SP)(R10*4)
4951 MOVQ SI, R9
4952 SHRQ $0x10, R9
4953 SHLQ $0x20, R9
4954 IMULQ R8, R9
4955 SHRQ $0x38, R9
4956 MOVL CX, R8
4957 SUBL 16(SP), R8
4958 MOVL 1(DX)(R8*1), R10
4959 MOVQ SI, R8
4960 SHRQ $0x08, R8
4961 CMPL R8, R10
4962 JNE no_repeat_found_encodeBlockAsm8B
4963 LEAL 1(CX), SI
4964 MOVL 12(SP), DI
4965 MOVL SI, BX
4966 SUBL 16(SP), BX
4967 JZ repeat_extend_back_end_encodeBlockAsm8B
4968
4969repeat_extend_back_loop_encodeBlockAsm8B:
4970 CMPL SI, DI
4971 JBE repeat_extend_back_end_encodeBlockAsm8B
4972 MOVB -1(DX)(BX*1), R8
4973 MOVB -1(DX)(SI*1), R9
4974 CMPB R8, R9
4975 JNE repeat_extend_back_end_encodeBlockAsm8B
4976 LEAL -1(SI), SI
4977 DECL BX
4978 JNZ repeat_extend_back_loop_encodeBlockAsm8B
4979
4980repeat_extend_back_end_encodeBlockAsm8B:
4981 MOVL 12(SP), BX
4982 CMPL BX, SI
4983 JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
4984 MOVL SI, R8
4985 MOVL SI, 12(SP)
4986 LEAQ (DX)(BX*1), R9
4987 SUBL BX, R8
4988 LEAL -1(R8), BX
4989 CMPL BX, $0x3c
4990 JB one_byte_repeat_emit_encodeBlockAsm8B
4991 CMPL BX, $0x00000100
4992 JB two_bytes_repeat_emit_encodeBlockAsm8B
4993 JB three_bytes_repeat_emit_encodeBlockAsm8B
4994
4995three_bytes_repeat_emit_encodeBlockAsm8B:
4996 MOVB $0xf4, (AX)
4997 MOVW BX, 1(AX)
4998 ADDQ $0x03, AX
4999 JMP memmove_long_repeat_emit_encodeBlockAsm8B
5000
5001two_bytes_repeat_emit_encodeBlockAsm8B:
5002 MOVB $0xf0, (AX)
5003 MOVB BL, 1(AX)
5004 ADDQ $0x02, AX
5005 CMPL BX, $0x40
5006 JB memmove_repeat_emit_encodeBlockAsm8B
5007 JMP memmove_long_repeat_emit_encodeBlockAsm8B
5008
5009one_byte_repeat_emit_encodeBlockAsm8B:
5010 SHLB $0x02, BL
5011 MOVB BL, (AX)
5012 ADDQ $0x01, AX
5013
5014memmove_repeat_emit_encodeBlockAsm8B:
5015 LEAQ (AX)(R8*1), BX
5016
5017 // genMemMoveShort
5018 CMPQ R8, $0x08
5019 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
5020 CMPQ R8, $0x10
5021 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
5022 CMPQ R8, $0x20
5023 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
5024 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
5025
5026emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
5027 MOVQ (R9), R10
5028 MOVQ R10, (AX)
5029 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5030
5031emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
5032 MOVQ (R9), R10
5033 MOVQ -8(R9)(R8*1), R9
5034 MOVQ R10, (AX)
5035 MOVQ R9, -8(AX)(R8*1)
5036 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5037
5038emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
5039 MOVOU (R9), X0
5040 MOVOU -16(R9)(R8*1), X1
5041 MOVOU X0, (AX)
5042 MOVOU X1, -16(AX)(R8*1)
5043 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5044
5045emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
5046 MOVOU (R9), X0
5047 MOVOU 16(R9), X1
5048 MOVOU -32(R9)(R8*1), X2
5049 MOVOU -16(R9)(R8*1), X3
5050 MOVOU X0, (AX)
5051 MOVOU X1, 16(AX)
5052 MOVOU X2, -32(AX)(R8*1)
5053 MOVOU X3, -16(AX)(R8*1)
5054
5055memmove_end_copy_repeat_emit_encodeBlockAsm8B:
5056 MOVQ BX, AX
5057 JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
5058
5059memmove_long_repeat_emit_encodeBlockAsm8B:
5060 LEAQ (AX)(R8*1), BX
5061
5062 // genMemMoveLong
5063 MOVOU (R9), X0
5064 MOVOU 16(R9), X1
5065 MOVOU -32(R9)(R8*1), X2
5066 MOVOU -16(R9)(R8*1), X3
5067 MOVQ R8, R11
5068 SHRQ $0x05, R11
5069 MOVQ AX, R10
5070 ANDL $0x0000001f, R10
5071 MOVQ $0x00000040, R12
5072 SUBQ R10, R12
5073 DECQ R11
5074 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5075 LEAQ -32(R9)(R12*1), R10
5076 LEAQ -32(AX)(R12*1), R13
5077
5078emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
5079 MOVOU (R10), X4
5080 MOVOU 16(R10), X5
5081 MOVOA X4, (R13)
5082 MOVOA X5, 16(R13)
5083 ADDQ $0x20, R13
5084 ADDQ $0x20, R10
5085 ADDQ $0x20, R12
5086 DECQ R11
5087 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
5088
5089emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
5090 MOVOU -32(R9)(R12*1), X4
5091 MOVOU -16(R9)(R12*1), X5
5092 MOVOA X4, -32(AX)(R12*1)
5093 MOVOA X5, -16(AX)(R12*1)
5094 ADDQ $0x20, R12
5095 CMPQ R8, R12
5096 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5097 MOVOU X0, (AX)
5098 MOVOU X1, 16(AX)
5099 MOVOU X2, -32(AX)(R8*1)
5100 MOVOU X3, -16(AX)(R8*1)
5101 MOVQ BX, AX
5102
5103emit_literal_done_repeat_emit_encodeBlockAsm8B:
5104 ADDL $0x05, CX
5105 MOVL CX, BX
5106 SUBL 16(SP), BX
5107 MOVQ src_len+32(FP), R8
5108 SUBL CX, R8
5109 LEAQ (DX)(CX*1), R9
5110 LEAQ (DX)(BX*1), BX
5111
5112 // matchLen
5113 XORL R11, R11
5114
5115matchlen_loopback_16_repeat_extend_encodeBlockAsm8B:
5116 CMPL R8, $0x10
5117 JB matchlen_match8_repeat_extend_encodeBlockAsm8B
5118 MOVQ (R9)(R11*1), R10
5119 MOVQ 8(R9)(R11*1), R12
5120 XORQ (BX)(R11*1), R10
5121 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
5122 XORQ 8(BX)(R11*1), R12
5123 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B
5124 LEAL -16(R8), R8
5125 LEAL 16(R11), R11
5126 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B
5127
5128matchlen_bsf_16repeat_extend_encodeBlockAsm8B:
5129#ifdef GOAMD64_v3
5130 TZCNTQ R12, R12
5131
5132#else
5133 BSFQ R12, R12
5134
5135#endif
5136 SARQ $0x03, R12
5137 LEAL 8(R11)(R12*1), R11
5138 JMP repeat_extend_forward_end_encodeBlockAsm8B
5139
5140matchlen_match8_repeat_extend_encodeBlockAsm8B:
5141 CMPL R8, $0x08
5142 JB matchlen_match4_repeat_extend_encodeBlockAsm8B
5143 MOVQ (R9)(R11*1), R10
5144 XORQ (BX)(R11*1), R10
5145 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
5146 LEAL -8(R8), R8
5147 LEAL 8(R11), R11
5148 JMP matchlen_match4_repeat_extend_encodeBlockAsm8B
5149
5150matchlen_bsf_8_repeat_extend_encodeBlockAsm8B:
5151#ifdef GOAMD64_v3
5152 TZCNTQ R10, R10
5153
5154#else
5155 BSFQ R10, R10
5156
5157#endif
5158 SARQ $0x03, R10
5159 LEAL (R11)(R10*1), R11
5160 JMP repeat_extend_forward_end_encodeBlockAsm8B
5161
5162matchlen_match4_repeat_extend_encodeBlockAsm8B:
5163 CMPL R8, $0x04
5164 JB matchlen_match2_repeat_extend_encodeBlockAsm8B
5165 MOVL (R9)(R11*1), R10
5166 CMPL (BX)(R11*1), R10
5167 JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
5168 LEAL -4(R8), R8
5169 LEAL 4(R11), R11
5170
5171matchlen_match2_repeat_extend_encodeBlockAsm8B:
5172 CMPL R8, $0x01
5173 JE matchlen_match1_repeat_extend_encodeBlockAsm8B
5174 JB repeat_extend_forward_end_encodeBlockAsm8B
5175 MOVW (R9)(R11*1), R10
5176 CMPW (BX)(R11*1), R10
5177 JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
5178 LEAL 2(R11), R11
5179 SUBL $0x02, R8
5180 JZ repeat_extend_forward_end_encodeBlockAsm8B
5181
5182matchlen_match1_repeat_extend_encodeBlockAsm8B:
5183 MOVB (R9)(R11*1), R10
5184 CMPB (BX)(R11*1), R10
5185 JNE repeat_extend_forward_end_encodeBlockAsm8B
5186 LEAL 1(R11), R11
5187
5188repeat_extend_forward_end_encodeBlockAsm8B:
5189 ADDL R11, CX
5190 MOVL CX, BX
5191 SUBL SI, BX
5192 MOVL 16(SP), SI
5193 TESTL DI, DI
5194 JZ repeat_as_copy_encodeBlockAsm8B
5195
5196 // emitRepeat
5197 MOVL BX, SI
5198 LEAL -4(BX), BX
5199 CMPL SI, $0x08
5200 JBE repeat_two_match_repeat_encodeBlockAsm8B
5201 CMPL SI, $0x0c
5202 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
5203
5204cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
5205 CMPL BX, $0x00000104
5206 JB repeat_three_match_repeat_encodeBlockAsm8B
5207 LEAL -256(BX), BX
5208 MOVW $0x0019, (AX)
5209 MOVW BX, 2(AX)
5210 ADDQ $0x04, AX
5211 JMP repeat_end_emit_encodeBlockAsm8B
5212
5213repeat_three_match_repeat_encodeBlockAsm8B:
5214 LEAL -4(BX), BX
5215 MOVW $0x0015, (AX)
5216 MOVB BL, 2(AX)
5217 ADDQ $0x03, AX
5218 JMP repeat_end_emit_encodeBlockAsm8B
5219
5220repeat_two_match_repeat_encodeBlockAsm8B:
5221 SHLL $0x02, BX
5222 ORL $0x01, BX
5223 MOVW BX, (AX)
5224 ADDQ $0x02, AX
5225 JMP repeat_end_emit_encodeBlockAsm8B
5226 XORQ DI, DI
5227 LEAL 1(DI)(BX*4), BX
5228 MOVB SI, 1(AX)
5229 SARL $0x08, SI
5230 SHLL $0x05, SI
5231 ORL SI, BX
5232 MOVB BL, (AX)
5233 ADDQ $0x02, AX
5234 JMP repeat_end_emit_encodeBlockAsm8B
5235
5236repeat_as_copy_encodeBlockAsm8B:
5237 // emitCopy
5238 CMPL BX, $0x40
5239 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
5240 CMPL SI, $0x00000800
5241 JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
5242 MOVL $0x00000001, DI
5243 LEAL 16(DI), DI
5244 MOVB SI, 1(AX)
5245 SHRL $0x08, SI
5246 SHLL $0x05, SI
5247 ORL SI, DI
5248 MOVB DI, (AX)
5249 ADDQ $0x02, AX
5250 SUBL $0x08, BX
5251
5252 // emitRepeat
5253 LEAL -4(BX), BX
5254 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5255 MOVL BX, SI
5256 LEAL -4(BX), BX
5257 CMPL SI, $0x08
5258 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5259 CMPL SI, $0x0c
5260 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5261
5262cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5263 CMPL BX, $0x00000104
5264 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5265 LEAL -256(BX), BX
5266 MOVW $0x0019, (AX)
5267 MOVW BX, 2(AX)
5268 ADDQ $0x04, AX
5269 JMP repeat_end_emit_encodeBlockAsm8B
5270
5271repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5272 LEAL -4(BX), BX
5273 MOVW $0x0015, (AX)
5274 MOVB BL, 2(AX)
5275 ADDQ $0x03, AX
5276 JMP repeat_end_emit_encodeBlockAsm8B
5277
5278repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5279 SHLL $0x02, BX
5280 ORL $0x01, BX
5281 MOVW BX, (AX)
5282 ADDQ $0x02, AX
5283 JMP repeat_end_emit_encodeBlockAsm8B
5284 XORQ DI, DI
5285 LEAL 1(DI)(BX*4), BX
5286 MOVB SI, 1(AX)
5287 SARL $0x08, SI
5288 SHLL $0x05, SI
5289 ORL SI, BX
5290 MOVB BL, (AX)
5291 ADDQ $0x02, AX
5292 JMP repeat_end_emit_encodeBlockAsm8B
5293
5294long_offset_short_repeat_as_copy_encodeBlockAsm8B:
5295 MOVB $0xee, (AX)
5296 MOVW SI, 1(AX)
5297 LEAL -60(BX), BX
5298 ADDQ $0x03, AX
5299
5300 // emitRepeat
5301 MOVL BX, SI
5302 LEAL -4(BX), BX
5303 CMPL SI, $0x08
5304 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5305 CMPL SI, $0x0c
5306 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5307
5308cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5309 CMPL BX, $0x00000104
5310 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5311 LEAL -256(BX), BX
5312 MOVW $0x0019, (AX)
5313 MOVW BX, 2(AX)
5314 ADDQ $0x04, AX
5315 JMP repeat_end_emit_encodeBlockAsm8B
5316
5317repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5318 LEAL -4(BX), BX
5319 MOVW $0x0015, (AX)
5320 MOVB BL, 2(AX)
5321 ADDQ $0x03, AX
5322 JMP repeat_end_emit_encodeBlockAsm8B
5323
5324repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5325 SHLL $0x02, BX
5326 ORL $0x01, BX
5327 MOVW BX, (AX)
5328 ADDQ $0x02, AX
5329 JMP repeat_end_emit_encodeBlockAsm8B
5330 XORQ DI, DI
5331 LEAL 1(DI)(BX*4), BX
5332 MOVB SI, 1(AX)
5333 SARL $0x08, SI
5334 SHLL $0x05, SI
5335 ORL SI, BX
5336 MOVB BL, (AX)
5337 ADDQ $0x02, AX
5338 JMP repeat_end_emit_encodeBlockAsm8B
5339
5340two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
5341 MOVL BX, DI
5342 SHLL $0x02, DI
5343 CMPL BX, $0x0c
5344 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
5345 LEAL -15(DI), DI
5346 MOVB SI, 1(AX)
5347 SHRL $0x08, SI
5348 SHLL $0x05, SI
5349 ORL SI, DI
5350 MOVB DI, (AX)
5351 ADDQ $0x02, AX
5352 JMP repeat_end_emit_encodeBlockAsm8B
5353
5354emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
5355 LEAL -2(DI), DI
5356 MOVB DI, (AX)
5357 MOVW SI, 1(AX)
5358 ADDQ $0x03, AX
5359
5360repeat_end_emit_encodeBlockAsm8B:
5361 MOVL CX, 12(SP)
5362 JMP search_loop_encodeBlockAsm8B
5363
5364no_repeat_found_encodeBlockAsm8B:
5365 CMPL (DX)(BX*1), SI
5366 JEQ candidate_match_encodeBlockAsm8B
5367 SHRQ $0x08, SI
5368 MOVL 24(SP)(R9*4), BX
5369 LEAL 2(CX), R8
5370 CMPL (DX)(DI*1), SI
5371 JEQ candidate2_match_encodeBlockAsm8B
5372 MOVL R8, 24(SP)(R9*4)
5373 SHRQ $0x08, SI
5374 CMPL (DX)(BX*1), SI
5375 JEQ candidate3_match_encodeBlockAsm8B
5376 MOVL 20(SP), CX
5377 JMP search_loop_encodeBlockAsm8B
5378
5379candidate3_match_encodeBlockAsm8B:
5380 ADDL $0x02, CX
5381 JMP candidate_match_encodeBlockAsm8B
5382
5383candidate2_match_encodeBlockAsm8B:
5384 MOVL R8, 24(SP)(R9*4)
5385 INCL CX
5386 MOVL DI, BX
5387
5388candidate_match_encodeBlockAsm8B:
5389 MOVL 12(SP), SI
5390 TESTL BX, BX
5391 JZ match_extend_back_end_encodeBlockAsm8B
5392
5393match_extend_back_loop_encodeBlockAsm8B:
5394 CMPL CX, SI
5395 JBE match_extend_back_end_encodeBlockAsm8B
5396 MOVB -1(DX)(BX*1), DI
5397 MOVB -1(DX)(CX*1), R8
5398 CMPB DI, R8
5399 JNE match_extend_back_end_encodeBlockAsm8B
5400 LEAL -1(CX), CX
5401 DECL BX
5402 JZ match_extend_back_end_encodeBlockAsm8B
5403 JMP match_extend_back_loop_encodeBlockAsm8B
5404
5405match_extend_back_end_encodeBlockAsm8B:
5406 MOVL CX, SI
5407 SUBL 12(SP), SI
5408 LEAQ 3(AX)(SI*1), SI
5409 CMPQ SI, (SP)
5410 JB match_dst_size_check_encodeBlockAsm8B
5411 MOVQ $0x00000000, ret+48(FP)
5412 RET
5413
5414match_dst_size_check_encodeBlockAsm8B:
5415 MOVL CX, SI
5416 MOVL 12(SP), DI
5417 CMPL DI, SI
5418 JEQ emit_literal_done_match_emit_encodeBlockAsm8B
5419 MOVL SI, R8
5420 MOVL SI, 12(SP)
5421 LEAQ (DX)(DI*1), SI
5422 SUBL DI, R8
5423 LEAL -1(R8), DI
5424 CMPL DI, $0x3c
5425 JB one_byte_match_emit_encodeBlockAsm8B
5426 CMPL DI, $0x00000100
5427 JB two_bytes_match_emit_encodeBlockAsm8B
5428 JB three_bytes_match_emit_encodeBlockAsm8B
5429
5430three_bytes_match_emit_encodeBlockAsm8B:
5431 MOVB $0xf4, (AX)
5432 MOVW DI, 1(AX)
5433 ADDQ $0x03, AX
5434 JMP memmove_long_match_emit_encodeBlockAsm8B
5435
5436two_bytes_match_emit_encodeBlockAsm8B:
5437 MOVB $0xf0, (AX)
5438 MOVB DI, 1(AX)
5439 ADDQ $0x02, AX
5440 CMPL DI, $0x40
5441 JB memmove_match_emit_encodeBlockAsm8B
5442 JMP memmove_long_match_emit_encodeBlockAsm8B
5443
5444one_byte_match_emit_encodeBlockAsm8B:
5445 SHLB $0x02, DI
5446 MOVB DI, (AX)
5447 ADDQ $0x01, AX
5448
5449memmove_match_emit_encodeBlockAsm8B:
5450 LEAQ (AX)(R8*1), DI
5451
5452 // genMemMoveShort
5453 CMPQ R8, $0x08
5454 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
5455 CMPQ R8, $0x10
5456 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
5457 CMPQ R8, $0x20
5458 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
5459 JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
5460
5461emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
5462 MOVQ (SI), R9
5463 MOVQ R9, (AX)
5464 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5465
5466emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
5467 MOVQ (SI), R9
5468 MOVQ -8(SI)(R8*1), SI
5469 MOVQ R9, (AX)
5470 MOVQ SI, -8(AX)(R8*1)
5471 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5472
5473emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
5474 MOVOU (SI), X0
5475 MOVOU -16(SI)(R8*1), X1
5476 MOVOU X0, (AX)
5477 MOVOU X1, -16(AX)(R8*1)
5478 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5479
5480emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
5481 MOVOU (SI), X0
5482 MOVOU 16(SI), X1
5483 MOVOU -32(SI)(R8*1), X2
5484 MOVOU -16(SI)(R8*1), X3
5485 MOVOU X0, (AX)
5486 MOVOU X1, 16(AX)
5487 MOVOU X2, -32(AX)(R8*1)
5488 MOVOU X3, -16(AX)(R8*1)
5489
5490memmove_end_copy_match_emit_encodeBlockAsm8B:
5491 MOVQ DI, AX
5492 JMP emit_literal_done_match_emit_encodeBlockAsm8B
5493
5494memmove_long_match_emit_encodeBlockAsm8B:
5495 LEAQ (AX)(R8*1), DI
5496
5497 // genMemMoveLong
5498 MOVOU (SI), X0
5499 MOVOU 16(SI), X1
5500 MOVOU -32(SI)(R8*1), X2
5501 MOVOU -16(SI)(R8*1), X3
5502 MOVQ R8, R10
5503 SHRQ $0x05, R10
5504 MOVQ AX, R9
5505 ANDL $0x0000001f, R9
5506 MOVQ $0x00000040, R11
5507 SUBQ R9, R11
5508 DECQ R10
5509 JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5510 LEAQ -32(SI)(R11*1), R9
5511 LEAQ -32(AX)(R11*1), R12
5512
5513emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
5514 MOVOU (R9), X4
5515 MOVOU 16(R9), X5
5516 MOVOA X4, (R12)
5517 MOVOA X5, 16(R12)
5518 ADDQ $0x20, R12
5519 ADDQ $0x20, R9
5520 ADDQ $0x20, R11
5521 DECQ R10
5522 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
5523
5524emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
5525 MOVOU -32(SI)(R11*1), X4
5526 MOVOU -16(SI)(R11*1), X5
5527 MOVOA X4, -32(AX)(R11*1)
5528 MOVOA X5, -16(AX)(R11*1)
5529 ADDQ $0x20, R11
5530 CMPQ R8, R11
5531 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5532 MOVOU X0, (AX)
5533 MOVOU X1, 16(AX)
5534 MOVOU X2, -32(AX)(R8*1)
5535 MOVOU X3, -16(AX)(R8*1)
5536 MOVQ DI, AX
5537
5538emit_literal_done_match_emit_encodeBlockAsm8B:
5539match_nolit_loop_encodeBlockAsm8B:
5540 MOVL CX, SI
5541 SUBL BX, SI
5542 MOVL SI, 16(SP)
5543 ADDL $0x04, CX
5544 ADDL $0x04, BX
5545 MOVQ src_len+32(FP), SI
5546 SUBL CX, SI
5547 LEAQ (DX)(CX*1), DI
5548 LEAQ (DX)(BX*1), BX
5549
5550 // matchLen
5551 XORL R9, R9
5552
5553matchlen_loopback_16_match_nolit_encodeBlockAsm8B:
5554 CMPL SI, $0x10
5555 JB matchlen_match8_match_nolit_encodeBlockAsm8B
5556 MOVQ (DI)(R9*1), R8
5557 MOVQ 8(DI)(R9*1), R10
5558 XORQ (BX)(R9*1), R8
5559 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
5560 XORQ 8(BX)(R9*1), R10
5561 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B
5562 LEAL -16(SI), SI
5563 LEAL 16(R9), R9
5564 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B
5565
5566matchlen_bsf_16match_nolit_encodeBlockAsm8B:
5567#ifdef GOAMD64_v3
5568 TZCNTQ R10, R10
5569
5570#else
5571 BSFQ R10, R10
5572
5573#endif
5574 SARQ $0x03, R10
5575 LEAL 8(R9)(R10*1), R9
5576 JMP match_nolit_end_encodeBlockAsm8B
5577
5578matchlen_match8_match_nolit_encodeBlockAsm8B:
5579 CMPL SI, $0x08
5580 JB matchlen_match4_match_nolit_encodeBlockAsm8B
5581 MOVQ (DI)(R9*1), R8
5582 XORQ (BX)(R9*1), R8
5583 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
5584 LEAL -8(SI), SI
5585 LEAL 8(R9), R9
5586 JMP matchlen_match4_match_nolit_encodeBlockAsm8B
5587
5588matchlen_bsf_8_match_nolit_encodeBlockAsm8B:
5589#ifdef GOAMD64_v3
5590 TZCNTQ R8, R8
5591
5592#else
5593 BSFQ R8, R8
5594
5595#endif
5596 SARQ $0x03, R8
5597 LEAL (R9)(R8*1), R9
5598 JMP match_nolit_end_encodeBlockAsm8B
5599
5600matchlen_match4_match_nolit_encodeBlockAsm8B:
5601 CMPL SI, $0x04
5602 JB matchlen_match2_match_nolit_encodeBlockAsm8B
5603 MOVL (DI)(R9*1), R8
5604 CMPL (BX)(R9*1), R8
5605 JNE matchlen_match2_match_nolit_encodeBlockAsm8B
5606 LEAL -4(SI), SI
5607 LEAL 4(R9), R9
5608
5609matchlen_match2_match_nolit_encodeBlockAsm8B:
5610 CMPL SI, $0x01
5611 JE matchlen_match1_match_nolit_encodeBlockAsm8B
5612 JB match_nolit_end_encodeBlockAsm8B
5613 MOVW (DI)(R9*1), R8
5614 CMPW (BX)(R9*1), R8
5615 JNE matchlen_match1_match_nolit_encodeBlockAsm8B
5616 LEAL 2(R9), R9
5617 SUBL $0x02, SI
5618 JZ match_nolit_end_encodeBlockAsm8B
5619
5620matchlen_match1_match_nolit_encodeBlockAsm8B:
5621 MOVB (DI)(R9*1), R8
5622 CMPB (BX)(R9*1), R8
5623 JNE match_nolit_end_encodeBlockAsm8B
5624 LEAL 1(R9), R9
5625
5626match_nolit_end_encodeBlockAsm8B:
5627 ADDL R9, CX
5628 MOVL 16(SP), BX
5629 ADDL $0x04, R9
5630 MOVL CX, 12(SP)
5631
5632 // emitCopy
5633 CMPL R9, $0x40
5634 JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B
5635 CMPL BX, $0x00000800
5636 JAE long_offset_short_match_nolit_encodeBlockAsm8B
5637 MOVL $0x00000001, SI
5638 LEAL 16(SI), SI
5639 MOVB BL, 1(AX)
5640 SHRL $0x08, BX
5641 SHLL $0x05, BX
5642 ORL BX, SI
5643 MOVB SI, (AX)
5644 ADDQ $0x02, AX
5645 SUBL $0x08, R9
5646
5647 // emitRepeat
5648 LEAL -4(R9), R9
5649 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5650 MOVL R9, BX
5651 LEAL -4(R9), R9
5652 CMPL BX, $0x08
5653 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5654 CMPL BX, $0x0c
5655 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5656
5657cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5658 CMPL R9, $0x00000104
5659 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5660 LEAL -256(R9), R9
5661 MOVW $0x0019, (AX)
5662 MOVW R9, 2(AX)
5663 ADDQ $0x04, AX
5664 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5665
5666repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5667 LEAL -4(R9), R9
5668 MOVW $0x0015, (AX)
5669 MOVB R9, 2(AX)
5670 ADDQ $0x03, AX
5671 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5672
5673repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5674 SHLL $0x02, R9
5675 ORL $0x01, R9
5676 MOVW R9, (AX)
5677 ADDQ $0x02, AX
5678 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5679 XORQ SI, SI
5680 LEAL 1(SI)(R9*4), R9
5681 MOVB BL, 1(AX)
5682 SARL $0x08, BX
5683 SHLL $0x05, BX
5684 ORL BX, R9
5685 MOVB R9, (AX)
5686 ADDQ $0x02, AX
5687 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5688
5689long_offset_short_match_nolit_encodeBlockAsm8B:
5690 MOVB $0xee, (AX)
5691 MOVW BX, 1(AX)
5692 LEAL -60(R9), R9
5693 ADDQ $0x03, AX
5694
5695 // emitRepeat
5696 MOVL R9, BX
5697 LEAL -4(R9), R9
5698 CMPL BX, $0x08
5699 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
5700 CMPL BX, $0x0c
5701 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
5702
5703cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
5704 CMPL R9, $0x00000104
5705 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
5706 LEAL -256(R9), R9
5707 MOVW $0x0019, (AX)
5708 MOVW R9, 2(AX)
5709 ADDQ $0x04, AX
5710 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5711
5712repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
5713 LEAL -4(R9), R9
5714 MOVW $0x0015, (AX)
5715 MOVB R9, 2(AX)
5716 ADDQ $0x03, AX
5717 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5718
5719repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
5720 SHLL $0x02, R9
5721 ORL $0x01, R9
5722 MOVW R9, (AX)
5723 ADDQ $0x02, AX
5724 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5725 XORQ SI, SI
5726 LEAL 1(SI)(R9*4), R9
5727 MOVB BL, 1(AX)
5728 SARL $0x08, BX
5729 SHLL $0x05, BX
5730 ORL BX, R9
5731 MOVB R9, (AX)
5732 ADDQ $0x02, AX
5733 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5734
5735two_byte_offset_short_match_nolit_encodeBlockAsm8B:
5736 MOVL R9, SI
5737 SHLL $0x02, SI
5738 CMPL R9, $0x0c
5739 JAE emit_copy_three_match_nolit_encodeBlockAsm8B
5740 LEAL -15(SI), SI
5741 MOVB BL, 1(AX)
5742 SHRL $0x08, BX
5743 SHLL $0x05, BX
5744 ORL BX, SI
5745 MOVB SI, (AX)
5746 ADDQ $0x02, AX
5747 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5748
5749emit_copy_three_match_nolit_encodeBlockAsm8B:
5750 LEAL -2(SI), SI
5751 MOVB SI, (AX)
5752 MOVW BX, 1(AX)
5753 ADDQ $0x03, AX
5754
5755match_nolit_emitcopy_end_encodeBlockAsm8B:
5756 CMPL CX, 8(SP)
5757 JAE emit_remainder_encodeBlockAsm8B
5758 MOVQ -2(DX)(CX*1), SI
5759 CMPQ AX, (SP)
5760 JB match_nolit_dst_ok_encodeBlockAsm8B
5761 MOVQ $0x00000000, ret+48(FP)
5762 RET
5763
5764match_nolit_dst_ok_encodeBlockAsm8B:
5765 MOVQ $0x9e3779b1, R8
5766 MOVQ SI, DI
5767 SHRQ $0x10, SI
5768 MOVQ SI, BX
5769 SHLQ $0x20, DI
5770 IMULQ R8, DI
5771 SHRQ $0x38, DI
5772 SHLQ $0x20, BX
5773 IMULQ R8, BX
5774 SHRQ $0x38, BX
5775 LEAL -2(CX), R8
5776 LEAQ 24(SP)(BX*4), R9
5777 MOVL (R9), BX
5778 MOVL R8, 24(SP)(DI*4)
5779 MOVL CX, (R9)
5780 CMPL (DX)(BX*1), SI
5781 JEQ match_nolit_loop_encodeBlockAsm8B
5782 INCL CX
5783 JMP search_loop_encodeBlockAsm8B
5784
5785emit_remainder_encodeBlockAsm8B:
5786 MOVQ src_len+32(FP), CX
5787 SUBL 12(SP), CX
5788 LEAQ 3(AX)(CX*1), CX
5789 CMPQ CX, (SP)
5790 JB emit_remainder_ok_encodeBlockAsm8B
5791 MOVQ $0x00000000, ret+48(FP)
5792 RET
5793
5794emit_remainder_ok_encodeBlockAsm8B:
5795 MOVQ src_len+32(FP), CX
5796 MOVL 12(SP), BX
5797 CMPL BX, CX
5798 JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
5799 MOVL CX, SI
5800 MOVL CX, 12(SP)
5801 LEAQ (DX)(BX*1), CX
5802 SUBL BX, SI
5803 LEAL -1(SI), DX
5804 CMPL DX, $0x3c
5805 JB one_byte_emit_remainder_encodeBlockAsm8B
5806 CMPL DX, $0x00000100
5807 JB two_bytes_emit_remainder_encodeBlockAsm8B
5808 JB three_bytes_emit_remainder_encodeBlockAsm8B
5809
5810three_bytes_emit_remainder_encodeBlockAsm8B:
5811 MOVB $0xf4, (AX)
5812 MOVW DX, 1(AX)
5813 ADDQ $0x03, AX
5814 JMP memmove_long_emit_remainder_encodeBlockAsm8B
5815
5816two_bytes_emit_remainder_encodeBlockAsm8B:
5817 MOVB $0xf0, (AX)
5818 MOVB DL, 1(AX)
5819 ADDQ $0x02, AX
5820 CMPL DX, $0x40
5821 JB memmove_emit_remainder_encodeBlockAsm8B
5822 JMP memmove_long_emit_remainder_encodeBlockAsm8B
5823
5824one_byte_emit_remainder_encodeBlockAsm8B:
5825 SHLB $0x02, DL
5826 MOVB DL, (AX)
5827 ADDQ $0x01, AX
5828
5829memmove_emit_remainder_encodeBlockAsm8B:
5830 LEAQ (AX)(SI*1), DX
5831 MOVL SI, BX
5832
5833 // genMemMoveShort
5834 CMPQ BX, $0x03
5835 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
5836 JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
5837 CMPQ BX, $0x08
5838 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
5839 CMPQ BX, $0x10
5840 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
5841 CMPQ BX, $0x20
5842 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
5843 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
5844
5845emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
5846 MOVB (CX), SI
5847 MOVB -1(CX)(BX*1), CL
5848 MOVB SI, (AX)
5849 MOVB CL, -1(AX)(BX*1)
5850 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5851
5852emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
5853 MOVW (CX), SI
5854 MOVB 2(CX), CL
5855 MOVW SI, (AX)
5856 MOVB CL, 2(AX)
5857 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5858
5859emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
5860 MOVL (CX), SI
5861 MOVL -4(CX)(BX*1), CX
5862 MOVL SI, (AX)
5863 MOVL CX, -4(AX)(BX*1)
5864 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5865
5866emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
5867 MOVQ (CX), SI
5868 MOVQ -8(CX)(BX*1), CX
5869 MOVQ SI, (AX)
5870 MOVQ CX, -8(AX)(BX*1)
5871 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5872
5873emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
5874 MOVOU (CX), X0
5875 MOVOU -16(CX)(BX*1), X1
5876 MOVOU X0, (AX)
5877 MOVOU X1, -16(AX)(BX*1)
5878 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5879
5880emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
5881 MOVOU (CX), X0
5882 MOVOU 16(CX), X1
5883 MOVOU -32(CX)(BX*1), X2
5884 MOVOU -16(CX)(BX*1), X3
5885 MOVOU X0, (AX)
5886 MOVOU X1, 16(AX)
5887 MOVOU X2, -32(AX)(BX*1)
5888 MOVOU X3, -16(AX)(BX*1)
5889
5890memmove_end_copy_emit_remainder_encodeBlockAsm8B:
5891 MOVQ DX, AX
5892 JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
5893
5894memmove_long_emit_remainder_encodeBlockAsm8B:
5895 LEAQ (AX)(SI*1), DX
5896 MOVL SI, BX
5897
5898 // genMemMoveLong
5899 MOVOU (CX), X0
5900 MOVOU 16(CX), X1
5901 MOVOU -32(CX)(BX*1), X2
5902 MOVOU -16(CX)(BX*1), X3
5903 MOVQ BX, DI
5904 SHRQ $0x05, DI
5905 MOVQ AX, SI
5906 ANDL $0x0000001f, SI
5907 MOVQ $0x00000040, R8
5908 SUBQ SI, R8
5909 DECQ DI
5910 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5911 LEAQ -32(CX)(R8*1), SI
5912 LEAQ -32(AX)(R8*1), R9
5913
5914emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
5915 MOVOU (SI), X4
5916 MOVOU 16(SI), X5
5917 MOVOA X4, (R9)
5918 MOVOA X5, 16(R9)
5919 ADDQ $0x20, R9
5920 ADDQ $0x20, SI
5921 ADDQ $0x20, R8
5922 DECQ DI
5923 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
5924
5925emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
5926 MOVOU -32(CX)(R8*1), X4
5927 MOVOU -16(CX)(R8*1), X5
5928 MOVOA X4, -32(AX)(R8*1)
5929 MOVOA X5, -16(AX)(R8*1)
5930 ADDQ $0x20, R8
5931 CMPQ BX, R8
5932 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5933 MOVOU X0, (AX)
5934 MOVOU X1, 16(AX)
5935 MOVOU X2, -32(AX)(BX*1)
5936 MOVOU X3, -16(AX)(BX*1)
5937 MOVQ DX, AX
5938
5939emit_literal_done_emit_remainder_encodeBlockAsm8B:
5940 MOVQ dst_base+0(FP), CX
5941 SUBQ CX, AX
5942 MOVQ AX, ret+48(FP)
5943 RET
5944
5945// func encodeBetterBlockAsm(dst []byte, src []byte) int
5946// Requires: BMI, SSE2
5947TEXT ·encodeBetterBlockAsm(SB), $589848-56
5948 MOVQ dst_base+0(FP), AX
5949 MOVQ $0x00001200, CX
5950 LEAQ 24(SP), DX
5951 PXOR X0, X0
5952
5953zero_loop_encodeBetterBlockAsm:
5954 MOVOU X0, (DX)
5955 MOVOU X0, 16(DX)
5956 MOVOU X0, 32(DX)
5957 MOVOU X0, 48(DX)
5958 MOVOU X0, 64(DX)
5959 MOVOU X0, 80(DX)
5960 MOVOU X0, 96(DX)
5961 MOVOU X0, 112(DX)
5962 ADDQ $0x80, DX
5963 DECQ CX
5964 JNZ zero_loop_encodeBetterBlockAsm
5965 MOVL $0x00000000, 12(SP)
5966 MOVQ src_len+32(FP), CX
5967 LEAQ -6(CX), DX
5968 LEAQ -8(CX), BX
5969 MOVL BX, 8(SP)
5970 SHRQ $0x05, CX
5971 SUBL CX, DX
5972 LEAQ (AX)(DX*1), DX
5973 MOVQ DX, (SP)
5974 MOVL $0x00000001, CX
5975 MOVL $0x00000000, 16(SP)
5976 MOVQ src_base+24(FP), DX
5977
5978search_loop_encodeBetterBlockAsm:
5979 MOVL CX, BX
5980 SUBL 12(SP), BX
5981 SHRL $0x07, BX
5982 CMPL BX, $0x63
5983 JBE check_maxskip_ok_encodeBetterBlockAsm
5984 LEAL 100(CX), BX
5985 JMP check_maxskip_cont_encodeBetterBlockAsm
5986
5987check_maxskip_ok_encodeBetterBlockAsm:
5988 LEAL 1(CX)(BX*1), BX
5989
5990check_maxskip_cont_encodeBetterBlockAsm:
5991 CMPL BX, 8(SP)
5992 JAE emit_remainder_encodeBetterBlockAsm
5993 MOVQ (DX)(CX*1), SI
5994 MOVL BX, 20(SP)
5995 MOVQ $0x00cf1bbcdcbfa563, R8
5996 MOVQ $0x9e3779b1, BX
5997 MOVQ SI, R9
5998 MOVQ SI, R10
5999 SHLQ $0x08, R9
6000 IMULQ R8, R9
6001 SHRQ $0x2f, R9
6002 SHLQ $0x20, R10
6003 IMULQ BX, R10
6004 SHRQ $0x32, R10
6005 MOVL 24(SP)(R9*4), BX
6006 MOVL 524312(SP)(R10*4), DI
6007 MOVL CX, 24(SP)(R9*4)
6008 MOVL CX, 524312(SP)(R10*4)
6009 MOVQ (DX)(BX*1), R9
6010 MOVQ (DX)(DI*1), R10
6011 CMPQ R9, SI
6012 JEQ candidate_match_encodeBetterBlockAsm
6013 CMPQ R10, SI
6014 JNE no_short_found_encodeBetterBlockAsm
6015 MOVL DI, BX
6016 JMP candidate_match_encodeBetterBlockAsm
6017
6018no_short_found_encodeBetterBlockAsm:
6019 CMPL R9, SI
6020 JEQ candidate_match_encodeBetterBlockAsm
6021 CMPL R10, SI
6022 JEQ candidateS_match_encodeBetterBlockAsm
6023 MOVL 20(SP), CX
6024 JMP search_loop_encodeBetterBlockAsm
6025
6026candidateS_match_encodeBetterBlockAsm:
6027 SHRQ $0x08, SI
6028 MOVQ SI, R9
6029 SHLQ $0x08, R9
6030 IMULQ R8, R9
6031 SHRQ $0x2f, R9
6032 MOVL 24(SP)(R9*4), BX
6033 INCL CX
6034 MOVL CX, 24(SP)(R9*4)
6035 CMPL (DX)(BX*1), SI
6036 JEQ candidate_match_encodeBetterBlockAsm
6037 DECL CX
6038 MOVL DI, BX
6039
6040candidate_match_encodeBetterBlockAsm:
6041 MOVL 12(SP), SI
6042 TESTL BX, BX
6043 JZ match_extend_back_end_encodeBetterBlockAsm
6044
6045match_extend_back_loop_encodeBetterBlockAsm:
6046 CMPL CX, SI
6047 JBE match_extend_back_end_encodeBetterBlockAsm
6048 MOVB -1(DX)(BX*1), DI
6049 MOVB -1(DX)(CX*1), R8
6050 CMPB DI, R8
6051 JNE match_extend_back_end_encodeBetterBlockAsm
6052 LEAL -1(CX), CX
6053 DECL BX
6054 JZ match_extend_back_end_encodeBetterBlockAsm
6055 JMP match_extend_back_loop_encodeBetterBlockAsm
6056
6057match_extend_back_end_encodeBetterBlockAsm:
6058 MOVL CX, SI
6059 SUBL 12(SP), SI
6060 LEAQ 5(AX)(SI*1), SI
6061 CMPQ SI, (SP)
6062 JB match_dst_size_check_encodeBetterBlockAsm
6063 MOVQ $0x00000000, ret+48(FP)
6064 RET
6065
6066match_dst_size_check_encodeBetterBlockAsm:
6067 MOVL CX, SI
6068 ADDL $0x04, CX
6069 ADDL $0x04, BX
6070 MOVQ src_len+32(FP), DI
6071 SUBL CX, DI
6072 LEAQ (DX)(CX*1), R8
6073 LEAQ (DX)(BX*1), R9
6074
6075 // matchLen
6076 XORL R11, R11
6077
6078matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
6079 CMPL DI, $0x10
6080 JB matchlen_match8_match_nolit_encodeBetterBlockAsm
6081 MOVQ (R8)(R11*1), R10
6082 MOVQ 8(R8)(R11*1), R12
6083 XORQ (R9)(R11*1), R10
6084 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
6085 XORQ 8(R9)(R11*1), R12
6086 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm
6087 LEAL -16(DI), DI
6088 LEAL 16(R11), R11
6089 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm
6090
6091matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
6092#ifdef GOAMD64_v3
6093 TZCNTQ R12, R12
6094
6095#else
6096 BSFQ R12, R12
6097
6098#endif
6099 SARQ $0x03, R12
6100 LEAL 8(R11)(R12*1), R11
6101 JMP match_nolit_end_encodeBetterBlockAsm
6102
6103matchlen_match8_match_nolit_encodeBetterBlockAsm:
6104 CMPL DI, $0x08
6105 JB matchlen_match4_match_nolit_encodeBetterBlockAsm
6106 MOVQ (R8)(R11*1), R10
6107 XORQ (R9)(R11*1), R10
6108 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
6109 LEAL -8(DI), DI
6110 LEAL 8(R11), R11
6111 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm
6112
6113matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
6114#ifdef GOAMD64_v3
6115 TZCNTQ R10, R10
6116
6117#else
6118 BSFQ R10, R10
6119
6120#endif
6121 SARQ $0x03, R10
6122 LEAL (R11)(R10*1), R11
6123 JMP match_nolit_end_encodeBetterBlockAsm
6124
6125matchlen_match4_match_nolit_encodeBetterBlockAsm:
6126 CMPL DI, $0x04
6127 JB matchlen_match2_match_nolit_encodeBetterBlockAsm
6128 MOVL (R8)(R11*1), R10
6129 CMPL (R9)(R11*1), R10
6130 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
6131 LEAL -4(DI), DI
6132 LEAL 4(R11), R11
6133
6134matchlen_match2_match_nolit_encodeBetterBlockAsm:
6135 CMPL DI, $0x01
6136 JE matchlen_match1_match_nolit_encodeBetterBlockAsm
6137 JB match_nolit_end_encodeBetterBlockAsm
6138 MOVW (R8)(R11*1), R10
6139 CMPW (R9)(R11*1), R10
6140 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
6141 LEAL 2(R11), R11
6142 SUBL $0x02, DI
6143 JZ match_nolit_end_encodeBetterBlockAsm
6144
6145matchlen_match1_match_nolit_encodeBetterBlockAsm:
6146 MOVB (R8)(R11*1), R10
6147 CMPB (R9)(R11*1), R10
6148 JNE match_nolit_end_encodeBetterBlockAsm
6149 LEAL 1(R11), R11
6150
6151match_nolit_end_encodeBetterBlockAsm:
6152 MOVL CX, DI
6153 SUBL BX, DI
6154
6155 // Check if repeat
6156 CMPL 16(SP), DI
6157 JEQ match_is_repeat_encodeBetterBlockAsm
6158 CMPL R11, $0x01
6159 JA match_length_ok_encodeBetterBlockAsm
6160 CMPL DI, $0x0000ffff
6161 JBE match_length_ok_encodeBetterBlockAsm
6162 MOVL 20(SP), CX
6163 INCL CX
6164 JMP search_loop_encodeBetterBlockAsm
6165
6166match_length_ok_encodeBetterBlockAsm:
6167 MOVL DI, 16(SP)
6168 MOVL 12(SP), BX
6169 CMPL BX, SI
6170 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
6171 MOVL SI, R8
6172 MOVL SI, 12(SP)
6173 LEAQ (DX)(BX*1), R9
6174 SUBL BX, R8
6175 LEAL -1(R8), BX
6176 CMPL BX, $0x3c
6177 JB one_byte_match_emit_encodeBetterBlockAsm
6178 CMPL BX, $0x00000100
6179 JB two_bytes_match_emit_encodeBetterBlockAsm
6180 CMPL BX, $0x00010000
6181 JB three_bytes_match_emit_encodeBetterBlockAsm
6182 CMPL BX, $0x01000000
6183 JB four_bytes_match_emit_encodeBetterBlockAsm
6184 MOVB $0xfc, (AX)
6185 MOVL BX, 1(AX)
6186 ADDQ $0x05, AX
6187 JMP memmove_long_match_emit_encodeBetterBlockAsm
6188
6189four_bytes_match_emit_encodeBetterBlockAsm:
6190 MOVL BX, R10
6191 SHRL $0x10, R10
6192 MOVB $0xf8, (AX)
6193 MOVW BX, 1(AX)
6194 MOVB R10, 3(AX)
6195 ADDQ $0x04, AX
6196 JMP memmove_long_match_emit_encodeBetterBlockAsm
6197
6198three_bytes_match_emit_encodeBetterBlockAsm:
6199 MOVB $0xf4, (AX)
6200 MOVW BX, 1(AX)
6201 ADDQ $0x03, AX
6202 JMP memmove_long_match_emit_encodeBetterBlockAsm
6203
6204two_bytes_match_emit_encodeBetterBlockAsm:
6205 MOVB $0xf0, (AX)
6206 MOVB BL, 1(AX)
6207 ADDQ $0x02, AX
6208 CMPL BX, $0x40
6209 JB memmove_match_emit_encodeBetterBlockAsm
6210 JMP memmove_long_match_emit_encodeBetterBlockAsm
6211
6212one_byte_match_emit_encodeBetterBlockAsm:
6213 SHLB $0x02, BL
6214 MOVB BL, (AX)
6215 ADDQ $0x01, AX
6216
6217memmove_match_emit_encodeBetterBlockAsm:
6218 LEAQ (AX)(R8*1), BX
6219
6220 // genMemMoveShort
6221 CMPQ R8, $0x04
6222 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
6223 CMPQ R8, $0x08
6224 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
6225 CMPQ R8, $0x10
6226 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
6227 CMPQ R8, $0x20
6228 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
6229 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
6230
6231emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
6232 MOVL (R9), R10
6233 MOVL R10, (AX)
6234 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6235
6236emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
6237 MOVL (R9), R10
6238 MOVL -4(R9)(R8*1), R9
6239 MOVL R10, (AX)
6240 MOVL R9, -4(AX)(R8*1)
6241 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6242
6243emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
6244 MOVQ (R9), R10
6245 MOVQ -8(R9)(R8*1), R9
6246 MOVQ R10, (AX)
6247 MOVQ R9, -8(AX)(R8*1)
6248 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6249
6250emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
6251 MOVOU (R9), X0
6252 MOVOU -16(R9)(R8*1), X1
6253 MOVOU X0, (AX)
6254 MOVOU X1, -16(AX)(R8*1)
6255 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6256
6257emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
6258 MOVOU (R9), X0
6259 MOVOU 16(R9), X1
6260 MOVOU -32(R9)(R8*1), X2
6261 MOVOU -16(R9)(R8*1), X3
6262 MOVOU X0, (AX)
6263 MOVOU X1, 16(AX)
6264 MOVOU X2, -32(AX)(R8*1)
6265 MOVOU X3, -16(AX)(R8*1)
6266
6267memmove_end_copy_match_emit_encodeBetterBlockAsm:
6268 MOVQ BX, AX
6269 JMP emit_literal_done_match_emit_encodeBetterBlockAsm
6270
6271memmove_long_match_emit_encodeBetterBlockAsm:
6272 LEAQ (AX)(R8*1), BX
6273
6274 // genMemMoveLong
6275 MOVOU (R9), X0
6276 MOVOU 16(R9), X1
6277 MOVOU -32(R9)(R8*1), X2
6278 MOVOU -16(R9)(R8*1), X3
6279 MOVQ R8, R12
6280 SHRQ $0x05, R12
6281 MOVQ AX, R10
6282 ANDL $0x0000001f, R10
6283 MOVQ $0x00000040, R13
6284 SUBQ R10, R13
6285 DECQ R12
6286 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
6287 LEAQ -32(R9)(R13*1), R10
6288 LEAQ -32(AX)(R13*1), R14
6289
6290emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
6291 MOVOU (R10), X4
6292 MOVOU 16(R10), X5
6293 MOVOA X4, (R14)
6294 MOVOA X5, 16(R14)
6295 ADDQ $0x20, R14
6296 ADDQ $0x20, R10
6297 ADDQ $0x20, R13
6298 DECQ R12
6299 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
6300
6301emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
6302 MOVOU -32(R9)(R13*1), X4
6303 MOVOU -16(R9)(R13*1), X5
6304 MOVOA X4, -32(AX)(R13*1)
6305 MOVOA X5, -16(AX)(R13*1)
6306 ADDQ $0x20, R13
6307 CMPQ R8, R13
6308 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
6309 MOVOU X0, (AX)
6310 MOVOU X1, 16(AX)
6311 MOVOU X2, -32(AX)(R8*1)
6312 MOVOU X3, -16(AX)(R8*1)
6313 MOVQ BX, AX
6314
6315emit_literal_done_match_emit_encodeBetterBlockAsm:
6316 ADDL R11, CX
6317 ADDL $0x04, R11
6318 MOVL CX, 12(SP)
6319
6320 // emitCopy
6321 CMPL DI, $0x00010000
6322 JB two_byte_offset_match_nolit_encodeBetterBlockAsm
6323 CMPL R11, $0x40
6324 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm
6325 MOVB $0xff, (AX)
6326 MOVL DI, 1(AX)
6327 LEAL -64(R11), R11
6328 ADDQ $0x05, AX
6329 CMPL R11, $0x04
6330 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm
6331
6332 // emitRepeat
6333emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
6334 MOVL R11, BX
6335 LEAL -4(R11), R11
6336 CMPL BX, $0x08
6337 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
6338 CMPL BX, $0x0c
6339 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
6340 CMPL DI, $0x00000800
6341 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
6342
6343cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
6344 CMPL R11, $0x00000104
6345 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
6346 CMPL R11, $0x00010100
6347 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
6348 CMPL R11, $0x0100ffff
6349 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
6350 LEAL -16842747(R11), R11
6351 MOVL $0xfffb001d, (AX)
6352 MOVB $0xff, 4(AX)
6353 ADDQ $0x05, AX
6354 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
6355
6356repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
6357 LEAL -65536(R11), R11
6358 MOVL R11, DI
6359 MOVW $0x001d, (AX)
6360 MOVW R11, 2(AX)
6361 SARL $0x10, DI
6362 MOVB DI, 4(AX)
6363 ADDQ $0x05, AX
6364 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6365
6366repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
6367 LEAL -256(R11), R11
6368 MOVW $0x0019, (AX)
6369 MOVW R11, 2(AX)
6370 ADDQ $0x04, AX
6371 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6372
6373repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
6374 LEAL -4(R11), R11
6375 MOVW $0x0015, (AX)
6376 MOVB R11, 2(AX)
6377 ADDQ $0x03, AX
6378 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6379
6380repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
6381 SHLL $0x02, R11
6382 ORL $0x01, R11
6383 MOVW R11, (AX)
6384 ADDQ $0x02, AX
6385 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6386
6387repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
6388 XORQ BX, BX
6389 LEAL 1(BX)(R11*4), R11
6390 MOVB DI, 1(AX)
6391 SARL $0x08, DI
6392 SHLL $0x05, DI
6393 ORL DI, R11
6394 MOVB R11, (AX)
6395 ADDQ $0x02, AX
6396 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6397
6398four_bytes_remain_match_nolit_encodeBetterBlockAsm:
6399 TESTL R11, R11
6400 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
6401 XORL BX, BX
6402 LEAL -1(BX)(R11*4), R11
6403 MOVB R11, (AX)
6404 MOVL DI, 1(AX)
6405 ADDQ $0x05, AX
6406 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6407
6408two_byte_offset_match_nolit_encodeBetterBlockAsm:
6409 CMPL R11, $0x40
6410 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
6411 CMPL DI, $0x00000800
6412 JAE long_offset_short_match_nolit_encodeBetterBlockAsm
6413 MOVL $0x00000001, BX
6414 LEAL 16(BX), BX
6415 MOVB DI, 1(AX)
6416 MOVL DI, R8
6417 SHRL $0x08, R8
6418 SHLL $0x05, R8
6419 ORL R8, BX
6420 MOVB BL, (AX)
6421 ADDQ $0x02, AX
6422 SUBL $0x08, R11
6423
6424 // emitRepeat
6425 LEAL -4(R11), R11
6426 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6427
6428emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6429 MOVL R11, BX
6430 LEAL -4(R11), R11
6431 CMPL BX, $0x08
6432 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6433 CMPL BX, $0x0c
6434 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6435 CMPL DI, $0x00000800
6436 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6437
6438cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6439 CMPL R11, $0x00000104
6440 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6441 CMPL R11, $0x00010100
6442 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6443 CMPL R11, $0x0100ffff
6444 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6445 LEAL -16842747(R11), R11
6446 MOVL $0xfffb001d, (AX)
6447 MOVB $0xff, 4(AX)
6448 ADDQ $0x05, AX
6449 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6450
6451repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6452 LEAL -65536(R11), R11
6453 MOVL R11, DI
6454 MOVW $0x001d, (AX)
6455 MOVW R11, 2(AX)
6456 SARL $0x10, DI
6457 MOVB DI, 4(AX)
6458 ADDQ $0x05, AX
6459 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6460
6461repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6462 LEAL -256(R11), R11
6463 MOVW $0x0019, (AX)
6464 MOVW R11, 2(AX)
6465 ADDQ $0x04, AX
6466 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6467
6468repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6469 LEAL -4(R11), R11
6470 MOVW $0x0015, (AX)
6471 MOVB R11, 2(AX)
6472 ADDQ $0x03, AX
6473 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6474
6475repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6476 SHLL $0x02, R11
6477 ORL $0x01, R11
6478 MOVW R11, (AX)
6479 ADDQ $0x02, AX
6480 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6481
6482repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6483 XORQ BX, BX
6484 LEAL 1(BX)(R11*4), R11
6485 MOVB DI, 1(AX)
6486 SARL $0x08, DI
6487 SHLL $0x05, DI
6488 ORL DI, R11
6489 MOVB R11, (AX)
6490 ADDQ $0x02, AX
6491 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6492
6493long_offset_short_match_nolit_encodeBetterBlockAsm:
6494 MOVB $0xee, (AX)
6495 MOVW DI, 1(AX)
6496 LEAL -60(R11), R11
6497 ADDQ $0x03, AX
6498
6499 // emitRepeat
6500emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6501 MOVL R11, BX
6502 LEAL -4(R11), R11
6503 CMPL BX, $0x08
6504 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
6505 CMPL BX, $0x0c
6506 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
6507 CMPL DI, $0x00000800
6508 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
6509
6510cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6511 CMPL R11, $0x00000104
6512 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
6513 CMPL R11, $0x00010100
6514 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
6515 CMPL R11, $0x0100ffff
6516 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
6517 LEAL -16842747(R11), R11
6518 MOVL $0xfffb001d, (AX)
6519 MOVB $0xff, 4(AX)
6520 ADDQ $0x05, AX
6521 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
6522
6523repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6524 LEAL -65536(R11), R11
6525 MOVL R11, DI
6526 MOVW $0x001d, (AX)
6527 MOVW R11, 2(AX)
6528 SARL $0x10, DI
6529 MOVB DI, 4(AX)
6530 ADDQ $0x05, AX
6531 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6532
6533repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6534 LEAL -256(R11), R11
6535 MOVW $0x0019, (AX)
6536 MOVW R11, 2(AX)
6537 ADDQ $0x04, AX
6538 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6539
6540repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6541 LEAL -4(R11), R11
6542 MOVW $0x0015, (AX)
6543 MOVB R11, 2(AX)
6544 ADDQ $0x03, AX
6545 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6546
6547repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6548 SHLL $0x02, R11
6549 ORL $0x01, R11
6550 MOVW R11, (AX)
6551 ADDQ $0x02, AX
6552 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6553
6554repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6555 XORQ BX, BX
6556 LEAL 1(BX)(R11*4), R11
6557 MOVB DI, 1(AX)
6558 SARL $0x08, DI
6559 SHLL $0x05, DI
6560 ORL DI, R11
6561 MOVB R11, (AX)
6562 ADDQ $0x02, AX
6563 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6564
6565two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
6566 MOVL R11, BX
6567 SHLL $0x02, BX
6568 CMPL R11, $0x0c
6569 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
6570 CMPL DI, $0x00000800
6571 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
6572 LEAL -15(BX), BX
6573 MOVB DI, 1(AX)
6574 SHRL $0x08, DI
6575 SHLL $0x05, DI
6576 ORL DI, BX
6577 MOVB BL, (AX)
6578 ADDQ $0x02, AX
6579 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6580
6581emit_copy_three_match_nolit_encodeBetterBlockAsm:
6582 LEAL -2(BX), BX
6583 MOVB BL, (AX)
6584 MOVW DI, 1(AX)
6585 ADDQ $0x03, AX
6586 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6587
6588match_is_repeat_encodeBetterBlockAsm:
6589 MOVL 12(SP), BX
6590 CMPL BX, SI
6591 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
6592 MOVL SI, R8
6593 MOVL SI, 12(SP)
6594 LEAQ (DX)(BX*1), R9
6595 SUBL BX, R8
6596 LEAL -1(R8), BX
6597 CMPL BX, $0x3c
6598 JB one_byte_match_emit_repeat_encodeBetterBlockAsm
6599 CMPL BX, $0x00000100
6600 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm
6601 CMPL BX, $0x00010000
6602 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm
6603 CMPL BX, $0x01000000
6604 JB four_bytes_match_emit_repeat_encodeBetterBlockAsm
6605 MOVB $0xfc, (AX)
6606 MOVL BX, 1(AX)
6607 ADDQ $0x05, AX
6608 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6609
6610four_bytes_match_emit_repeat_encodeBetterBlockAsm:
6611 MOVL BX, R10
6612 SHRL $0x10, R10
6613 MOVB $0xf8, (AX)
6614 MOVW BX, 1(AX)
6615 MOVB R10, 3(AX)
6616 ADDQ $0x04, AX
6617 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6618
6619three_bytes_match_emit_repeat_encodeBetterBlockAsm:
6620 MOVB $0xf4, (AX)
6621 MOVW BX, 1(AX)
6622 ADDQ $0x03, AX
6623 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6624
6625two_bytes_match_emit_repeat_encodeBetterBlockAsm:
6626 MOVB $0xf0, (AX)
6627 MOVB BL, 1(AX)
6628 ADDQ $0x02, AX
6629 CMPL BX, $0x40
6630 JB memmove_match_emit_repeat_encodeBetterBlockAsm
6631 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6632
6633one_byte_match_emit_repeat_encodeBetterBlockAsm:
6634 SHLB $0x02, BL
6635 MOVB BL, (AX)
6636 ADDQ $0x01, AX
6637
6638memmove_match_emit_repeat_encodeBetterBlockAsm:
6639 LEAQ (AX)(R8*1), BX
6640
6641 // genMemMoveShort
6642 CMPQ R8, $0x04
6643 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
6644 CMPQ R8, $0x08
6645 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
6646 CMPQ R8, $0x10
6647 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
6648 CMPQ R8, $0x20
6649 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
6650 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
6651
6652emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
6653 MOVL (R9), R10
6654 MOVL R10, (AX)
6655 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6656
6657emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
6658 MOVL (R9), R10
6659 MOVL -4(R9)(R8*1), R9
6660 MOVL R10, (AX)
6661 MOVL R9, -4(AX)(R8*1)
6662 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6663
6664emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
6665 MOVQ (R9), R10
6666 MOVQ -8(R9)(R8*1), R9
6667 MOVQ R10, (AX)
6668 MOVQ R9, -8(AX)(R8*1)
6669 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6670
6671emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
6672 MOVOU (R9), X0
6673 MOVOU -16(R9)(R8*1), X1
6674 MOVOU X0, (AX)
6675 MOVOU X1, -16(AX)(R8*1)
6676 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6677
6678emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
6679 MOVOU (R9), X0
6680 MOVOU 16(R9), X1
6681 MOVOU -32(R9)(R8*1), X2
6682 MOVOU -16(R9)(R8*1), X3
6683 MOVOU X0, (AX)
6684 MOVOU X1, 16(AX)
6685 MOVOU X2, -32(AX)(R8*1)
6686 MOVOU X3, -16(AX)(R8*1)
6687
6688memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
6689 MOVQ BX, AX
6690 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
6691
6692memmove_long_match_emit_repeat_encodeBetterBlockAsm:
6693 LEAQ (AX)(R8*1), BX
6694
6695 // genMemMoveLong
6696 MOVOU (R9), X0
6697 MOVOU 16(R9), X1
6698 MOVOU -32(R9)(R8*1), X2
6699 MOVOU -16(R9)(R8*1), X3
6700 MOVQ R8, R12
6701 SHRQ $0x05, R12
6702 MOVQ AX, R10
6703 ANDL $0x0000001f, R10
6704 MOVQ $0x00000040, R13
6705 SUBQ R10, R13
6706 DECQ R12
6707 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
6708 LEAQ -32(R9)(R13*1), R10
6709 LEAQ -32(AX)(R13*1), R14
6710
6711emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
6712 MOVOU (R10), X4
6713 MOVOU 16(R10), X5
6714 MOVOA X4, (R14)
6715 MOVOA X5, 16(R14)
6716 ADDQ $0x20, R14
6717 ADDQ $0x20, R10
6718 ADDQ $0x20, R13
6719 DECQ R12
6720 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
6721
6722emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
6723 MOVOU -32(R9)(R13*1), X4
6724 MOVOU -16(R9)(R13*1), X5
6725 MOVOA X4, -32(AX)(R13*1)
6726 MOVOA X5, -16(AX)(R13*1)
6727 ADDQ $0x20, R13
6728 CMPQ R8, R13
6729 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
6730 MOVOU X0, (AX)
6731 MOVOU X1, 16(AX)
6732 MOVOU X2, -32(AX)(R8*1)
6733 MOVOU X3, -16(AX)(R8*1)
6734 MOVQ BX, AX
6735
6736emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
6737 ADDL R11, CX
6738 ADDL $0x04, R11
6739 MOVL CX, 12(SP)
6740
6741 // emitRepeat
6742emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
6743 MOVL R11, BX
6744 LEAL -4(R11), R11
6745 CMPL BX, $0x08
6746 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
6747 CMPL BX, $0x0c
6748 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
6749 CMPL DI, $0x00000800
6750 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
6751
6752cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
6753 CMPL R11, $0x00000104
6754 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm
6755 CMPL R11, $0x00010100
6756 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm
6757 CMPL R11, $0x0100ffff
6758 JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm
6759 LEAL -16842747(R11), R11
6760 MOVL $0xfffb001d, (AX)
6761 MOVB $0xff, 4(AX)
6762 ADDQ $0x05, AX
6763 JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
6764
6765repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
6766 LEAL -65536(R11), R11
6767 MOVL R11, DI
6768 MOVW $0x001d, (AX)
6769 MOVW R11, 2(AX)
6770 SARL $0x10, DI
6771 MOVB DI, 4(AX)
6772 ADDQ $0x05, AX
6773 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6774
6775repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
6776 LEAL -256(R11), R11
6777 MOVW $0x0019, (AX)
6778 MOVW R11, 2(AX)
6779 ADDQ $0x04, AX
6780 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6781
6782repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
6783 LEAL -4(R11), R11
6784 MOVW $0x0015, (AX)
6785 MOVB R11, 2(AX)
6786 ADDQ $0x03, AX
6787 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6788
6789repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
6790 SHLL $0x02, R11
6791 ORL $0x01, R11
6792 MOVW R11, (AX)
6793 ADDQ $0x02, AX
6794 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6795
6796repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
6797 XORQ BX, BX
6798 LEAL 1(BX)(R11*4), R11
6799 MOVB DI, 1(AX)
6800 SARL $0x08, DI
6801 SHLL $0x05, DI
6802 ORL DI, R11
6803 MOVB R11, (AX)
6804 ADDQ $0x02, AX
6805
6806match_nolit_emitcopy_end_encodeBetterBlockAsm:
6807 CMPL CX, 8(SP)
6808 JAE emit_remainder_encodeBetterBlockAsm
6809 CMPQ AX, (SP)
6810 JB match_nolit_dst_ok_encodeBetterBlockAsm
6811 MOVQ $0x00000000, ret+48(FP)
6812 RET
6813
6814match_nolit_dst_ok_encodeBetterBlockAsm:
6815 MOVQ $0x00cf1bbcdcbfa563, BX
6816 MOVQ $0x9e3779b1, DI
6817 LEAQ 1(SI), SI
6818 LEAQ -2(CX), R8
6819 MOVQ (DX)(SI*1), R9
6820 MOVQ 1(DX)(SI*1), R10
6821 MOVQ (DX)(R8*1), R11
6822 MOVQ 1(DX)(R8*1), R12
6823 SHLQ $0x08, R9
6824 IMULQ BX, R9
6825 SHRQ $0x2f, R9
6826 SHLQ $0x20, R10
6827 IMULQ DI, R10
6828 SHRQ $0x32, R10
6829 SHLQ $0x08, R11
6830 IMULQ BX, R11
6831 SHRQ $0x2f, R11
6832 SHLQ $0x20, R12
6833 IMULQ DI, R12
6834 SHRQ $0x32, R12
6835 LEAQ 1(SI), DI
6836 LEAQ 1(R8), R13
6837 MOVL SI, 24(SP)(R9*4)
6838 MOVL R8, 24(SP)(R11*4)
6839 MOVL DI, 524312(SP)(R10*4)
6840 MOVL R13, 524312(SP)(R12*4)
6841 LEAQ 1(R8)(SI*1), DI
6842 SHRQ $0x01, DI
6843 ADDQ $0x01, SI
6844 SUBQ $0x01, R8
6845
6846index_loop_encodeBetterBlockAsm:
6847 CMPQ DI, R8
6848 JAE search_loop_encodeBetterBlockAsm
6849 MOVQ (DX)(SI*1), R9
6850 MOVQ (DX)(DI*1), R10
6851 SHLQ $0x08, R9
6852 IMULQ BX, R9
6853 SHRQ $0x2f, R9
6854 SHLQ $0x08, R10
6855 IMULQ BX, R10
6856 SHRQ $0x2f, R10
6857 MOVL SI, 24(SP)(R9*4)
6858 MOVL DI, 24(SP)(R10*4)
6859 ADDQ $0x02, SI
6860 ADDQ $0x02, DI
6861 JMP index_loop_encodeBetterBlockAsm
6862
6863emit_remainder_encodeBetterBlockAsm:
6864 MOVQ src_len+32(FP), CX
6865 SUBL 12(SP), CX
6866 LEAQ 5(AX)(CX*1), CX
6867 CMPQ CX, (SP)
6868 JB emit_remainder_ok_encodeBetterBlockAsm
6869 MOVQ $0x00000000, ret+48(FP)
6870 RET
6871
6872emit_remainder_ok_encodeBetterBlockAsm:
6873 MOVQ src_len+32(FP), CX
6874 MOVL 12(SP), BX
6875 CMPL BX, CX
6876 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
6877 MOVL CX, SI
6878 MOVL CX, 12(SP)
6879 LEAQ (DX)(BX*1), CX
6880 SUBL BX, SI
6881 LEAL -1(SI), DX
6882 CMPL DX, $0x3c
6883 JB one_byte_emit_remainder_encodeBetterBlockAsm
6884 CMPL DX, $0x00000100
6885 JB two_bytes_emit_remainder_encodeBetterBlockAsm
6886 CMPL DX, $0x00010000
6887 JB three_bytes_emit_remainder_encodeBetterBlockAsm
6888 CMPL DX, $0x01000000
6889 JB four_bytes_emit_remainder_encodeBetterBlockAsm
6890 MOVB $0xfc, (AX)
6891 MOVL DX, 1(AX)
6892 ADDQ $0x05, AX
6893 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6894
6895four_bytes_emit_remainder_encodeBetterBlockAsm:
6896 MOVL DX, BX
6897 SHRL $0x10, BX
6898 MOVB $0xf8, (AX)
6899 MOVW DX, 1(AX)
6900 MOVB BL, 3(AX)
6901 ADDQ $0x04, AX
6902 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6903
6904three_bytes_emit_remainder_encodeBetterBlockAsm:
6905 MOVB $0xf4, (AX)
6906 MOVW DX, 1(AX)
6907 ADDQ $0x03, AX
6908 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6909
6910two_bytes_emit_remainder_encodeBetterBlockAsm:
6911 MOVB $0xf0, (AX)
6912 MOVB DL, 1(AX)
6913 ADDQ $0x02, AX
6914 CMPL DX, $0x40
6915 JB memmove_emit_remainder_encodeBetterBlockAsm
6916 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6917
6918one_byte_emit_remainder_encodeBetterBlockAsm:
6919 SHLB $0x02, DL
6920 MOVB DL, (AX)
6921 ADDQ $0x01, AX
6922
6923memmove_emit_remainder_encodeBetterBlockAsm:
6924 LEAQ (AX)(SI*1), DX
6925 MOVL SI, BX
6926
6927 // genMemMoveShort
6928 CMPQ BX, $0x03
6929 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
6930 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
6931 CMPQ BX, $0x08
6932 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
6933 CMPQ BX, $0x10
6934 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
6935 CMPQ BX, $0x20
6936 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
6937 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
6938
6939emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
6940 MOVB (CX), SI
6941 MOVB -1(CX)(BX*1), CL
6942 MOVB SI, (AX)
6943 MOVB CL, -1(AX)(BX*1)
6944 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6945
6946emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
6947 MOVW (CX), SI
6948 MOVB 2(CX), CL
6949 MOVW SI, (AX)
6950 MOVB CL, 2(AX)
6951 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6952
6953emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
6954 MOVL (CX), SI
6955 MOVL -4(CX)(BX*1), CX
6956 MOVL SI, (AX)
6957 MOVL CX, -4(AX)(BX*1)
6958 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6959
6960emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
6961 MOVQ (CX), SI
6962 MOVQ -8(CX)(BX*1), CX
6963 MOVQ SI, (AX)
6964 MOVQ CX, -8(AX)(BX*1)
6965 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6966
6967emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
6968 MOVOU (CX), X0
6969 MOVOU -16(CX)(BX*1), X1
6970 MOVOU X0, (AX)
6971 MOVOU X1, -16(AX)(BX*1)
6972 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6973
6974emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
6975 MOVOU (CX), X0
6976 MOVOU 16(CX), X1
6977 MOVOU -32(CX)(BX*1), X2
6978 MOVOU -16(CX)(BX*1), X3
6979 MOVOU X0, (AX)
6980 MOVOU X1, 16(AX)
6981 MOVOU X2, -32(AX)(BX*1)
6982 MOVOU X3, -16(AX)(BX*1)
6983
6984memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
6985 MOVQ DX, AX
6986 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
6987
6988memmove_long_emit_remainder_encodeBetterBlockAsm:
6989 LEAQ (AX)(SI*1), DX
6990 MOVL SI, BX
6991
6992 // genMemMoveLong
6993 MOVOU (CX), X0
6994 MOVOU 16(CX), X1
6995 MOVOU -32(CX)(BX*1), X2
6996 MOVOU -16(CX)(BX*1), X3
6997 MOVQ BX, DI
6998 SHRQ $0x05, DI
6999 MOVQ AX, SI
7000 ANDL $0x0000001f, SI
7001 MOVQ $0x00000040, R8
7002 SUBQ SI, R8
7003 DECQ DI
7004 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
7005 LEAQ -32(CX)(R8*1), SI
7006 LEAQ -32(AX)(R8*1), R9
7007
7008emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
7009 MOVOU (SI), X4
7010 MOVOU 16(SI), X5
7011 MOVOA X4, (R9)
7012 MOVOA X5, 16(R9)
7013 ADDQ $0x20, R9
7014 ADDQ $0x20, SI
7015 ADDQ $0x20, R8
7016 DECQ DI
7017 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
7018
7019emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
7020 MOVOU -32(CX)(R8*1), X4
7021 MOVOU -16(CX)(R8*1), X5
7022 MOVOA X4, -32(AX)(R8*1)
7023 MOVOA X5, -16(AX)(R8*1)
7024 ADDQ $0x20, R8
7025 CMPQ BX, R8
7026 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
7027 MOVOU X0, (AX)
7028 MOVOU X1, 16(AX)
7029 MOVOU X2, -32(AX)(BX*1)
7030 MOVOU X3, -16(AX)(BX*1)
7031 MOVQ DX, AX
7032
7033emit_literal_done_emit_remainder_encodeBetterBlockAsm:
7034 MOVQ dst_base+0(FP), CX
7035 SUBQ CX, AX
7036 MOVQ AX, ret+48(FP)
7037 RET
7038
7039// func encodeBetterBlockAsm4MB(dst []byte, src []byte) int
7040// Requires: BMI, SSE2
7041TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56
7042 MOVQ dst_base+0(FP), AX
7043 MOVQ $0x00001200, CX
7044 LEAQ 24(SP), DX
7045 PXOR X0, X0
7046
7047zero_loop_encodeBetterBlockAsm4MB:
7048 MOVOU X0, (DX)
7049 MOVOU X0, 16(DX)
7050 MOVOU X0, 32(DX)
7051 MOVOU X0, 48(DX)
7052 MOVOU X0, 64(DX)
7053 MOVOU X0, 80(DX)
7054 MOVOU X0, 96(DX)
7055 MOVOU X0, 112(DX)
7056 ADDQ $0x80, DX
7057 DECQ CX
7058 JNZ zero_loop_encodeBetterBlockAsm4MB
7059 MOVL $0x00000000, 12(SP)
7060 MOVQ src_len+32(FP), CX
7061 LEAQ -6(CX), DX
7062 LEAQ -8(CX), BX
7063 MOVL BX, 8(SP)
7064 SHRQ $0x05, CX
7065 SUBL CX, DX
7066 LEAQ (AX)(DX*1), DX
7067 MOVQ DX, (SP)
7068 MOVL $0x00000001, CX
7069 MOVL $0x00000000, 16(SP)
7070 MOVQ src_base+24(FP), DX
7071
7072search_loop_encodeBetterBlockAsm4MB:
7073 MOVL CX, BX
7074 SUBL 12(SP), BX
7075 SHRL $0x07, BX
7076 CMPL BX, $0x63
7077 JBE check_maxskip_ok_encodeBetterBlockAsm4MB
7078 LEAL 100(CX), BX
7079 JMP check_maxskip_cont_encodeBetterBlockAsm4MB
7080
7081check_maxskip_ok_encodeBetterBlockAsm4MB:
7082 LEAL 1(CX)(BX*1), BX
7083
7084check_maxskip_cont_encodeBetterBlockAsm4MB:
7085 CMPL BX, 8(SP)
7086 JAE emit_remainder_encodeBetterBlockAsm4MB
7087 MOVQ (DX)(CX*1), SI
7088 MOVL BX, 20(SP)
7089 MOVQ $0x00cf1bbcdcbfa563, R8
7090 MOVQ $0x9e3779b1, BX
7091 MOVQ SI, R9
7092 MOVQ SI, R10
7093 SHLQ $0x08, R9
7094 IMULQ R8, R9
7095 SHRQ $0x2f, R9
7096 SHLQ $0x20, R10
7097 IMULQ BX, R10
7098 SHRQ $0x32, R10
7099 MOVL 24(SP)(R9*4), BX
7100 MOVL 524312(SP)(R10*4), DI
7101 MOVL CX, 24(SP)(R9*4)
7102 MOVL CX, 524312(SP)(R10*4)
7103 MOVQ (DX)(BX*1), R9
7104 MOVQ (DX)(DI*1), R10
7105 CMPQ R9, SI
7106 JEQ candidate_match_encodeBetterBlockAsm4MB
7107 CMPQ R10, SI
7108 JNE no_short_found_encodeBetterBlockAsm4MB
7109 MOVL DI, BX
7110 JMP candidate_match_encodeBetterBlockAsm4MB
7111
7112no_short_found_encodeBetterBlockAsm4MB:
7113 CMPL R9, SI
7114 JEQ candidate_match_encodeBetterBlockAsm4MB
7115 CMPL R10, SI
7116 JEQ candidateS_match_encodeBetterBlockAsm4MB
7117 MOVL 20(SP), CX
7118 JMP search_loop_encodeBetterBlockAsm4MB
7119
7120candidateS_match_encodeBetterBlockAsm4MB:
7121 SHRQ $0x08, SI
7122 MOVQ SI, R9
7123 SHLQ $0x08, R9
7124 IMULQ R8, R9
7125 SHRQ $0x2f, R9
7126 MOVL 24(SP)(R9*4), BX
7127 INCL CX
7128 MOVL CX, 24(SP)(R9*4)
7129 CMPL (DX)(BX*1), SI
7130 JEQ candidate_match_encodeBetterBlockAsm4MB
7131 DECL CX
7132 MOVL DI, BX
7133
7134candidate_match_encodeBetterBlockAsm4MB:
7135 MOVL 12(SP), SI
7136 TESTL BX, BX
7137 JZ match_extend_back_end_encodeBetterBlockAsm4MB
7138
7139match_extend_back_loop_encodeBetterBlockAsm4MB:
7140 CMPL CX, SI
7141 JBE match_extend_back_end_encodeBetterBlockAsm4MB
7142 MOVB -1(DX)(BX*1), DI
7143 MOVB -1(DX)(CX*1), R8
7144 CMPB DI, R8
7145 JNE match_extend_back_end_encodeBetterBlockAsm4MB
7146 LEAL -1(CX), CX
7147 DECL BX
7148 JZ match_extend_back_end_encodeBetterBlockAsm4MB
7149 JMP match_extend_back_loop_encodeBetterBlockAsm4MB
7150
7151match_extend_back_end_encodeBetterBlockAsm4MB:
7152 MOVL CX, SI
7153 SUBL 12(SP), SI
7154 LEAQ 4(AX)(SI*1), SI
7155 CMPQ SI, (SP)
7156 JB match_dst_size_check_encodeBetterBlockAsm4MB
7157 MOVQ $0x00000000, ret+48(FP)
7158 RET
7159
7160match_dst_size_check_encodeBetterBlockAsm4MB:
7161 MOVL CX, SI
7162 ADDL $0x04, CX
7163 ADDL $0x04, BX
7164 MOVQ src_len+32(FP), DI
7165 SUBL CX, DI
7166 LEAQ (DX)(CX*1), R8
7167 LEAQ (DX)(BX*1), R9
7168
7169 // matchLen
7170 XORL R11, R11
7171
7172matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
7173 CMPL DI, $0x10
7174 JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
7175 MOVQ (R8)(R11*1), R10
7176 MOVQ 8(R8)(R11*1), R12
7177 XORQ (R9)(R11*1), R10
7178 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
7179 XORQ 8(R9)(R11*1), R12
7180 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
7181 LEAL -16(DI), DI
7182 LEAL 16(R11), R11
7183 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB
7184
7185matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
7186#ifdef GOAMD64_v3
7187 TZCNTQ R12, R12
7188
7189#else
7190 BSFQ R12, R12
7191
7192#endif
7193 SARQ $0x03, R12
7194 LEAL 8(R11)(R12*1), R11
7195 JMP match_nolit_end_encodeBetterBlockAsm4MB
7196
7197matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
7198 CMPL DI, $0x08
7199 JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
7200 MOVQ (R8)(R11*1), R10
7201 XORQ (R9)(R11*1), R10
7202 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
7203 LEAL -8(DI), DI
7204 LEAL 8(R11), R11
7205 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
7206
7207matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
7208#ifdef GOAMD64_v3
7209 TZCNTQ R10, R10
7210
7211#else
7212 BSFQ R10, R10
7213
7214#endif
7215 SARQ $0x03, R10
7216 LEAL (R11)(R10*1), R11
7217 JMP match_nolit_end_encodeBetterBlockAsm4MB
7218
7219matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
7220 CMPL DI, $0x04
7221 JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
7222 MOVL (R8)(R11*1), R10
7223 CMPL (R9)(R11*1), R10
7224 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
7225 LEAL -4(DI), DI
7226 LEAL 4(R11), R11
7227
7228matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
7229 CMPL DI, $0x01
7230 JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
7231 JB match_nolit_end_encodeBetterBlockAsm4MB
7232 MOVW (R8)(R11*1), R10
7233 CMPW (R9)(R11*1), R10
7234 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
7235 LEAL 2(R11), R11
7236 SUBL $0x02, DI
7237 JZ match_nolit_end_encodeBetterBlockAsm4MB
7238
7239matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
7240 MOVB (R8)(R11*1), R10
7241 CMPB (R9)(R11*1), R10
7242 JNE match_nolit_end_encodeBetterBlockAsm4MB
7243 LEAL 1(R11), R11
7244
7245match_nolit_end_encodeBetterBlockAsm4MB:
7246 MOVL CX, DI
7247 SUBL BX, DI
7248
7249 // Check if repeat
7250 CMPL 16(SP), DI
7251 JEQ match_is_repeat_encodeBetterBlockAsm4MB
7252 CMPL R11, $0x01
7253 JA match_length_ok_encodeBetterBlockAsm4MB
7254 CMPL DI, $0x0000ffff
7255 JBE match_length_ok_encodeBetterBlockAsm4MB
7256 MOVL 20(SP), CX
7257 INCL CX
7258 JMP search_loop_encodeBetterBlockAsm4MB
7259
7260match_length_ok_encodeBetterBlockAsm4MB:
7261 MOVL DI, 16(SP)
7262 MOVL 12(SP), BX
7263 CMPL BX, SI
7264 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
7265 MOVL SI, R8
7266 MOVL SI, 12(SP)
7267 LEAQ (DX)(BX*1), R9
7268 SUBL BX, R8
7269 LEAL -1(R8), BX
7270 CMPL BX, $0x3c
7271 JB one_byte_match_emit_encodeBetterBlockAsm4MB
7272 CMPL BX, $0x00000100
7273 JB two_bytes_match_emit_encodeBetterBlockAsm4MB
7274 CMPL BX, $0x00010000
7275 JB three_bytes_match_emit_encodeBetterBlockAsm4MB
7276 MOVL BX, R10
7277 SHRL $0x10, R10
7278 MOVB $0xf8, (AX)
7279 MOVW BX, 1(AX)
7280 MOVB R10, 3(AX)
7281 ADDQ $0x04, AX
7282 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7283
7284three_bytes_match_emit_encodeBetterBlockAsm4MB:
7285 MOVB $0xf4, (AX)
7286 MOVW BX, 1(AX)
7287 ADDQ $0x03, AX
7288 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7289
7290two_bytes_match_emit_encodeBetterBlockAsm4MB:
7291 MOVB $0xf0, (AX)
7292 MOVB BL, 1(AX)
7293 ADDQ $0x02, AX
7294 CMPL BX, $0x40
7295 JB memmove_match_emit_encodeBetterBlockAsm4MB
7296 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7297
7298one_byte_match_emit_encodeBetterBlockAsm4MB:
7299 SHLB $0x02, BL
7300 MOVB BL, (AX)
7301 ADDQ $0x01, AX
7302
7303memmove_match_emit_encodeBetterBlockAsm4MB:
7304 LEAQ (AX)(R8*1), BX
7305
7306 // genMemMoveShort
7307 CMPQ R8, $0x04
7308 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
7309 CMPQ R8, $0x08
7310 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
7311 CMPQ R8, $0x10
7312 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
7313 CMPQ R8, $0x20
7314 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
7315 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
7316
7317emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
7318 MOVL (R9), R10
7319 MOVL R10, (AX)
7320 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7321
7322emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
7323 MOVL (R9), R10
7324 MOVL -4(R9)(R8*1), R9
7325 MOVL R10, (AX)
7326 MOVL R9, -4(AX)(R8*1)
7327 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7328
7329emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
7330 MOVQ (R9), R10
7331 MOVQ -8(R9)(R8*1), R9
7332 MOVQ R10, (AX)
7333 MOVQ R9, -8(AX)(R8*1)
7334 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7335
7336emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
7337 MOVOU (R9), X0
7338 MOVOU -16(R9)(R8*1), X1
7339 MOVOU X0, (AX)
7340 MOVOU X1, -16(AX)(R8*1)
7341 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7342
7343emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
7344 MOVOU (R9), X0
7345 MOVOU 16(R9), X1
7346 MOVOU -32(R9)(R8*1), X2
7347 MOVOU -16(R9)(R8*1), X3
7348 MOVOU X0, (AX)
7349 MOVOU X1, 16(AX)
7350 MOVOU X2, -32(AX)(R8*1)
7351 MOVOU X3, -16(AX)(R8*1)
7352
7353memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
7354 MOVQ BX, AX
7355 JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
7356
7357memmove_long_match_emit_encodeBetterBlockAsm4MB:
7358 LEAQ (AX)(R8*1), BX
7359
7360 // genMemMoveLong
7361 MOVOU (R9), X0
7362 MOVOU 16(R9), X1
7363 MOVOU -32(R9)(R8*1), X2
7364 MOVOU -16(R9)(R8*1), X3
7365 MOVQ R8, R12
7366 SHRQ $0x05, R12
7367 MOVQ AX, R10
7368 ANDL $0x0000001f, R10
7369 MOVQ $0x00000040, R13
7370 SUBQ R10, R13
7371 DECQ R12
7372 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7373 LEAQ -32(R9)(R13*1), R10
7374 LEAQ -32(AX)(R13*1), R14
7375
7376emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
7377 MOVOU (R10), X4
7378 MOVOU 16(R10), X5
7379 MOVOA X4, (R14)
7380 MOVOA X5, 16(R14)
7381 ADDQ $0x20, R14
7382 ADDQ $0x20, R10
7383 ADDQ $0x20, R13
7384 DECQ R12
7385 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
7386
7387emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
7388 MOVOU -32(R9)(R13*1), X4
7389 MOVOU -16(R9)(R13*1), X5
7390 MOVOA X4, -32(AX)(R13*1)
7391 MOVOA X5, -16(AX)(R13*1)
7392 ADDQ $0x20, R13
7393 CMPQ R8, R13
7394 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7395 MOVOU X0, (AX)
7396 MOVOU X1, 16(AX)
7397 MOVOU X2, -32(AX)(R8*1)
7398 MOVOU X3, -16(AX)(R8*1)
7399 MOVQ BX, AX
7400
7401emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
7402 ADDL R11, CX
7403 ADDL $0x04, R11
7404 MOVL CX, 12(SP)
7405
7406 // emitCopy
7407 CMPL DI, $0x00010000
7408 JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
7409 CMPL R11, $0x40
7410 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
7411 MOVB $0xff, (AX)
7412 MOVL DI, 1(AX)
7413 LEAL -64(R11), R11
7414 ADDQ $0x05, AX
7415 CMPL R11, $0x04
7416 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
7417
7418 // emitRepeat
7419 MOVL R11, BX
7420 LEAL -4(R11), R11
7421 CMPL BX, $0x08
7422 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7423 CMPL BX, $0x0c
7424 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7425 CMPL DI, $0x00000800
7426 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7427
7428cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7429 CMPL R11, $0x00000104
7430 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7431 CMPL R11, $0x00010100
7432 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7433 LEAL -65536(R11), R11
7434 MOVL R11, DI
7435 MOVW $0x001d, (AX)
7436 MOVW R11, 2(AX)
7437 SARL $0x10, DI
7438 MOVB DI, 4(AX)
7439 ADDQ $0x05, AX
7440 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7441
7442repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7443 LEAL -256(R11), R11
7444 MOVW $0x0019, (AX)
7445 MOVW R11, 2(AX)
7446 ADDQ $0x04, AX
7447 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7448
7449repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7450 LEAL -4(R11), R11
7451 MOVW $0x0015, (AX)
7452 MOVB R11, 2(AX)
7453 ADDQ $0x03, AX
7454 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7455
7456repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7457 SHLL $0x02, R11
7458 ORL $0x01, R11
7459 MOVW R11, (AX)
7460 ADDQ $0x02, AX
7461 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7462
7463repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7464 XORQ BX, BX
7465 LEAL 1(BX)(R11*4), R11
7466 MOVB DI, 1(AX)
7467 SARL $0x08, DI
7468 SHLL $0x05, DI
7469 ORL DI, R11
7470 MOVB R11, (AX)
7471 ADDQ $0x02, AX
7472 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7473
7474four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
7475 TESTL R11, R11
7476 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7477 XORL BX, BX
7478 LEAL -1(BX)(R11*4), R11
7479 MOVB R11, (AX)
7480 MOVL DI, 1(AX)
7481 ADDQ $0x05, AX
7482 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7483
7484two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
7485 CMPL R11, $0x40
7486 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
7487 CMPL DI, $0x00000800
7488 JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
7489 MOVL $0x00000001, BX
7490 LEAL 16(BX), BX
7491 MOVB DI, 1(AX)
7492 SHRL $0x08, DI
7493 SHLL $0x05, DI
7494 ORL DI, BX
7495 MOVB BL, (AX)
7496 ADDQ $0x02, AX
7497 SUBL $0x08, R11
7498
7499 // emitRepeat
7500 LEAL -4(R11), R11
7501 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7502 MOVL R11, BX
7503 LEAL -4(R11), R11
7504 CMPL BX, $0x08
7505 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7506 CMPL BX, $0x0c
7507 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7508 CMPL DI, $0x00000800
7509 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7510
7511cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7512 CMPL R11, $0x00000104
7513 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7514 CMPL R11, $0x00010100
7515 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7516 LEAL -65536(R11), R11
7517 MOVL R11, DI
7518 MOVW $0x001d, (AX)
7519 MOVW R11, 2(AX)
7520 SARL $0x10, DI
7521 MOVB DI, 4(AX)
7522 ADDQ $0x05, AX
7523 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7524
7525repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7526 LEAL -256(R11), R11
7527 MOVW $0x0019, (AX)
7528 MOVW R11, 2(AX)
7529 ADDQ $0x04, AX
7530 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7531
7532repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7533 LEAL -4(R11), R11
7534 MOVW $0x0015, (AX)
7535 MOVB R11, 2(AX)
7536 ADDQ $0x03, AX
7537 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7538
7539repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7540 SHLL $0x02, R11
7541 ORL $0x01, R11
7542 MOVW R11, (AX)
7543 ADDQ $0x02, AX
7544 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7545
7546repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7547 XORQ BX, BX
7548 LEAL 1(BX)(R11*4), R11
7549 MOVB DI, 1(AX)
7550 SARL $0x08, DI
7551 SHLL $0x05, DI
7552 ORL DI, R11
7553 MOVB R11, (AX)
7554 ADDQ $0x02, AX
7555 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7556
7557long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
7558 MOVB $0xee, (AX)
7559 MOVW DI, 1(AX)
7560 LEAL -60(R11), R11
7561 ADDQ $0x03, AX
7562
7563 // emitRepeat
7564 MOVL R11, BX
7565 LEAL -4(R11), R11
7566 CMPL BX, $0x08
7567 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7568 CMPL BX, $0x0c
7569 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7570 CMPL DI, $0x00000800
7571 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7572
7573cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7574 CMPL R11, $0x00000104
7575 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7576 CMPL R11, $0x00010100
7577 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7578 LEAL -65536(R11), R11
7579 MOVL R11, DI
7580 MOVW $0x001d, (AX)
7581 MOVW R11, 2(AX)
7582 SARL $0x10, DI
7583 MOVB DI, 4(AX)
7584 ADDQ $0x05, AX
7585 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7586
7587repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7588 LEAL -256(R11), R11
7589 MOVW $0x0019, (AX)
7590 MOVW R11, 2(AX)
7591 ADDQ $0x04, AX
7592 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7593
7594repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7595 LEAL -4(R11), R11
7596 MOVW $0x0015, (AX)
7597 MOVB R11, 2(AX)
7598 ADDQ $0x03, AX
7599 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7600
7601repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7602 SHLL $0x02, R11
7603 ORL $0x01, R11
7604 MOVW R11, (AX)
7605 ADDQ $0x02, AX
7606 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7607
7608repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7609 XORQ BX, BX
7610 LEAL 1(BX)(R11*4), R11
7611 MOVB DI, 1(AX)
7612 SARL $0x08, DI
7613 SHLL $0x05, DI
7614 ORL DI, R11
7615 MOVB R11, (AX)
7616 ADDQ $0x02, AX
7617 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7618
7619two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
7620 MOVL R11, BX
7621 SHLL $0x02, BX
7622 CMPL R11, $0x0c
7623 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
7624 CMPL DI, $0x00000800
7625 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
7626 LEAL -15(BX), BX
7627 MOVB DI, 1(AX)
7628 SHRL $0x08, DI
7629 SHLL $0x05, DI
7630 ORL DI, BX
7631 MOVB BL, (AX)
7632 ADDQ $0x02, AX
7633 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7634
7635emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
7636 LEAL -2(BX), BX
7637 MOVB BL, (AX)
7638 MOVW DI, 1(AX)
7639 ADDQ $0x03, AX
7640 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7641
7642match_is_repeat_encodeBetterBlockAsm4MB:
7643 MOVL 12(SP), BX
7644 CMPL BX, SI
7645 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
7646 MOVL SI, R8
7647 MOVL SI, 12(SP)
7648 LEAQ (DX)(BX*1), R9
7649 SUBL BX, R8
7650 LEAL -1(R8), BX
7651 CMPL BX, $0x3c
7652 JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
7653 CMPL BX, $0x00000100
7654 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
7655 CMPL BX, $0x00010000
7656 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
7657 MOVL BX, R10
7658 SHRL $0x10, R10
7659 MOVB $0xf8, (AX)
7660 MOVW BX, 1(AX)
7661 MOVB R10, 3(AX)
7662 ADDQ $0x04, AX
7663 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7664
7665three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
7666 MOVB $0xf4, (AX)
7667 MOVW BX, 1(AX)
7668 ADDQ $0x03, AX
7669 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7670
7671two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
7672 MOVB $0xf0, (AX)
7673 MOVB BL, 1(AX)
7674 ADDQ $0x02, AX
7675 CMPL BX, $0x40
7676 JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB
7677 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7678
7679one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
7680 SHLB $0x02, BL
7681 MOVB BL, (AX)
7682 ADDQ $0x01, AX
7683
7684memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
7685 LEAQ (AX)(R8*1), BX
7686
7687 // genMemMoveShort
7688 CMPQ R8, $0x04
7689 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
7690 CMPQ R8, $0x08
7691 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
7692 CMPQ R8, $0x10
7693 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
7694 CMPQ R8, $0x20
7695 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
7696 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
7697
7698emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
7699 MOVL (R9), R10
7700 MOVL R10, (AX)
7701 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7702
7703emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
7704 MOVL (R9), R10
7705 MOVL -4(R9)(R8*1), R9
7706 MOVL R10, (AX)
7707 MOVL R9, -4(AX)(R8*1)
7708 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7709
7710emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
7711 MOVQ (R9), R10
7712 MOVQ -8(R9)(R8*1), R9
7713 MOVQ R10, (AX)
7714 MOVQ R9, -8(AX)(R8*1)
7715 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7716
7717emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
7718 MOVOU (R9), X0
7719 MOVOU -16(R9)(R8*1), X1
7720 MOVOU X0, (AX)
7721 MOVOU X1, -16(AX)(R8*1)
7722 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7723
7724emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
7725 MOVOU (R9), X0
7726 MOVOU 16(R9), X1
7727 MOVOU -32(R9)(R8*1), X2
7728 MOVOU -16(R9)(R8*1), X3
7729 MOVOU X0, (AX)
7730 MOVOU X1, 16(AX)
7731 MOVOU X2, -32(AX)(R8*1)
7732 MOVOU X3, -16(AX)(R8*1)
7733
7734memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
7735 MOVQ BX, AX
7736 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
7737
7738memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
7739 LEAQ (AX)(R8*1), BX
7740
7741 // genMemMoveLong
7742 MOVOU (R9), X0
7743 MOVOU 16(R9), X1
7744 MOVOU -32(R9)(R8*1), X2
7745 MOVOU -16(R9)(R8*1), X3
7746 MOVQ R8, R12
7747 SHRQ $0x05, R12
7748 MOVQ AX, R10
7749 ANDL $0x0000001f, R10
7750 MOVQ $0x00000040, R13
7751 SUBQ R10, R13
7752 DECQ R12
7753 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7754 LEAQ -32(R9)(R13*1), R10
7755 LEAQ -32(AX)(R13*1), R14
7756
7757emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
7758 MOVOU (R10), X4
7759 MOVOU 16(R10), X5
7760 MOVOA X4, (R14)
7761 MOVOA X5, 16(R14)
7762 ADDQ $0x20, R14
7763 ADDQ $0x20, R10
7764 ADDQ $0x20, R13
7765 DECQ R12
7766 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
7767
7768emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
7769 MOVOU -32(R9)(R13*1), X4
7770 MOVOU -16(R9)(R13*1), X5
7771 MOVOA X4, -32(AX)(R13*1)
7772 MOVOA X5, -16(AX)(R13*1)
7773 ADDQ $0x20, R13
7774 CMPQ R8, R13
7775 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7776 MOVOU X0, (AX)
7777 MOVOU X1, 16(AX)
7778 MOVOU X2, -32(AX)(R8*1)
7779 MOVOU X3, -16(AX)(R8*1)
7780 MOVQ BX, AX
7781
7782emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
7783 ADDL R11, CX
7784 ADDL $0x04, R11
7785 MOVL CX, 12(SP)
7786
7787 // emitRepeat
7788 MOVL R11, BX
7789 LEAL -4(R11), R11
7790 CMPL BX, $0x08
7791 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
7792 CMPL BX, $0x0c
7793 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
7794 CMPL DI, $0x00000800
7795 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
7796
7797cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
7798 CMPL R11, $0x00000104
7799 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
7800 CMPL R11, $0x00010100
7801 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
7802 LEAL -65536(R11), R11
7803 MOVL R11, DI
7804 MOVW $0x001d, (AX)
7805 MOVW R11, 2(AX)
7806 SARL $0x10, DI
7807 MOVB DI, 4(AX)
7808 ADDQ $0x05, AX
7809 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7810
7811repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
7812 LEAL -256(R11), R11
7813 MOVW $0x0019, (AX)
7814 MOVW R11, 2(AX)
7815 ADDQ $0x04, AX
7816 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7817
7818repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
7819 LEAL -4(R11), R11
7820 MOVW $0x0015, (AX)
7821 MOVB R11, 2(AX)
7822 ADDQ $0x03, AX
7823 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7824
7825repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
7826 SHLL $0x02, R11
7827 ORL $0x01, R11
7828 MOVW R11, (AX)
7829 ADDQ $0x02, AX
7830 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7831
7832repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
7833 XORQ BX, BX
7834 LEAL 1(BX)(R11*4), R11
7835 MOVB DI, 1(AX)
7836 SARL $0x08, DI
7837 SHLL $0x05, DI
7838 ORL DI, R11
7839 MOVB R11, (AX)
7840 ADDQ $0x02, AX
7841
7842match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
7843 CMPL CX, 8(SP)
7844 JAE emit_remainder_encodeBetterBlockAsm4MB
7845 CMPQ AX, (SP)
7846 JB match_nolit_dst_ok_encodeBetterBlockAsm4MB
7847 MOVQ $0x00000000, ret+48(FP)
7848 RET
7849
7850match_nolit_dst_ok_encodeBetterBlockAsm4MB:
7851 MOVQ $0x00cf1bbcdcbfa563, BX
7852 MOVQ $0x9e3779b1, DI
7853 LEAQ 1(SI), SI
7854 LEAQ -2(CX), R8
7855 MOVQ (DX)(SI*1), R9
7856 MOVQ 1(DX)(SI*1), R10
7857 MOVQ (DX)(R8*1), R11
7858 MOVQ 1(DX)(R8*1), R12
7859 SHLQ $0x08, R9
7860 IMULQ BX, R9
7861 SHRQ $0x2f, R9
7862 SHLQ $0x20, R10
7863 IMULQ DI, R10
7864 SHRQ $0x32, R10
7865 SHLQ $0x08, R11
7866 IMULQ BX, R11
7867 SHRQ $0x2f, R11
7868 SHLQ $0x20, R12
7869 IMULQ DI, R12
7870 SHRQ $0x32, R12
7871 LEAQ 1(SI), DI
7872 LEAQ 1(R8), R13
7873 MOVL SI, 24(SP)(R9*4)
7874 MOVL R8, 24(SP)(R11*4)
7875 MOVL DI, 524312(SP)(R10*4)
7876 MOVL R13, 524312(SP)(R12*4)
7877 LEAQ 1(R8)(SI*1), DI
7878 SHRQ $0x01, DI
7879 ADDQ $0x01, SI
7880 SUBQ $0x01, R8
7881
7882index_loop_encodeBetterBlockAsm4MB:
7883 CMPQ DI, R8
7884 JAE search_loop_encodeBetterBlockAsm4MB
7885 MOVQ (DX)(SI*1), R9
7886 MOVQ (DX)(DI*1), R10
7887 SHLQ $0x08, R9
7888 IMULQ BX, R9
7889 SHRQ $0x2f, R9
7890 SHLQ $0x08, R10
7891 IMULQ BX, R10
7892 SHRQ $0x2f, R10
7893 MOVL SI, 24(SP)(R9*4)
7894 MOVL DI, 24(SP)(R10*4)
7895 ADDQ $0x02, SI
7896 ADDQ $0x02, DI
7897 JMP index_loop_encodeBetterBlockAsm4MB
7898
7899emit_remainder_encodeBetterBlockAsm4MB:
7900 MOVQ src_len+32(FP), CX
7901 SUBL 12(SP), CX
7902 LEAQ 4(AX)(CX*1), CX
7903 CMPQ CX, (SP)
7904 JB emit_remainder_ok_encodeBetterBlockAsm4MB
7905 MOVQ $0x00000000, ret+48(FP)
7906 RET
7907
7908emit_remainder_ok_encodeBetterBlockAsm4MB:
7909 MOVQ src_len+32(FP), CX
7910 MOVL 12(SP), BX
7911 CMPL BX, CX
7912 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
7913 MOVL CX, SI
7914 MOVL CX, 12(SP)
7915 LEAQ (DX)(BX*1), CX
7916 SUBL BX, SI
7917 LEAL -1(SI), DX
7918 CMPL DX, $0x3c
7919 JB one_byte_emit_remainder_encodeBetterBlockAsm4MB
7920 CMPL DX, $0x00000100
7921 JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB
7922 CMPL DX, $0x00010000
7923 JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB
7924 MOVL DX, BX
7925 SHRL $0x10, BX
7926 MOVB $0xf8, (AX)
7927 MOVW DX, 1(AX)
7928 MOVB BL, 3(AX)
7929 ADDQ $0x04, AX
7930 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7931
7932three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
7933 MOVB $0xf4, (AX)
7934 MOVW DX, 1(AX)
7935 ADDQ $0x03, AX
7936 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7937
7938two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
7939 MOVB $0xf0, (AX)
7940 MOVB DL, 1(AX)
7941 ADDQ $0x02, AX
7942 CMPL DX, $0x40
7943 JB memmove_emit_remainder_encodeBetterBlockAsm4MB
7944 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7945
7946one_byte_emit_remainder_encodeBetterBlockAsm4MB:
7947 SHLB $0x02, DL
7948 MOVB DL, (AX)
7949 ADDQ $0x01, AX
7950
7951memmove_emit_remainder_encodeBetterBlockAsm4MB:
7952 LEAQ (AX)(SI*1), DX
7953 MOVL SI, BX
7954
7955 // genMemMoveShort
7956 CMPQ BX, $0x03
7957 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
7958 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
7959 CMPQ BX, $0x08
7960 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
7961 CMPQ BX, $0x10
7962 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
7963 CMPQ BX, $0x20
7964 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
7965 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
7966
7967emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
7968 MOVB (CX), SI
7969 MOVB -1(CX)(BX*1), CL
7970 MOVB SI, (AX)
7971 MOVB CL, -1(AX)(BX*1)
7972 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
7973
7974emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
7975 MOVW (CX), SI
7976 MOVB 2(CX), CL
7977 MOVW SI, (AX)
7978 MOVB CL, 2(AX)
7979 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
7980
7981emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
7982 MOVL (CX), SI
7983 MOVL -4(CX)(BX*1), CX
7984 MOVL SI, (AX)
7985 MOVL CX, -4(AX)(BX*1)
7986 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
7987
7988emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
7989 MOVQ (CX), SI
7990 MOVQ -8(CX)(BX*1), CX
7991 MOVQ SI, (AX)
7992 MOVQ CX, -8(AX)(BX*1)
7993 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
7994
7995emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
7996 MOVOU (CX), X0
7997 MOVOU -16(CX)(BX*1), X1
7998 MOVOU X0, (AX)
7999 MOVOU X1, -16(AX)(BX*1)
8000 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8001
8002emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
8003 MOVOU (CX), X0
8004 MOVOU 16(CX), X1
8005 MOVOU -32(CX)(BX*1), X2
8006 MOVOU -16(CX)(BX*1), X3
8007 MOVOU X0, (AX)
8008 MOVOU X1, 16(AX)
8009 MOVOU X2, -32(AX)(BX*1)
8010 MOVOU X3, -16(AX)(BX*1)
8011
8012memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
8013 MOVQ DX, AX
8014 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
8015
8016memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
8017 LEAQ (AX)(SI*1), DX
8018 MOVL SI, BX
8019
8020 // genMemMoveLong
8021 MOVOU (CX), X0
8022 MOVOU 16(CX), X1
8023 MOVOU -32(CX)(BX*1), X2
8024 MOVOU -16(CX)(BX*1), X3
8025 MOVQ BX, DI
8026 SHRQ $0x05, DI
8027 MOVQ AX, SI
8028 ANDL $0x0000001f, SI
8029 MOVQ $0x00000040, R8
8030 SUBQ SI, R8
8031 DECQ DI
8032 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
8033 LEAQ -32(CX)(R8*1), SI
8034 LEAQ -32(AX)(R8*1), R9
8035
8036emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
8037 MOVOU (SI), X4
8038 MOVOU 16(SI), X5
8039 MOVOA X4, (R9)
8040 MOVOA X5, 16(R9)
8041 ADDQ $0x20, R9
8042 ADDQ $0x20, SI
8043 ADDQ $0x20, R8
8044 DECQ DI
8045 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
8046
8047emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
8048 MOVOU -32(CX)(R8*1), X4
8049 MOVOU -16(CX)(R8*1), X5
8050 MOVOA X4, -32(AX)(R8*1)
8051 MOVOA X5, -16(AX)(R8*1)
8052 ADDQ $0x20, R8
8053 CMPQ BX, R8
8054 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
8055 MOVOU X0, (AX)
8056 MOVOU X1, 16(AX)
8057 MOVOU X2, -32(AX)(BX*1)
8058 MOVOU X3, -16(AX)(BX*1)
8059 MOVQ DX, AX
8060
8061emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
8062 MOVQ dst_base+0(FP), CX
8063 SUBQ CX, AX
8064 MOVQ AX, ret+48(FP)
8065 RET
8066
8067// func encodeBetterBlockAsm12B(dst []byte, src []byte) int
8068// Requires: BMI, SSE2
8069TEXT ·encodeBetterBlockAsm12B(SB), $81944-56
8070 MOVQ dst_base+0(FP), AX
8071 MOVQ $0x00000280, CX
8072 LEAQ 24(SP), DX
8073 PXOR X0, X0
8074
8075zero_loop_encodeBetterBlockAsm12B:
8076 MOVOU X0, (DX)
8077 MOVOU X0, 16(DX)
8078 MOVOU X0, 32(DX)
8079 MOVOU X0, 48(DX)
8080 MOVOU X0, 64(DX)
8081 MOVOU X0, 80(DX)
8082 MOVOU X0, 96(DX)
8083 MOVOU X0, 112(DX)
8084 ADDQ $0x80, DX
8085 DECQ CX
8086 JNZ zero_loop_encodeBetterBlockAsm12B
8087 MOVL $0x00000000, 12(SP)
8088 MOVQ src_len+32(FP), CX
8089 LEAQ -6(CX), DX
8090 LEAQ -8(CX), BX
8091 MOVL BX, 8(SP)
8092 SHRQ $0x05, CX
8093 SUBL CX, DX
8094 LEAQ (AX)(DX*1), DX
8095 MOVQ DX, (SP)
8096 MOVL $0x00000001, CX
8097 MOVL $0x00000000, 16(SP)
8098 MOVQ src_base+24(FP), DX
8099
8100search_loop_encodeBetterBlockAsm12B:
8101 MOVL CX, BX
8102 SUBL 12(SP), BX
8103 SHRL $0x06, BX
8104 LEAL 1(CX)(BX*1), BX
8105 CMPL BX, 8(SP)
8106 JAE emit_remainder_encodeBetterBlockAsm12B
8107 MOVQ (DX)(CX*1), SI
8108 MOVL BX, 20(SP)
8109 MOVQ $0x0000cf1bbcdcbf9b, R8
8110 MOVQ $0x9e3779b1, BX
8111 MOVQ SI, R9
8112 MOVQ SI, R10
8113 SHLQ $0x10, R9
8114 IMULQ R8, R9
8115 SHRQ $0x32, R9
8116 SHLQ $0x20, R10
8117 IMULQ BX, R10
8118 SHRQ $0x34, R10
8119 MOVL 24(SP)(R9*4), BX
8120 MOVL 65560(SP)(R10*4), DI
8121 MOVL CX, 24(SP)(R9*4)
8122 MOVL CX, 65560(SP)(R10*4)
8123 MOVQ (DX)(BX*1), R9
8124 MOVQ (DX)(DI*1), R10
8125 CMPQ R9, SI
8126 JEQ candidate_match_encodeBetterBlockAsm12B
8127 CMPQ R10, SI
8128 JNE no_short_found_encodeBetterBlockAsm12B
8129 MOVL DI, BX
8130 JMP candidate_match_encodeBetterBlockAsm12B
8131
8132no_short_found_encodeBetterBlockAsm12B:
8133 CMPL R9, SI
8134 JEQ candidate_match_encodeBetterBlockAsm12B
8135 CMPL R10, SI
8136 JEQ candidateS_match_encodeBetterBlockAsm12B
8137 MOVL 20(SP), CX
8138 JMP search_loop_encodeBetterBlockAsm12B
8139
8140candidateS_match_encodeBetterBlockAsm12B:
8141 SHRQ $0x08, SI
8142 MOVQ SI, R9
8143 SHLQ $0x10, R9
8144 IMULQ R8, R9
8145 SHRQ $0x32, R9
8146 MOVL 24(SP)(R9*4), BX
8147 INCL CX
8148 MOVL CX, 24(SP)(R9*4)
8149 CMPL (DX)(BX*1), SI
8150 JEQ candidate_match_encodeBetterBlockAsm12B
8151 DECL CX
8152 MOVL DI, BX
8153
8154candidate_match_encodeBetterBlockAsm12B:
8155 MOVL 12(SP), SI
8156 TESTL BX, BX
8157 JZ match_extend_back_end_encodeBetterBlockAsm12B
8158
8159match_extend_back_loop_encodeBetterBlockAsm12B:
8160 CMPL CX, SI
8161 JBE match_extend_back_end_encodeBetterBlockAsm12B
8162 MOVB -1(DX)(BX*1), DI
8163 MOVB -1(DX)(CX*1), R8
8164 CMPB DI, R8
8165 JNE match_extend_back_end_encodeBetterBlockAsm12B
8166 LEAL -1(CX), CX
8167 DECL BX
8168 JZ match_extend_back_end_encodeBetterBlockAsm12B
8169 JMP match_extend_back_loop_encodeBetterBlockAsm12B
8170
8171match_extend_back_end_encodeBetterBlockAsm12B:
8172 MOVL CX, SI
8173 SUBL 12(SP), SI
8174 LEAQ 3(AX)(SI*1), SI
8175 CMPQ SI, (SP)
8176 JB match_dst_size_check_encodeBetterBlockAsm12B
8177 MOVQ $0x00000000, ret+48(FP)
8178 RET
8179
8180match_dst_size_check_encodeBetterBlockAsm12B:
8181 MOVL CX, SI
8182 ADDL $0x04, CX
8183 ADDL $0x04, BX
8184 MOVQ src_len+32(FP), DI
8185 SUBL CX, DI
8186 LEAQ (DX)(CX*1), R8
8187 LEAQ (DX)(BX*1), R9
8188
8189 // matchLen
8190 XORL R11, R11
8191
8192matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B:
8193 CMPL DI, $0x10
8194 JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B
8195 MOVQ (R8)(R11*1), R10
8196 MOVQ 8(R8)(R11*1), R12
8197 XORQ (R9)(R11*1), R10
8198 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
8199 XORQ 8(R9)(R11*1), R12
8200 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B
8201 LEAL -16(DI), DI
8202 LEAL 16(R11), R11
8203 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B
8204
8205matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B:
8206#ifdef GOAMD64_v3
8207 TZCNTQ R12, R12
8208
8209#else
8210 BSFQ R12, R12
8211
8212#endif
8213 SARQ $0x03, R12
8214 LEAL 8(R11)(R12*1), R11
8215 JMP match_nolit_end_encodeBetterBlockAsm12B
8216
8217matchlen_match8_match_nolit_encodeBetterBlockAsm12B:
8218 CMPL DI, $0x08
8219 JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B
8220 MOVQ (R8)(R11*1), R10
8221 XORQ (R9)(R11*1), R10
8222 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
8223 LEAL -8(DI), DI
8224 LEAL 8(R11), R11
8225 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B
8226
8227matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B:
8228#ifdef GOAMD64_v3
8229 TZCNTQ R10, R10
8230
8231#else
8232 BSFQ R10, R10
8233
8234#endif
8235 SARQ $0x03, R10
8236 LEAL (R11)(R10*1), R11
8237 JMP match_nolit_end_encodeBetterBlockAsm12B
8238
8239matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
8240 CMPL DI, $0x04
8241 JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B
8242 MOVL (R8)(R11*1), R10
8243 CMPL (R9)(R11*1), R10
8244 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
8245 LEAL -4(DI), DI
8246 LEAL 4(R11), R11
8247
8248matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
8249 CMPL DI, $0x01
8250 JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
8251 JB match_nolit_end_encodeBetterBlockAsm12B
8252 MOVW (R8)(R11*1), R10
8253 CMPW (R9)(R11*1), R10
8254 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
8255 LEAL 2(R11), R11
8256 SUBL $0x02, DI
8257 JZ match_nolit_end_encodeBetterBlockAsm12B
8258
8259matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
8260 MOVB (R8)(R11*1), R10
8261 CMPB (R9)(R11*1), R10
8262 JNE match_nolit_end_encodeBetterBlockAsm12B
8263 LEAL 1(R11), R11
8264
8265match_nolit_end_encodeBetterBlockAsm12B:
8266 MOVL CX, DI
8267 SUBL BX, DI
8268
8269 // Check if repeat
8270 CMPL 16(SP), DI
8271 JEQ match_is_repeat_encodeBetterBlockAsm12B
8272 MOVL DI, 16(SP)
8273 MOVL 12(SP), BX
8274 CMPL BX, SI
8275 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
8276 MOVL SI, R8
8277 MOVL SI, 12(SP)
8278 LEAQ (DX)(BX*1), R9
8279 SUBL BX, R8
8280 LEAL -1(R8), BX
8281 CMPL BX, $0x3c
8282 JB one_byte_match_emit_encodeBetterBlockAsm12B
8283 CMPL BX, $0x00000100
8284 JB two_bytes_match_emit_encodeBetterBlockAsm12B
8285 JB three_bytes_match_emit_encodeBetterBlockAsm12B
8286
8287three_bytes_match_emit_encodeBetterBlockAsm12B:
8288 MOVB $0xf4, (AX)
8289 MOVW BX, 1(AX)
8290 ADDQ $0x03, AX
8291 JMP memmove_long_match_emit_encodeBetterBlockAsm12B
8292
8293two_bytes_match_emit_encodeBetterBlockAsm12B:
8294 MOVB $0xf0, (AX)
8295 MOVB BL, 1(AX)
8296 ADDQ $0x02, AX
8297 CMPL BX, $0x40
8298 JB memmove_match_emit_encodeBetterBlockAsm12B
8299 JMP memmove_long_match_emit_encodeBetterBlockAsm12B
8300
8301one_byte_match_emit_encodeBetterBlockAsm12B:
8302 SHLB $0x02, BL
8303 MOVB BL, (AX)
8304 ADDQ $0x01, AX
8305
8306memmove_match_emit_encodeBetterBlockAsm12B:
8307 LEAQ (AX)(R8*1), BX
8308
8309 // genMemMoveShort
8310 CMPQ R8, $0x04
8311 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
8312 CMPQ R8, $0x08
8313 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
8314 CMPQ R8, $0x10
8315 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
8316 CMPQ R8, $0x20
8317 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
8318 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
8319
8320emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
8321 MOVL (R9), R10
8322 MOVL R10, (AX)
8323 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8324
8325emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
8326 MOVL (R9), R10
8327 MOVL -4(R9)(R8*1), R9
8328 MOVL R10, (AX)
8329 MOVL R9, -4(AX)(R8*1)
8330 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8331
8332emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
8333 MOVQ (R9), R10
8334 MOVQ -8(R9)(R8*1), R9
8335 MOVQ R10, (AX)
8336 MOVQ R9, -8(AX)(R8*1)
8337 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8338
8339emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
8340 MOVOU (R9), X0
8341 MOVOU -16(R9)(R8*1), X1
8342 MOVOU X0, (AX)
8343 MOVOU X1, -16(AX)(R8*1)
8344 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8345
8346emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
8347 MOVOU (R9), X0
8348 MOVOU 16(R9), X1
8349 MOVOU -32(R9)(R8*1), X2
8350 MOVOU -16(R9)(R8*1), X3
8351 MOVOU X0, (AX)
8352 MOVOU X1, 16(AX)
8353 MOVOU X2, -32(AX)(R8*1)
8354 MOVOU X3, -16(AX)(R8*1)
8355
8356memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
8357 MOVQ BX, AX
8358 JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
8359
8360memmove_long_match_emit_encodeBetterBlockAsm12B:
8361 LEAQ (AX)(R8*1), BX
8362
8363 // genMemMoveLong
8364 MOVOU (R9), X0
8365 MOVOU 16(R9), X1
8366 MOVOU -32(R9)(R8*1), X2
8367 MOVOU -16(R9)(R8*1), X3
8368 MOVQ R8, R12
8369 SHRQ $0x05, R12
8370 MOVQ AX, R10
8371 ANDL $0x0000001f, R10
8372 MOVQ $0x00000040, R13
8373 SUBQ R10, R13
8374 DECQ R12
8375 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8376 LEAQ -32(R9)(R13*1), R10
8377 LEAQ -32(AX)(R13*1), R14
8378
8379emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
8380 MOVOU (R10), X4
8381 MOVOU 16(R10), X5
8382 MOVOA X4, (R14)
8383 MOVOA X5, 16(R14)
8384 ADDQ $0x20, R14
8385 ADDQ $0x20, R10
8386 ADDQ $0x20, R13
8387 DECQ R12
8388 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
8389
8390emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8391 MOVOU -32(R9)(R13*1), X4
8392 MOVOU -16(R9)(R13*1), X5
8393 MOVOA X4, -32(AX)(R13*1)
8394 MOVOA X5, -16(AX)(R13*1)
8395 ADDQ $0x20, R13
8396 CMPQ R8, R13
8397 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8398 MOVOU X0, (AX)
8399 MOVOU X1, 16(AX)
8400 MOVOU X2, -32(AX)(R8*1)
8401 MOVOU X3, -16(AX)(R8*1)
8402 MOVQ BX, AX
8403
8404emit_literal_done_match_emit_encodeBetterBlockAsm12B:
8405 ADDL R11, CX
8406 ADDL $0x04, R11
8407 MOVL CX, 12(SP)
8408
8409 // emitCopy
8410 CMPL R11, $0x40
8411 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
8412 CMPL DI, $0x00000800
8413 JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
8414 MOVL $0x00000001, BX
8415 LEAL 16(BX), BX
8416 MOVB DI, 1(AX)
8417 SHRL $0x08, DI
8418 SHLL $0x05, DI
8419 ORL DI, BX
8420 MOVB BL, (AX)
8421 ADDQ $0x02, AX
8422 SUBL $0x08, R11
8423
8424 // emitRepeat
8425 LEAL -4(R11), R11
8426 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8427 MOVL R11, BX
8428 LEAL -4(R11), R11
8429 CMPL BX, $0x08
8430 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8431 CMPL BX, $0x0c
8432 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8433 CMPL DI, $0x00000800
8434 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8435
8436cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8437 CMPL R11, $0x00000104
8438 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8439 LEAL -256(R11), R11
8440 MOVW $0x0019, (AX)
8441 MOVW R11, 2(AX)
8442 ADDQ $0x04, AX
8443 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8444
8445repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8446 LEAL -4(R11), R11
8447 MOVW $0x0015, (AX)
8448 MOVB R11, 2(AX)
8449 ADDQ $0x03, AX
8450 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8451
8452repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8453 SHLL $0x02, R11
8454 ORL $0x01, R11
8455 MOVW R11, (AX)
8456 ADDQ $0x02, AX
8457 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8458
8459repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8460 XORQ BX, BX
8461 LEAL 1(BX)(R11*4), R11
8462 MOVB DI, 1(AX)
8463 SARL $0x08, DI
8464 SHLL $0x05, DI
8465 ORL DI, R11
8466 MOVB R11, (AX)
8467 ADDQ $0x02, AX
8468 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8469
8470long_offset_short_match_nolit_encodeBetterBlockAsm12B:
8471 MOVB $0xee, (AX)
8472 MOVW DI, 1(AX)
8473 LEAL -60(R11), R11
8474 ADDQ $0x03, AX
8475
8476 // emitRepeat
8477 MOVL R11, BX
8478 LEAL -4(R11), R11
8479 CMPL BX, $0x08
8480 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8481 CMPL BX, $0x0c
8482 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8483 CMPL DI, $0x00000800
8484 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8485
8486cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8487 CMPL R11, $0x00000104
8488 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8489 LEAL -256(R11), R11
8490 MOVW $0x0019, (AX)
8491 MOVW R11, 2(AX)
8492 ADDQ $0x04, AX
8493 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8494
8495repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8496 LEAL -4(R11), R11
8497 MOVW $0x0015, (AX)
8498 MOVB R11, 2(AX)
8499 ADDQ $0x03, AX
8500 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8501
8502repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8503 SHLL $0x02, R11
8504 ORL $0x01, R11
8505 MOVW R11, (AX)
8506 ADDQ $0x02, AX
8507 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8508
8509repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8510 XORQ BX, BX
8511 LEAL 1(BX)(R11*4), R11
8512 MOVB DI, 1(AX)
8513 SARL $0x08, DI
8514 SHLL $0x05, DI
8515 ORL DI, R11
8516 MOVB R11, (AX)
8517 ADDQ $0x02, AX
8518 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8519
8520two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
8521 MOVL R11, BX
8522 SHLL $0x02, BX
8523 CMPL R11, $0x0c
8524 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
8525 CMPL DI, $0x00000800
8526 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
8527 LEAL -15(BX), BX
8528 MOVB DI, 1(AX)
8529 SHRL $0x08, DI
8530 SHLL $0x05, DI
8531 ORL DI, BX
8532 MOVB BL, (AX)
8533 ADDQ $0x02, AX
8534 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8535
8536emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
8537 LEAL -2(BX), BX
8538 MOVB BL, (AX)
8539 MOVW DI, 1(AX)
8540 ADDQ $0x03, AX
8541 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8542
8543match_is_repeat_encodeBetterBlockAsm12B:
8544 MOVL 12(SP), BX
8545 CMPL BX, SI
8546 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
8547 MOVL SI, R8
8548 MOVL SI, 12(SP)
8549 LEAQ (DX)(BX*1), R9
8550 SUBL BX, R8
8551 LEAL -1(R8), BX
8552 CMPL BX, $0x3c
8553 JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B
8554 CMPL BX, $0x00000100
8555 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
8556 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B
8557
8558three_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
8559 MOVB $0xf4, (AX)
8560 MOVW BX, 1(AX)
8561 ADDQ $0x03, AX
8562 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
8563
8564two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
8565 MOVB $0xf0, (AX)
8566 MOVB BL, 1(AX)
8567 ADDQ $0x02, AX
8568 CMPL BX, $0x40
8569 JB memmove_match_emit_repeat_encodeBetterBlockAsm12B
8570 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
8571
8572one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
8573 SHLB $0x02, BL
8574 MOVB BL, (AX)
8575 ADDQ $0x01, AX
8576
8577memmove_match_emit_repeat_encodeBetterBlockAsm12B:
8578 LEAQ (AX)(R8*1), BX
8579
8580 // genMemMoveShort
8581 CMPQ R8, $0x04
8582 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
8583 CMPQ R8, $0x08
8584 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
8585 CMPQ R8, $0x10
8586 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
8587 CMPQ R8, $0x20
8588 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
8589 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
8590
8591emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
8592 MOVL (R9), R10
8593 MOVL R10, (AX)
8594 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8595
8596emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
8597 MOVL (R9), R10
8598 MOVL -4(R9)(R8*1), R9
8599 MOVL R10, (AX)
8600 MOVL R9, -4(AX)(R8*1)
8601 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8602
8603emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
8604 MOVQ (R9), R10
8605 MOVQ -8(R9)(R8*1), R9
8606 MOVQ R10, (AX)
8607 MOVQ R9, -8(AX)(R8*1)
8608 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8609
8610emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
8611 MOVOU (R9), X0
8612 MOVOU -16(R9)(R8*1), X1
8613 MOVOU X0, (AX)
8614 MOVOU X1, -16(AX)(R8*1)
8615 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8616
8617emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
8618 MOVOU (R9), X0
8619 MOVOU 16(R9), X1
8620 MOVOU -32(R9)(R8*1), X2
8621 MOVOU -16(R9)(R8*1), X3
8622 MOVOU X0, (AX)
8623 MOVOU X1, 16(AX)
8624 MOVOU X2, -32(AX)(R8*1)
8625 MOVOU X3, -16(AX)(R8*1)
8626
8627memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
8628 MOVQ BX, AX
8629 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
8630
8631memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
8632 LEAQ (AX)(R8*1), BX
8633
8634 // genMemMoveLong
8635 MOVOU (R9), X0
8636 MOVOU 16(R9), X1
8637 MOVOU -32(R9)(R8*1), X2
8638 MOVOU -16(R9)(R8*1), X3
8639 MOVQ R8, R12
8640 SHRQ $0x05, R12
8641 MOVQ AX, R10
8642 ANDL $0x0000001f, R10
8643 MOVQ $0x00000040, R13
8644 SUBQ R10, R13
8645 DECQ R12
8646 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8647 LEAQ -32(R9)(R13*1), R10
8648 LEAQ -32(AX)(R13*1), R14
8649
8650emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
8651 MOVOU (R10), X4
8652 MOVOU 16(R10), X5
8653 MOVOA X4, (R14)
8654 MOVOA X5, 16(R14)
8655 ADDQ $0x20, R14
8656 ADDQ $0x20, R10
8657 ADDQ $0x20, R13
8658 DECQ R12
8659 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
8660
8661emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8662 MOVOU -32(R9)(R13*1), X4
8663 MOVOU -16(R9)(R13*1), X5
8664 MOVOA X4, -32(AX)(R13*1)
8665 MOVOA X5, -16(AX)(R13*1)
8666 ADDQ $0x20, R13
8667 CMPQ R8, R13
8668 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8669 MOVOU X0, (AX)
8670 MOVOU X1, 16(AX)
8671 MOVOU X2, -32(AX)(R8*1)
8672 MOVOU X3, -16(AX)(R8*1)
8673 MOVQ BX, AX
8674
8675emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
8676 ADDL R11, CX
8677 ADDL $0x04, R11
8678 MOVL CX, 12(SP)
8679
8680 // emitRepeat
8681 MOVL R11, BX
8682 LEAL -4(R11), R11
8683 CMPL BX, $0x08
8684 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
8685 CMPL BX, $0x0c
8686 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
8687 CMPL DI, $0x00000800
8688 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
8689
8690cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
8691 CMPL R11, $0x00000104
8692 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
8693 LEAL -256(R11), R11
8694 MOVW $0x0019, (AX)
8695 MOVW R11, 2(AX)
8696 ADDQ $0x04, AX
8697 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8698
8699repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
8700 LEAL -4(R11), R11
8701 MOVW $0x0015, (AX)
8702 MOVB R11, 2(AX)
8703 ADDQ $0x03, AX
8704 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8705
8706repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
8707 SHLL $0x02, R11
8708 ORL $0x01, R11
8709 MOVW R11, (AX)
8710 ADDQ $0x02, AX
8711 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8712
8713repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
8714 XORQ BX, BX
8715 LEAL 1(BX)(R11*4), R11
8716 MOVB DI, 1(AX)
8717 SARL $0x08, DI
8718 SHLL $0x05, DI
8719 ORL DI, R11
8720 MOVB R11, (AX)
8721 ADDQ $0x02, AX
8722
8723match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
8724 CMPL CX, 8(SP)
8725 JAE emit_remainder_encodeBetterBlockAsm12B
8726 CMPQ AX, (SP)
8727 JB match_nolit_dst_ok_encodeBetterBlockAsm12B
8728 MOVQ $0x00000000, ret+48(FP)
8729 RET
8730
8731match_nolit_dst_ok_encodeBetterBlockAsm12B:
8732 MOVQ $0x0000cf1bbcdcbf9b, BX
8733 MOVQ $0x9e3779b1, DI
8734 LEAQ 1(SI), SI
8735 LEAQ -2(CX), R8
8736 MOVQ (DX)(SI*1), R9
8737 MOVQ 1(DX)(SI*1), R10
8738 MOVQ (DX)(R8*1), R11
8739 MOVQ 1(DX)(R8*1), R12
8740 SHLQ $0x10, R9
8741 IMULQ BX, R9
8742 SHRQ $0x32, R9
8743 SHLQ $0x20, R10
8744 IMULQ DI, R10
8745 SHRQ $0x34, R10
8746 SHLQ $0x10, R11
8747 IMULQ BX, R11
8748 SHRQ $0x32, R11
8749 SHLQ $0x20, R12
8750 IMULQ DI, R12
8751 SHRQ $0x34, R12
8752 LEAQ 1(SI), DI
8753 LEAQ 1(R8), R13
8754 MOVL SI, 24(SP)(R9*4)
8755 MOVL R8, 24(SP)(R11*4)
8756 MOVL DI, 65560(SP)(R10*4)
8757 MOVL R13, 65560(SP)(R12*4)
8758 LEAQ 1(R8)(SI*1), DI
8759 SHRQ $0x01, DI
8760 ADDQ $0x01, SI
8761 SUBQ $0x01, R8
8762
8763index_loop_encodeBetterBlockAsm12B:
8764 CMPQ DI, R8
8765 JAE search_loop_encodeBetterBlockAsm12B
8766 MOVQ (DX)(SI*1), R9
8767 MOVQ (DX)(DI*1), R10
8768 SHLQ $0x10, R9
8769 IMULQ BX, R9
8770 SHRQ $0x32, R9
8771 SHLQ $0x10, R10
8772 IMULQ BX, R10
8773 SHRQ $0x32, R10
8774 MOVL SI, 24(SP)(R9*4)
8775 MOVL DI, 24(SP)(R10*4)
8776 ADDQ $0x02, SI
8777 ADDQ $0x02, DI
8778 JMP index_loop_encodeBetterBlockAsm12B
8779
8780emit_remainder_encodeBetterBlockAsm12B:
8781 MOVQ src_len+32(FP), CX
8782 SUBL 12(SP), CX
8783 LEAQ 3(AX)(CX*1), CX
8784 CMPQ CX, (SP)
8785 JB emit_remainder_ok_encodeBetterBlockAsm12B
8786 MOVQ $0x00000000, ret+48(FP)
8787 RET
8788
8789emit_remainder_ok_encodeBetterBlockAsm12B:
8790 MOVQ src_len+32(FP), CX
8791 MOVL 12(SP), BX
8792 CMPL BX, CX
8793 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
8794 MOVL CX, SI
8795 MOVL CX, 12(SP)
8796 LEAQ (DX)(BX*1), CX
8797 SUBL BX, SI
8798 LEAL -1(SI), DX
8799 CMPL DX, $0x3c
8800 JB one_byte_emit_remainder_encodeBetterBlockAsm12B
8801 CMPL DX, $0x00000100
8802 JB two_bytes_emit_remainder_encodeBetterBlockAsm12B
8803 JB three_bytes_emit_remainder_encodeBetterBlockAsm12B
8804
8805three_bytes_emit_remainder_encodeBetterBlockAsm12B:
8806 MOVB $0xf4, (AX)
8807 MOVW DX, 1(AX)
8808 ADDQ $0x03, AX
8809 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
8810
8811two_bytes_emit_remainder_encodeBetterBlockAsm12B:
8812 MOVB $0xf0, (AX)
8813 MOVB DL, 1(AX)
8814 ADDQ $0x02, AX
8815 CMPL DX, $0x40
8816 JB memmove_emit_remainder_encodeBetterBlockAsm12B
8817 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
8818
8819one_byte_emit_remainder_encodeBetterBlockAsm12B:
8820 SHLB $0x02, DL
8821 MOVB DL, (AX)
8822 ADDQ $0x01, AX
8823
8824memmove_emit_remainder_encodeBetterBlockAsm12B:
8825 LEAQ (AX)(SI*1), DX
8826 MOVL SI, BX
8827
8828 // genMemMoveShort
8829 CMPQ BX, $0x03
8830 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
8831 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
8832 CMPQ BX, $0x08
8833 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
8834 CMPQ BX, $0x10
8835 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
8836 CMPQ BX, $0x20
8837 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
8838 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
8839
8840emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
8841 MOVB (CX), SI
8842 MOVB -1(CX)(BX*1), CL
8843 MOVB SI, (AX)
8844 MOVB CL, -1(AX)(BX*1)
8845 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8846
8847emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
8848 MOVW (CX), SI
8849 MOVB 2(CX), CL
8850 MOVW SI, (AX)
8851 MOVB CL, 2(AX)
8852 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8853
8854emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
8855 MOVL (CX), SI
8856 MOVL -4(CX)(BX*1), CX
8857 MOVL SI, (AX)
8858 MOVL CX, -4(AX)(BX*1)
8859 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8860
8861emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
8862 MOVQ (CX), SI
8863 MOVQ -8(CX)(BX*1), CX
8864 MOVQ SI, (AX)
8865 MOVQ CX, -8(AX)(BX*1)
8866 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8867
8868emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
8869 MOVOU (CX), X0
8870 MOVOU -16(CX)(BX*1), X1
8871 MOVOU X0, (AX)
8872 MOVOU X1, -16(AX)(BX*1)
8873 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8874
8875emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
8876 MOVOU (CX), X0
8877 MOVOU 16(CX), X1
8878 MOVOU -32(CX)(BX*1), X2
8879 MOVOU -16(CX)(BX*1), X3
8880 MOVOU X0, (AX)
8881 MOVOU X1, 16(AX)
8882 MOVOU X2, -32(AX)(BX*1)
8883 MOVOU X3, -16(AX)(BX*1)
8884
8885memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
8886 MOVQ DX, AX
8887 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
8888
8889memmove_long_emit_remainder_encodeBetterBlockAsm12B:
8890 LEAQ (AX)(SI*1), DX
8891 MOVL SI, BX
8892
8893 // genMemMoveLong
8894 MOVOU (CX), X0
8895 MOVOU 16(CX), X1
8896 MOVOU -32(CX)(BX*1), X2
8897 MOVOU -16(CX)(BX*1), X3
8898 MOVQ BX, DI
8899 SHRQ $0x05, DI
8900 MOVQ AX, SI
8901 ANDL $0x0000001f, SI
8902 MOVQ $0x00000040, R8
8903 SUBQ SI, R8
8904 DECQ DI
8905 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8906 LEAQ -32(CX)(R8*1), SI
8907 LEAQ -32(AX)(R8*1), R9
8908
8909emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
8910 MOVOU (SI), X4
8911 MOVOU 16(SI), X5
8912 MOVOA X4, (R9)
8913 MOVOA X5, 16(R9)
8914 ADDQ $0x20, R9
8915 ADDQ $0x20, SI
8916 ADDQ $0x20, R8
8917 DECQ DI
8918 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
8919
8920emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8921 MOVOU -32(CX)(R8*1), X4
8922 MOVOU -16(CX)(R8*1), X5
8923 MOVOA X4, -32(AX)(R8*1)
8924 MOVOA X5, -16(AX)(R8*1)
8925 ADDQ $0x20, R8
8926 CMPQ BX, R8
8927 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8928 MOVOU X0, (AX)
8929 MOVOU X1, 16(AX)
8930 MOVOU X2, -32(AX)(BX*1)
8931 MOVOU X3, -16(AX)(BX*1)
8932 MOVQ DX, AX
8933
8934emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
8935 MOVQ dst_base+0(FP), CX
8936 SUBQ CX, AX
8937 MOVQ AX, ret+48(FP)
8938 RET
8939
8940// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
8941// Requires: BMI, SSE2
8942TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
8943 MOVQ dst_base+0(FP), AX
8944 MOVQ $0x000000a0, CX
8945 LEAQ 24(SP), DX
8946 PXOR X0, X0
8947
8948zero_loop_encodeBetterBlockAsm10B:
8949 MOVOU X0, (DX)
8950 MOVOU X0, 16(DX)
8951 MOVOU X0, 32(DX)
8952 MOVOU X0, 48(DX)
8953 MOVOU X0, 64(DX)
8954 MOVOU X0, 80(DX)
8955 MOVOU X0, 96(DX)
8956 MOVOU X0, 112(DX)
8957 ADDQ $0x80, DX
8958 DECQ CX
8959 JNZ zero_loop_encodeBetterBlockAsm10B
8960 MOVL $0x00000000, 12(SP)
8961 MOVQ src_len+32(FP), CX
8962 LEAQ -6(CX), DX
8963 LEAQ -8(CX), BX
8964 MOVL BX, 8(SP)
8965 SHRQ $0x05, CX
8966 SUBL CX, DX
8967 LEAQ (AX)(DX*1), DX
8968 MOVQ DX, (SP)
8969 MOVL $0x00000001, CX
8970 MOVL $0x00000000, 16(SP)
8971 MOVQ src_base+24(FP), DX
8972
8973search_loop_encodeBetterBlockAsm10B:
8974 MOVL CX, BX
8975 SUBL 12(SP), BX
8976 SHRL $0x05, BX
8977 LEAL 1(CX)(BX*1), BX
8978 CMPL BX, 8(SP)
8979 JAE emit_remainder_encodeBetterBlockAsm10B
8980 MOVQ (DX)(CX*1), SI
8981 MOVL BX, 20(SP)
8982 MOVQ $0x0000cf1bbcdcbf9b, R8
8983 MOVQ $0x9e3779b1, BX
8984 MOVQ SI, R9
8985 MOVQ SI, R10
8986 SHLQ $0x10, R9
8987 IMULQ R8, R9
8988 SHRQ $0x34, R9
8989 SHLQ $0x20, R10
8990 IMULQ BX, R10
8991 SHRQ $0x36, R10
8992 MOVL 24(SP)(R9*4), BX
8993 MOVL 16408(SP)(R10*4), DI
8994 MOVL CX, 24(SP)(R9*4)
8995 MOVL CX, 16408(SP)(R10*4)
8996 MOVQ (DX)(BX*1), R9
8997 MOVQ (DX)(DI*1), R10
8998 CMPQ R9, SI
8999 JEQ candidate_match_encodeBetterBlockAsm10B
9000 CMPQ R10, SI
9001 JNE no_short_found_encodeBetterBlockAsm10B
9002 MOVL DI, BX
9003 JMP candidate_match_encodeBetterBlockAsm10B
9004
9005no_short_found_encodeBetterBlockAsm10B:
9006 CMPL R9, SI
9007 JEQ candidate_match_encodeBetterBlockAsm10B
9008 CMPL R10, SI
9009 JEQ candidateS_match_encodeBetterBlockAsm10B
9010 MOVL 20(SP), CX
9011 JMP search_loop_encodeBetterBlockAsm10B
9012
9013candidateS_match_encodeBetterBlockAsm10B:
9014 SHRQ $0x08, SI
9015 MOVQ SI, R9
9016 SHLQ $0x10, R9
9017 IMULQ R8, R9
9018 SHRQ $0x34, R9
9019 MOVL 24(SP)(R9*4), BX
9020 INCL CX
9021 MOVL CX, 24(SP)(R9*4)
9022 CMPL (DX)(BX*1), SI
9023 JEQ candidate_match_encodeBetterBlockAsm10B
9024 DECL CX
9025 MOVL DI, BX
9026
9027candidate_match_encodeBetterBlockAsm10B:
9028 MOVL 12(SP), SI
9029 TESTL BX, BX
9030 JZ match_extend_back_end_encodeBetterBlockAsm10B
9031
9032match_extend_back_loop_encodeBetterBlockAsm10B:
9033 CMPL CX, SI
9034 JBE match_extend_back_end_encodeBetterBlockAsm10B
9035 MOVB -1(DX)(BX*1), DI
9036 MOVB -1(DX)(CX*1), R8
9037 CMPB DI, R8
9038 JNE match_extend_back_end_encodeBetterBlockAsm10B
9039 LEAL -1(CX), CX
9040 DECL BX
9041 JZ match_extend_back_end_encodeBetterBlockAsm10B
9042 JMP match_extend_back_loop_encodeBetterBlockAsm10B
9043
9044match_extend_back_end_encodeBetterBlockAsm10B:
9045 MOVL CX, SI
9046 SUBL 12(SP), SI
9047 LEAQ 3(AX)(SI*1), SI
9048 CMPQ SI, (SP)
9049 JB match_dst_size_check_encodeBetterBlockAsm10B
9050 MOVQ $0x00000000, ret+48(FP)
9051 RET
9052
9053match_dst_size_check_encodeBetterBlockAsm10B:
9054 MOVL CX, SI
9055 ADDL $0x04, CX
9056 ADDL $0x04, BX
9057 MOVQ src_len+32(FP), DI
9058 SUBL CX, DI
9059 LEAQ (DX)(CX*1), R8
9060 LEAQ (DX)(BX*1), R9
9061
9062 // matchLen
9063 XORL R11, R11
9064
9065matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B:
9066 CMPL DI, $0x10
9067 JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B
9068 MOVQ (R8)(R11*1), R10
9069 MOVQ 8(R8)(R11*1), R12
9070 XORQ (R9)(R11*1), R10
9071 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
9072 XORQ 8(R9)(R11*1), R12
9073 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B
9074 LEAL -16(DI), DI
9075 LEAL 16(R11), R11
9076 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B
9077
9078matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B:
9079#ifdef GOAMD64_v3
9080 TZCNTQ R12, R12
9081
9082#else
9083 BSFQ R12, R12
9084
9085#endif
9086 SARQ $0x03, R12
9087 LEAL 8(R11)(R12*1), R11
9088 JMP match_nolit_end_encodeBetterBlockAsm10B
9089
9090matchlen_match8_match_nolit_encodeBetterBlockAsm10B:
9091 CMPL DI, $0x08
9092 JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B
9093 MOVQ (R8)(R11*1), R10
9094 XORQ (R9)(R11*1), R10
9095 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
9096 LEAL -8(DI), DI
9097 LEAL 8(R11), R11
9098 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B
9099
9100matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B:
9101#ifdef GOAMD64_v3
9102 TZCNTQ R10, R10
9103
9104#else
9105 BSFQ R10, R10
9106
9107#endif
9108 SARQ $0x03, R10
9109 LEAL (R11)(R10*1), R11
9110 JMP match_nolit_end_encodeBetterBlockAsm10B
9111
9112matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
9113 CMPL DI, $0x04
9114 JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B
9115 MOVL (R8)(R11*1), R10
9116 CMPL (R9)(R11*1), R10
9117 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
9118 LEAL -4(DI), DI
9119 LEAL 4(R11), R11
9120
9121matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
9122 CMPL DI, $0x01
9123 JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
9124 JB match_nolit_end_encodeBetterBlockAsm10B
9125 MOVW (R8)(R11*1), R10
9126 CMPW (R9)(R11*1), R10
9127 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
9128 LEAL 2(R11), R11
9129 SUBL $0x02, DI
9130 JZ match_nolit_end_encodeBetterBlockAsm10B
9131
9132matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
9133 MOVB (R8)(R11*1), R10
9134 CMPB (R9)(R11*1), R10
9135 JNE match_nolit_end_encodeBetterBlockAsm10B
9136 LEAL 1(R11), R11
9137
9138match_nolit_end_encodeBetterBlockAsm10B:
9139 MOVL CX, DI
9140 SUBL BX, DI
9141
9142 // Check if repeat
9143 CMPL 16(SP), DI
9144 JEQ match_is_repeat_encodeBetterBlockAsm10B
9145 MOVL DI, 16(SP)
9146 MOVL 12(SP), BX
9147 CMPL BX, SI
9148 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
9149 MOVL SI, R8
9150 MOVL SI, 12(SP)
9151 LEAQ (DX)(BX*1), R9
9152 SUBL BX, R8
9153 LEAL -1(R8), BX
9154 CMPL BX, $0x3c
9155 JB one_byte_match_emit_encodeBetterBlockAsm10B
9156 CMPL BX, $0x00000100
9157 JB two_bytes_match_emit_encodeBetterBlockAsm10B
9158 JB three_bytes_match_emit_encodeBetterBlockAsm10B
9159
9160three_bytes_match_emit_encodeBetterBlockAsm10B:
9161 MOVB $0xf4, (AX)
9162 MOVW BX, 1(AX)
9163 ADDQ $0x03, AX
9164 JMP memmove_long_match_emit_encodeBetterBlockAsm10B
9165
9166two_bytes_match_emit_encodeBetterBlockAsm10B:
9167 MOVB $0xf0, (AX)
9168 MOVB BL, 1(AX)
9169 ADDQ $0x02, AX
9170 CMPL BX, $0x40
9171 JB memmove_match_emit_encodeBetterBlockAsm10B
9172 JMP memmove_long_match_emit_encodeBetterBlockAsm10B
9173
9174one_byte_match_emit_encodeBetterBlockAsm10B:
9175 SHLB $0x02, BL
9176 MOVB BL, (AX)
9177 ADDQ $0x01, AX
9178
9179memmove_match_emit_encodeBetterBlockAsm10B:
9180 LEAQ (AX)(R8*1), BX
9181
9182 // genMemMoveShort
9183 CMPQ R8, $0x04
9184 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
9185 CMPQ R8, $0x08
9186 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
9187 CMPQ R8, $0x10
9188 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
9189 CMPQ R8, $0x20
9190 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
9191 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
9192
9193emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
9194 MOVL (R9), R10
9195 MOVL R10, (AX)
9196 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9197
9198emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
9199 MOVL (R9), R10
9200 MOVL -4(R9)(R8*1), R9
9201 MOVL R10, (AX)
9202 MOVL R9, -4(AX)(R8*1)
9203 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9204
9205emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
9206 MOVQ (R9), R10
9207 MOVQ -8(R9)(R8*1), R9
9208 MOVQ R10, (AX)
9209 MOVQ R9, -8(AX)(R8*1)
9210 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9211
9212emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
9213 MOVOU (R9), X0
9214 MOVOU -16(R9)(R8*1), X1
9215 MOVOU X0, (AX)
9216 MOVOU X1, -16(AX)(R8*1)
9217 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9218
9219emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
9220 MOVOU (R9), X0
9221 MOVOU 16(R9), X1
9222 MOVOU -32(R9)(R8*1), X2
9223 MOVOU -16(R9)(R8*1), X3
9224 MOVOU X0, (AX)
9225 MOVOU X1, 16(AX)
9226 MOVOU X2, -32(AX)(R8*1)
9227 MOVOU X3, -16(AX)(R8*1)
9228
9229memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
9230 MOVQ BX, AX
9231 JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
9232
9233memmove_long_match_emit_encodeBetterBlockAsm10B:
9234 LEAQ (AX)(R8*1), BX
9235
9236 // genMemMoveLong
9237 MOVOU (R9), X0
9238 MOVOU 16(R9), X1
9239 MOVOU -32(R9)(R8*1), X2
9240 MOVOU -16(R9)(R8*1), X3
9241 MOVQ R8, R12
9242 SHRQ $0x05, R12
9243 MOVQ AX, R10
9244 ANDL $0x0000001f, R10
9245 MOVQ $0x00000040, R13
9246 SUBQ R10, R13
9247 DECQ R12
9248 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9249 LEAQ -32(R9)(R13*1), R10
9250 LEAQ -32(AX)(R13*1), R14
9251
9252emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
9253 MOVOU (R10), X4
9254 MOVOU 16(R10), X5
9255 MOVOA X4, (R14)
9256 MOVOA X5, 16(R14)
9257 ADDQ $0x20, R14
9258 ADDQ $0x20, R10
9259 ADDQ $0x20, R13
9260 DECQ R12
9261 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
9262
9263emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
9264 MOVOU -32(R9)(R13*1), X4
9265 MOVOU -16(R9)(R13*1), X5
9266 MOVOA X4, -32(AX)(R13*1)
9267 MOVOA X5, -16(AX)(R13*1)
9268 ADDQ $0x20, R13
9269 CMPQ R8, R13
9270 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9271 MOVOU X0, (AX)
9272 MOVOU X1, 16(AX)
9273 MOVOU X2, -32(AX)(R8*1)
9274 MOVOU X3, -16(AX)(R8*1)
9275 MOVQ BX, AX
9276
9277emit_literal_done_match_emit_encodeBetterBlockAsm10B:
9278 ADDL R11, CX
9279 ADDL $0x04, R11
9280 MOVL CX, 12(SP)
9281
9282 // emitCopy
9283 CMPL R11, $0x40
9284 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
9285 CMPL DI, $0x00000800
9286 JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
9287 MOVL $0x00000001, BX
9288 LEAL 16(BX), BX
9289 MOVB DI, 1(AX)
9290 SHRL $0x08, DI
9291 SHLL $0x05, DI
9292 ORL DI, BX
9293 MOVB BL, (AX)
9294 ADDQ $0x02, AX
9295 SUBL $0x08, R11
9296
9297 // emitRepeat
9298 LEAL -4(R11), R11
9299 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9300 MOVL R11, BX
9301 LEAL -4(R11), R11
9302 CMPL BX, $0x08
9303 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9304 CMPL BX, $0x0c
9305 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9306 CMPL DI, $0x00000800
9307 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9308
9309cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
9310 CMPL R11, $0x00000104
9311 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9312 LEAL -256(R11), R11
9313 MOVW $0x0019, (AX)
9314 MOVW R11, 2(AX)
9315 ADDQ $0x04, AX
9316 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9317
9318repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
9319 LEAL -4(R11), R11
9320 MOVW $0x0015, (AX)
9321 MOVB R11, 2(AX)
9322 ADDQ $0x03, AX
9323 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9324
9325repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
9326 SHLL $0x02, R11
9327 ORL $0x01, R11
9328 MOVW R11, (AX)
9329 ADDQ $0x02, AX
9330 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9331
9332repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
9333 XORQ BX, BX
9334 LEAL 1(BX)(R11*4), R11
9335 MOVB DI, 1(AX)
9336 SARL $0x08, DI
9337 SHLL $0x05, DI
9338 ORL DI, R11
9339 MOVB R11, (AX)
9340 ADDQ $0x02, AX
9341 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9342
9343long_offset_short_match_nolit_encodeBetterBlockAsm10B:
9344 MOVB $0xee, (AX)
9345 MOVW DI, 1(AX)
9346 LEAL -60(R11), R11
9347 ADDQ $0x03, AX
9348
9349 // emitRepeat
9350 MOVL R11, BX
9351 LEAL -4(R11), R11
9352 CMPL BX, $0x08
9353 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9354 CMPL BX, $0x0c
9355 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9356 CMPL DI, $0x00000800
9357 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9358
9359cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
9360 CMPL R11, $0x00000104
9361 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9362 LEAL -256(R11), R11
9363 MOVW $0x0019, (AX)
9364 MOVW R11, 2(AX)
9365 ADDQ $0x04, AX
9366 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9367
9368repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
9369 LEAL -4(R11), R11
9370 MOVW $0x0015, (AX)
9371 MOVB R11, 2(AX)
9372 ADDQ $0x03, AX
9373 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9374
9375repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
9376 SHLL $0x02, R11
9377 ORL $0x01, R11
9378 MOVW R11, (AX)
9379 ADDQ $0x02, AX
9380 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9381
9382repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
9383 XORQ BX, BX
9384 LEAL 1(BX)(R11*4), R11
9385 MOVB DI, 1(AX)
9386 SARL $0x08, DI
9387 SHLL $0x05, DI
9388 ORL DI, R11
9389 MOVB R11, (AX)
9390 ADDQ $0x02, AX
9391 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9392
9393two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
9394 MOVL R11, BX
9395 SHLL $0x02, BX
9396 CMPL R11, $0x0c
9397 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
9398 CMPL DI, $0x00000800
9399 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
9400 LEAL -15(BX), BX
9401 MOVB DI, 1(AX)
9402 SHRL $0x08, DI
9403 SHLL $0x05, DI
9404 ORL DI, BX
9405 MOVB BL, (AX)
9406 ADDQ $0x02, AX
9407 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9408
9409emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
9410 LEAL -2(BX), BX
9411 MOVB BL, (AX)
9412 MOVW DI, 1(AX)
9413 ADDQ $0x03, AX
9414 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9415
9416match_is_repeat_encodeBetterBlockAsm10B:
9417 MOVL 12(SP), BX
9418 CMPL BX, SI
9419 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
9420 MOVL SI, R8
9421 MOVL SI, 12(SP)
9422 LEAQ (DX)(BX*1), R9
9423 SUBL BX, R8
9424 LEAL -1(R8), BX
9425 CMPL BX, $0x3c
9426 JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B
9427 CMPL BX, $0x00000100
9428 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
9429 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B
9430
9431three_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
9432 MOVB $0xf4, (AX)
9433 MOVW BX, 1(AX)
9434 ADDQ $0x03, AX
9435 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
9436
9437two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
9438 MOVB $0xf0, (AX)
9439 MOVB BL, 1(AX)
9440 ADDQ $0x02, AX
9441 CMPL BX, $0x40
9442 JB memmove_match_emit_repeat_encodeBetterBlockAsm10B
9443 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
9444
9445one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
9446 SHLB $0x02, BL
9447 MOVB BL, (AX)
9448 ADDQ $0x01, AX
9449
9450memmove_match_emit_repeat_encodeBetterBlockAsm10B:
9451 LEAQ (AX)(R8*1), BX
9452
9453 // genMemMoveShort
9454 CMPQ R8, $0x04
9455 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
9456 CMPQ R8, $0x08
9457 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
9458 CMPQ R8, $0x10
9459 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
9460 CMPQ R8, $0x20
9461 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
9462 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
9463
9464emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
9465 MOVL (R9), R10
9466 MOVL R10, (AX)
9467 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9468
9469emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
9470 MOVL (R9), R10
9471 MOVL -4(R9)(R8*1), R9
9472 MOVL R10, (AX)
9473 MOVL R9, -4(AX)(R8*1)
9474 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9475
9476emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
9477 MOVQ (R9), R10
9478 MOVQ -8(R9)(R8*1), R9
9479 MOVQ R10, (AX)
9480 MOVQ R9, -8(AX)(R8*1)
9481 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9482
9483emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
9484 MOVOU (R9), X0
9485 MOVOU -16(R9)(R8*1), X1
9486 MOVOU X0, (AX)
9487 MOVOU X1, -16(AX)(R8*1)
9488 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9489
9490emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
9491 MOVOU (R9), X0
9492 MOVOU 16(R9), X1
9493 MOVOU -32(R9)(R8*1), X2
9494 MOVOU -16(R9)(R8*1), X3
9495 MOVOU X0, (AX)
9496 MOVOU X1, 16(AX)
9497 MOVOU X2, -32(AX)(R8*1)
9498 MOVOU X3, -16(AX)(R8*1)
9499
9500memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
9501 MOVQ BX, AX
9502 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
9503
9504memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
9505 LEAQ (AX)(R8*1), BX
9506
9507 // genMemMoveLong
9508 MOVOU (R9), X0
9509 MOVOU 16(R9), X1
9510 MOVOU -32(R9)(R8*1), X2
9511 MOVOU -16(R9)(R8*1), X3
9512 MOVQ R8, R12
9513 SHRQ $0x05, R12
9514 MOVQ AX, R10
9515 ANDL $0x0000001f, R10
9516 MOVQ $0x00000040, R13
9517 SUBQ R10, R13
9518 DECQ R12
9519 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9520 LEAQ -32(R9)(R13*1), R10
9521 LEAQ -32(AX)(R13*1), R14
9522
9523emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
9524 MOVOU (R10), X4
9525 MOVOU 16(R10), X5
9526 MOVOA X4, (R14)
9527 MOVOA X5, 16(R14)
9528 ADDQ $0x20, R14
9529 ADDQ $0x20, R10
9530 ADDQ $0x20, R13
9531 DECQ R12
9532 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
9533
9534emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
9535 MOVOU -32(R9)(R13*1), X4
9536 MOVOU -16(R9)(R13*1), X5
9537 MOVOA X4, -32(AX)(R13*1)
9538 MOVOA X5, -16(AX)(R13*1)
9539 ADDQ $0x20, R13
9540 CMPQ R8, R13
9541 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9542 MOVOU X0, (AX)
9543 MOVOU X1, 16(AX)
9544 MOVOU X2, -32(AX)(R8*1)
9545 MOVOU X3, -16(AX)(R8*1)
9546 MOVQ BX, AX
9547
9548emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
9549 ADDL R11, CX
9550 ADDL $0x04, R11
9551 MOVL CX, 12(SP)
9552
9553 // emitRepeat
9554 MOVL R11, BX
9555 LEAL -4(R11), R11
9556 CMPL BX, $0x08
9557 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
9558 CMPL BX, $0x0c
9559 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
9560 CMPL DI, $0x00000800
9561 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
9562
9563cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
9564 CMPL R11, $0x00000104
9565 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
9566 LEAL -256(R11), R11
9567 MOVW $0x0019, (AX)
9568 MOVW R11, 2(AX)
9569 ADDQ $0x04, AX
9570 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9571
9572repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
9573 LEAL -4(R11), R11
9574 MOVW $0x0015, (AX)
9575 MOVB R11, 2(AX)
9576 ADDQ $0x03, AX
9577 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9578
9579repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
9580 SHLL $0x02, R11
9581 ORL $0x01, R11
9582 MOVW R11, (AX)
9583 ADDQ $0x02, AX
9584 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9585
9586repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
9587 XORQ BX, BX
9588 LEAL 1(BX)(R11*4), R11
9589 MOVB DI, 1(AX)
9590 SARL $0x08, DI
9591 SHLL $0x05, DI
9592 ORL DI, R11
9593 MOVB R11, (AX)
9594 ADDQ $0x02, AX
9595
9596match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
9597 CMPL CX, 8(SP)
9598 JAE emit_remainder_encodeBetterBlockAsm10B
9599 CMPQ AX, (SP)
9600 JB match_nolit_dst_ok_encodeBetterBlockAsm10B
9601 MOVQ $0x00000000, ret+48(FP)
9602 RET
9603
9604match_nolit_dst_ok_encodeBetterBlockAsm10B:
9605 MOVQ $0x0000cf1bbcdcbf9b, BX
9606 MOVQ $0x9e3779b1, DI
9607 LEAQ 1(SI), SI
9608 LEAQ -2(CX), R8
9609 MOVQ (DX)(SI*1), R9
9610 MOVQ 1(DX)(SI*1), R10
9611 MOVQ (DX)(R8*1), R11
9612 MOVQ 1(DX)(R8*1), R12
9613 SHLQ $0x10, R9
9614 IMULQ BX, R9
9615 SHRQ $0x34, R9
9616 SHLQ $0x20, R10
9617 IMULQ DI, R10
9618 SHRQ $0x36, R10
9619 SHLQ $0x10, R11
9620 IMULQ BX, R11
9621 SHRQ $0x34, R11
9622 SHLQ $0x20, R12
9623 IMULQ DI, R12
9624 SHRQ $0x36, R12
9625 LEAQ 1(SI), DI
9626 LEAQ 1(R8), R13
9627 MOVL SI, 24(SP)(R9*4)
9628 MOVL R8, 24(SP)(R11*4)
9629 MOVL DI, 16408(SP)(R10*4)
9630 MOVL R13, 16408(SP)(R12*4)
9631 LEAQ 1(R8)(SI*1), DI
9632 SHRQ $0x01, DI
9633 ADDQ $0x01, SI
9634 SUBQ $0x01, R8
9635
9636index_loop_encodeBetterBlockAsm10B:
9637 CMPQ DI, R8
9638 JAE search_loop_encodeBetterBlockAsm10B
9639 MOVQ (DX)(SI*1), R9
9640 MOVQ (DX)(DI*1), R10
9641 SHLQ $0x10, R9
9642 IMULQ BX, R9
9643 SHRQ $0x34, R9
9644 SHLQ $0x10, R10
9645 IMULQ BX, R10
9646 SHRQ $0x34, R10
9647 MOVL SI, 24(SP)(R9*4)
9648 MOVL DI, 24(SP)(R10*4)
9649 ADDQ $0x02, SI
9650 ADDQ $0x02, DI
9651 JMP index_loop_encodeBetterBlockAsm10B
9652
9653emit_remainder_encodeBetterBlockAsm10B:
9654 MOVQ src_len+32(FP), CX
9655 SUBL 12(SP), CX
9656 LEAQ 3(AX)(CX*1), CX
9657 CMPQ CX, (SP)
9658 JB emit_remainder_ok_encodeBetterBlockAsm10B
9659 MOVQ $0x00000000, ret+48(FP)
9660 RET
9661
9662emit_remainder_ok_encodeBetterBlockAsm10B:
9663 MOVQ src_len+32(FP), CX
9664 MOVL 12(SP), BX
9665 CMPL BX, CX
9666 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
9667 MOVL CX, SI
9668 MOVL CX, 12(SP)
9669 LEAQ (DX)(BX*1), CX
9670 SUBL BX, SI
9671 LEAL -1(SI), DX
9672 CMPL DX, $0x3c
9673 JB one_byte_emit_remainder_encodeBetterBlockAsm10B
9674 CMPL DX, $0x00000100
9675 JB two_bytes_emit_remainder_encodeBetterBlockAsm10B
9676 JB three_bytes_emit_remainder_encodeBetterBlockAsm10B
9677
9678three_bytes_emit_remainder_encodeBetterBlockAsm10B:
9679 MOVB $0xf4, (AX)
9680 MOVW DX, 1(AX)
9681 ADDQ $0x03, AX
9682 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
9683
9684two_bytes_emit_remainder_encodeBetterBlockAsm10B:
9685 MOVB $0xf0, (AX)
9686 MOVB DL, 1(AX)
9687 ADDQ $0x02, AX
9688 CMPL DX, $0x40
9689 JB memmove_emit_remainder_encodeBetterBlockAsm10B
9690 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
9691
9692one_byte_emit_remainder_encodeBetterBlockAsm10B:
9693 SHLB $0x02, DL
9694 MOVB DL, (AX)
9695 ADDQ $0x01, AX
9696
9697memmove_emit_remainder_encodeBetterBlockAsm10B:
9698 LEAQ (AX)(SI*1), DX
9699 MOVL SI, BX
9700
9701 // genMemMoveShort
9702 CMPQ BX, $0x03
9703 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
9704 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
9705 CMPQ BX, $0x08
9706 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
9707 CMPQ BX, $0x10
9708 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
9709 CMPQ BX, $0x20
9710 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
9711 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
9712
9713emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
9714 MOVB (CX), SI
9715 MOVB -1(CX)(BX*1), CL
9716 MOVB SI, (AX)
9717 MOVB CL, -1(AX)(BX*1)
9718 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9719
9720emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
9721 MOVW (CX), SI
9722 MOVB 2(CX), CL
9723 MOVW SI, (AX)
9724 MOVB CL, 2(AX)
9725 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9726
9727emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
9728 MOVL (CX), SI
9729 MOVL -4(CX)(BX*1), CX
9730 MOVL SI, (AX)
9731 MOVL CX, -4(AX)(BX*1)
9732 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9733
9734emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
9735 MOVQ (CX), SI
9736 MOVQ -8(CX)(BX*1), CX
9737 MOVQ SI, (AX)
9738 MOVQ CX, -8(AX)(BX*1)
9739 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9740
9741emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
9742 MOVOU (CX), X0
9743 MOVOU -16(CX)(BX*1), X1
9744 MOVOU X0, (AX)
9745 MOVOU X1, -16(AX)(BX*1)
9746 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9747
9748emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
9749 MOVOU (CX), X0
9750 MOVOU 16(CX), X1
9751 MOVOU -32(CX)(BX*1), X2
9752 MOVOU -16(CX)(BX*1), X3
9753 MOVOU X0, (AX)
9754 MOVOU X1, 16(AX)
9755 MOVOU X2, -32(AX)(BX*1)
9756 MOVOU X3, -16(AX)(BX*1)
9757
9758memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
9759 MOVQ DX, AX
9760 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
9761
9762memmove_long_emit_remainder_encodeBetterBlockAsm10B:
9763 LEAQ (AX)(SI*1), DX
9764 MOVL SI, BX
9765
9766 // genMemMoveLong
9767 MOVOU (CX), X0
9768 MOVOU 16(CX), X1
9769 MOVOU -32(CX)(BX*1), X2
9770 MOVOU -16(CX)(BX*1), X3
9771 MOVQ BX, DI
9772 SHRQ $0x05, DI
9773 MOVQ AX, SI
9774 ANDL $0x0000001f, SI
9775 MOVQ $0x00000040, R8
9776 SUBQ SI, R8
9777 DECQ DI
9778 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9779 LEAQ -32(CX)(R8*1), SI
9780 LEAQ -32(AX)(R8*1), R9
9781
9782emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
9783 MOVOU (SI), X4
9784 MOVOU 16(SI), X5
9785 MOVOA X4, (R9)
9786 MOVOA X5, 16(R9)
9787 ADDQ $0x20, R9
9788 ADDQ $0x20, SI
9789 ADDQ $0x20, R8
9790 DECQ DI
9791 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
9792
9793emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
9794 MOVOU -32(CX)(R8*1), X4
9795 MOVOU -16(CX)(R8*1), X5
9796 MOVOA X4, -32(AX)(R8*1)
9797 MOVOA X5, -16(AX)(R8*1)
9798 ADDQ $0x20, R8
9799 CMPQ BX, R8
9800 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9801 MOVOU X0, (AX)
9802 MOVOU X1, 16(AX)
9803 MOVOU X2, -32(AX)(BX*1)
9804 MOVOU X3, -16(AX)(BX*1)
9805 MOVQ DX, AX
9806
9807emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
9808 MOVQ dst_base+0(FP), CX
9809 SUBQ CX, AX
9810 MOVQ AX, ret+48(FP)
9811 RET
9812
9813// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
9814// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written, or 0 when
// the output would reach the dst limit computed below.
//
// State kept by this routine (as established by the instructions below):
//   AX        current write pointer into dst
//   CX        current src position (byte index) while searching
//   DX        src base pointer (reused as a scratch register in the remainder)
//   (SP)      dst limit pointer: dst_base + src_len - 6 - src_len/32
//   8(SP)     search limit: src_len - 8
//   12(SP)    src index where the not yet emitted literals start
//   16(SP)    offset of the previous match (starts at 0)
//   20(SP)    next search position
//   24(SP)    1024 x uint32 positions, indexed by a 10-bit hash of 6 bytes
//   4120(SP)  256 x uint32 positions, indexed by an 8-bit hash of 4 bytes
9815TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
9816 MOVQ dst_base+0(FP), AX
 // Zero 40 x 128 = 5120 bytes starting at 24(SP), which covers both tables.
9817 MOVQ $0x00000028, CX
9818 LEAQ 24(SP), DX
9819 PXOR X0, X0
9820
9821zero_loop_encodeBetterBlockAsm8B:
9822 MOVOU X0, (DX)
9823 MOVOU X0, 16(DX)
9824 MOVOU X0, 32(DX)
9825 MOVOU X0, 48(DX)
9826 MOVOU X0, 64(DX)
9827 MOVOU X0, 80(DX)
9828 MOVOU X0, 96(DX)
9829 MOVOU X0, 112(DX)
9830 ADDQ $0x80, DX
9831 DECQ CX
9832 JNZ zero_loop_encodeBetterBlockAsm8B
 // Set up the bookkeeping slots: pending literal start = 0, the search and
 // dst limits, start position CX = 1 and previous offset = 0.
9833 MOVL $0x00000000, 12(SP)
9834 MOVQ src_len+32(FP), CX
9835 LEAQ -6(CX), DX
9836 LEAQ -8(CX), BX
9837 MOVL BX, 8(SP)
9838 SHRQ $0x05, CX
9839 SUBL CX, DX
9840 LEAQ (AX)(DX*1), DX
9841 MOVQ DX, (SP)
9842 MOVL $0x00000001, CX
9843 MOVL $0x00000000, 16(SP)
9844 MOVQ src_base+24(FP), DX
9845
9846search_loop_encodeBetterBlockAsm8B:
 // Next position = CX + 1 + (CX - 12(SP))/16. Once it reaches the search
 // limit the rest of src is emitted as literals.
9847 MOVL CX, BX
9848 SUBL 12(SP), BX
9849 SHRL $0x04, BX
9850 LEAL 1(CX)(BX*1), BX
9851 CMPL BX, 8(SP)
9852 JAE emit_remainder_encodeBetterBlockAsm8B
9853 MOVQ (DX)(CX*1), SI
9854 MOVL BX, 20(SP)
 // Hash the 8 bytes loaded at CX: the low 6 bytes give a 10-bit index (R9),
 // the low 4 bytes give an 8-bit index (R10).
9855 MOVQ $0x0000cf1bbcdcbf9b, R8
9856 MOVQ $0x9e3779b1, BX
9857 MOVQ SI, R9
9858 MOVQ SI, R10
9859 SHLQ $0x10, R9
9860 IMULQ R8, R9
9861 SHRQ $0x36, R9
9862 SHLQ $0x20, R10
9863 IMULQ BX, R10
9864 SHRQ $0x38, R10
 // Fetch both candidates (BX, DI), then record CX in both tables.
9865 MOVL 24(SP)(R9*4), BX
9866 MOVL 4120(SP)(R10*4), DI
9867 MOVL CX, 24(SP)(R9*4)
9868 MOVL CX, 4120(SP)(R10*4)
 // A full 8-byte equality with either candidate is accepted immediately.
9869 MOVQ (DX)(BX*1), R9
9870 MOVQ (DX)(DI*1), R10
9871 CMPQ R9, SI
9872 JEQ candidate_match_encodeBetterBlockAsm8B
9873 CMPQ R10, SI
9874 JNE no_short_found_encodeBetterBlockAsm8B
9875 MOVL DI, BX
9876 JMP candidate_match_encodeBetterBlockAsm8B
9877
9878no_short_found_encodeBetterBlockAsm8B:
 // Otherwise compare only the low 4 bytes; with no match at all, continue
 // the search at the position saved in 20(SP).
9879 CMPL R9, SI
9880 JEQ candidate_match_encodeBetterBlockAsm8B
9881 CMPL R10, SI
9882 JEQ candidateS_match_encodeBetterBlockAsm8B
9883 MOVL 20(SP), CX
9884 JMP search_loop_encodeBetterBlockAsm8B
9885
9886candidateS_match_encodeBetterBlockAsm8B:
 // Only the candidate from the 4-byte table matched. Probe the 6-byte table
 // for position CX+1 (recording CX+1 there) and use that candidate if its
 // first 4 bytes match; otherwise undo the increment and keep DI.
9887 SHRQ $0x08, SI
9888 MOVQ SI, R9
9889 SHLQ $0x10, R9
9890 IMULQ R8, R9
9891 SHRQ $0x36, R9
9892 MOVL 24(SP)(R9*4), BX
9893 INCL CX
9894 MOVL CX, 24(SP)(R9*4)
9895 CMPL (DX)(BX*1), SI
9896 JEQ candidate_match_encodeBetterBlockAsm8B
9897 DECL CX
9898 MOVL DI, BX
9899
9900candidate_match_encodeBetterBlockAsm8B:
 // Extend the match backwards while the preceding bytes agree, stopping at
 // the pending literal start (12(SP)) or when the candidate index hits 0.
9901 MOVL 12(SP), SI
9902 TESTL BX, BX
9903 JZ match_extend_back_end_encodeBetterBlockAsm8B
9904
9905match_extend_back_loop_encodeBetterBlockAsm8B:
9906 CMPL CX, SI
9907 JBE match_extend_back_end_encodeBetterBlockAsm8B
9908 MOVB -1(DX)(BX*1), DI
9909 MOVB -1(DX)(CX*1), R8
9910 CMPB DI, R8
9911 JNE match_extend_back_end_encodeBetterBlockAsm8B
9912 LEAL -1(CX), CX
9913 DECL BX
9914 JZ match_extend_back_end_encodeBetterBlockAsm8B
9915 JMP match_extend_back_loop_encodeBetterBlockAsm8B
9916
9917match_extend_back_end_encodeBetterBlockAsm8B:
 // Return 0 if the pending literals (CX - 12(SP) bytes) plus 3 bytes would
 // reach the dst limit.
9918 MOVL CX, SI
9919 SUBL 12(SP), SI
9920 LEAQ 3(AX)(SI*1), SI
9921 CMPQ SI, (SP)
9922 JB match_dst_size_check_encodeBetterBlockAsm8B
9923 MOVQ $0x00000000, ret+48(FP)
9924 RET
9925
9926match_dst_size_check_encodeBetterBlockAsm8B:
 // SI keeps the match start. Advance both positions by 4 and count further
 // equal bytes into R11, looking at no more than DI = src_len - CX bytes.
9927 MOVL CX, SI
9928 ADDL $0x04, CX
9929 ADDL $0x04, BX
9930 MOVQ src_len+32(FP), DI
9931 SUBL CX, DI
9932 LEAQ (DX)(CX*1), R8
9933 LEAQ (DX)(BX*1), R9
9934
9935 // matchLen
9936 XORL R11, R11
9937
9938matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B:
 // Compare 16 bytes per iteration as two 8-byte words.
9939 CMPL DI, $0x10
9940 JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B
9941 MOVQ (R8)(R11*1), R10
9942 MOVQ 8(R8)(R11*1), R12
9943 XORQ (R9)(R11*1), R10
9944 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
9945 XORQ 8(R9)(R11*1), R12
9946 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B
9947 LEAL -16(DI), DI
9948 LEAL 16(R11), R11
9949 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B
9950
9951matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B:
 // Difference is in the second word: trailing zero bits / 8 gives the
 // number of equal bytes in it, plus 8 for the first word.
9952#ifdef GOAMD64_v3
9953 TZCNTQ R12, R12
9954
9955#else
9956 BSFQ R12, R12
9957
9958#endif
9959 SARQ $0x03, R12
9960 LEAL 8(R11)(R12*1), R11
9961 JMP match_nolit_end_encodeBetterBlockAsm8B
9962
9963matchlen_match8_match_nolit_encodeBetterBlockAsm8B:
 // Fewer than 16 bytes left: try one 8-byte word.
9964 CMPL DI, $0x08
9965 JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B
9966 MOVQ (R8)(R11*1), R10
9967 XORQ (R9)(R11*1), R10
9968 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
9969 LEAL -8(DI), DI
9970 LEAL 8(R11), R11
9971 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B
9972
9973matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B:
 // Trailing zero bits of the XOR / 8 = number of equal bytes in this word.
9974#ifdef GOAMD64_v3
9975 TZCNTQ R10, R10
9976
9977#else
9978 BSFQ R10, R10
9979
9980#endif
9981 SARQ $0x03, R10
9982 LEAL (R11)(R10*1), R11
9983 JMP match_nolit_end_encodeBetterBlockAsm8B
9984
9985matchlen_match4_match_nolit_encodeBetterBlockAsm8B:
 // Tail handling: 4 bytes, then 2 bytes, then a single byte.
9986 CMPL DI, $0x04
9987 JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B
9988 MOVL (R8)(R11*1), R10
9989 CMPL (R9)(R11*1), R10
9990 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
9991 LEAL -4(DI), DI
9992 LEAL 4(R11), R11
9993
9994matchlen_match2_match_nolit_encodeBetterBlockAsm8B:
9995 CMPL DI, $0x01
9996 JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
9997 JB match_nolit_end_encodeBetterBlockAsm8B
9998 MOVW (R8)(R11*1), R10
9999 CMPW (R9)(R11*1), R10
10000 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
10001 LEAL 2(R11), R11
10002 SUBL $0x02, DI
10003 JZ match_nolit_end_encodeBetterBlockAsm8B
10004
10005matchlen_match1_match_nolit_encodeBetterBlockAsm8B:
10006 MOVB (R8)(R11*1), R10
10007 CMPB (R9)(R11*1), R10
10008 JNE match_nolit_end_encodeBetterBlockAsm8B
10009 LEAL 1(R11), R11
10010
10011match_nolit_end_encodeBetterBlockAsm8B:
 // DI = offset between the current position and the candidate.
10012 MOVL CX, DI
10013 SUBL BX, DI
10014
10015 // Check if repeat
10016 CMPL 16(SP), DI
10017 JEQ match_is_repeat_encodeBetterBlockAsm8B
 // New offset: remember it, then emit the pending literals that lie between
 // 12(SP) and the match start SI, if there are any.
10018 MOVL DI, 16(SP)
10019 MOVL 12(SP), BX
10020 CMPL BX, SI
10021 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B
10022 MOVL SI, R8
10023 MOVL SI, 12(SP)
10024 LEAQ (DX)(BX*1), R9
10025 SUBL BX, R8
10026 LEAL -1(R8), BX
 // Literal header chosen by BX = length-1: below 60 one byte, below 256 two
 // bytes, otherwise three bytes. The second JB tests the same flags as the
 // first and is never taken; control falls through to the three-byte form.
10027 CMPL BX, $0x3c
10028 JB one_byte_match_emit_encodeBetterBlockAsm8B
10029 CMPL BX, $0x00000100
10030 JB two_bytes_match_emit_encodeBetterBlockAsm8B
10031 JB three_bytes_match_emit_encodeBetterBlockAsm8B
10032
10033three_bytes_match_emit_encodeBetterBlockAsm8B:
 // Tag 0xf4 followed by the 16-bit length-1.
10034 MOVB $0xf4, (AX)
10035 MOVW BX, 1(AX)
10036 ADDQ $0x03, AX
10037 JMP memmove_long_match_emit_encodeBetterBlockAsm8B
10038
10039two_bytes_match_emit_encodeBetterBlockAsm8B:
 // Tag 0xf0 followed by the 8-bit length-1. Length-1 below 64 uses the
 // short copy, anything longer the looping copy.
10040 MOVB $0xf0, (AX)
10041 MOVB BL, 1(AX)
10042 ADDQ $0x02, AX
10043 CMPL BX, $0x40
10044 JB memmove_match_emit_encodeBetterBlockAsm8B
10045 JMP memmove_long_match_emit_encodeBetterBlockAsm8B
10046
10047one_byte_match_emit_encodeBetterBlockAsm8B:
 // Single header byte: (length-1) shifted left by 2.
10048 SHLB $0x02, BL
10049 MOVB BL, (AX)
10050 ADDQ $0x01, AX
10051
10052memmove_match_emit_encodeBetterBlockAsm8B:
 // BX = dst pointer just past the literals; AX is set to it after the copy.
10053 LEAQ (AX)(R8*1), BX
10054
10055 // genMemMoveShort
10056 CMPQ R8, $0x04
10057 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
10058 CMPQ R8, $0x08
10059 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
10060 CMPQ R8, $0x10
10061 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
10062 CMPQ R8, $0x20
10063 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
10064 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
10065
10066emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4:
 // Lengths up to 4 always move a full 4 bytes; AX is later set to the exact
 // end pointer held in BX.
10067 MOVL (R9), R10
10068 MOVL R10, (AX)
10069 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10070
10071emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7:
 // The remaining size classes copy a head and a tail chunk that may overlap.
10072 MOVL (R9), R10
10073 MOVL -4(R9)(R8*1), R9
10074 MOVL R10, (AX)
10075 MOVL R9, -4(AX)(R8*1)
10076 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10077
10078emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16:
10079 MOVQ (R9), R10
10080 MOVQ -8(R9)(R8*1), R9
10081 MOVQ R10, (AX)
10082 MOVQ R9, -8(AX)(R8*1)
10083 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10084
10085emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32:
10086 MOVOU (R9), X0
10087 MOVOU -16(R9)(R8*1), X1
10088 MOVOU X0, (AX)
10089 MOVOU X1, -16(AX)(R8*1)
10090 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10091
10092emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64:
10093 MOVOU (R9), X0
10094 MOVOU 16(R9), X1
10095 MOVOU -32(R9)(R8*1), X2
10096 MOVOU -16(R9)(R8*1), X3
10097 MOVOU X0, (AX)
10098 MOVOU X1, 16(AX)
10099 MOVOU X2, -32(AX)(R8*1)
10100 MOVOU X3, -16(AX)(R8*1)
10101
10102memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
10103 MOVQ BX, AX
10104 JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
10105
10106memmove_long_match_emit_encodeBetterBlockAsm8B:
10107 LEAQ (AX)(R8*1), BX
10108
10109 // genMemMoveLong
 // The first and last 32 bytes are loaded into X0-X3 now and written after
 // the loop. R13 starts at 64 - (AX & 31), so the MOVOA stores below target
 // 32-byte aligned dst addresses.
10110 MOVOU (R9), X0
10111 MOVOU 16(R9), X1
10112 MOVOU -32(R9)(R8*1), X2
10113 MOVOU -16(R9)(R8*1), X3
10114 MOVQ R8, R12
10115 SHRQ $0x05, R12
10116 MOVQ AX, R10
10117 ANDL $0x0000001f, R10
10118 MOVQ $0x00000040, R13
10119 SUBQ R10, R13
10120 DECQ R12
10121 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10122 LEAQ -32(R9)(R13*1), R10
10123 LEAQ -32(AX)(R13*1), R14
10124
10125emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
10126 MOVOU (R10), X4
10127 MOVOU 16(R10), X5
10128 MOVOA X4, (R14)
10129 MOVOA X5, 16(R14)
10130 ADDQ $0x20, R14
10131 ADDQ $0x20, R10
10132 ADDQ $0x20, R13
10133 DECQ R12
10134 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back
10135
10136emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
 // Copy 32 bytes per iteration while R13 does not exceed the length R8,
 // then write the saved head and tail.
10137 MOVOU -32(R9)(R13*1), X4
10138 MOVOU -16(R9)(R13*1), X5
10139 MOVOA X4, -32(AX)(R13*1)
10140 MOVOA X5, -16(AX)(R13*1)
10141 ADDQ $0x20, R13
10142 CMPQ R8, R13
10143 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10144 MOVOU X0, (AX)
10145 MOVOU X1, 16(AX)
10146 MOVOU X2, -32(AX)(R8*1)
10147 MOVOU X3, -16(AX)(R8*1)
10148 MOVQ BX, AX
10149
10150emit_literal_done_match_emit_encodeBetterBlockAsm8B:
 // Step past the match: CX += R11, R11 becomes the full match length
 // (+4), and the pending literal start moves to CX.
10151 ADDL R11, CX
10152 ADDL $0x04, R11
10153 MOVL CX, 12(SP)
10154
10155 // emitCopy
 // R11 = match length, DI = offset. Lengths up to 64 are handled at
 // two_byte_offset_short; longer matches with offset >= 2048 at
 // long_offset_short. The code in between handles length > 64 with
 // offset < 2048: a 2-byte copy (tag 17 | (offset>>8)<<5, then the low
 // offset byte) takes 8 off the length and the rest is written as a repeat.
10156 CMPL R11, $0x40
10157 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B
10158 CMPL DI, $0x00000800
10159 JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B
10160 MOVL $0x00000001, BX
10161 LEAL 16(BX), BX
10162 MOVB DI, 1(AX)
10163 SHRL $0x08, DI
10164 SHLL $0x05, DI
10165 ORL DI, BX
10166 MOVB BL, (AX)
10167 ADDQ $0x02, AX
10168 SUBL $0x08, R11
10169
10170 // emitRepeat
10171 LEAL -4(R11), R11
10172 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
 // Unreachable: the instructions up to the next label follow an
 // unconditional JMP and carry no label of their own.
10173 MOVL R11, BX
10174 LEAL -4(R11), R11
10175 CMPL BX, $0x08
10176 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10177 CMPL BX, $0x0c
10178 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10179
10180cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
 // R11 has already been reduced by 4. Values of 260 and above use the
 // 4-byte form (word 0x0019, then 16-bit R11-256); smaller values the
 // 3-byte form (word 0x0015, then 8-bit R11-4).
10181 CMPL R11, $0x00000104
10182 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10183 LEAL -256(R11), R11
10184 MOVW $0x0019, (AX)
10185 MOVW R11, 2(AX)
10186 ADDQ $0x04, AX
10187 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10188
10189repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
10190 LEAL -4(R11), R11
10191 MOVW $0x0015, (AX)
10192 MOVB R11, 2(AX)
10193 ADDQ $0x03, AX
10194 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10195
10196repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
 // 2-byte form: the word (R11<<2) | 1.
10197 SHLL $0x02, R11
10198 ORL $0x01, R11
10199 MOVW R11, (AX)
10200 ADDQ $0x02, AX
10201 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
 // Unreachable: follows an unconditional JMP and carries no label.
10202 XORQ BX, BX
10203 LEAL 1(BX)(R11*4), R11
10204 MOVB DI, 1(AX)
10205 SARL $0x08, DI
10206 SHLL $0x05, DI
10207 ORL DI, R11
10208 MOVB R11, (AX)
10209 ADDQ $0x02, AX
10210 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10211
10212long_offset_short_match_nolit_encodeBetterBlockAsm8B:
 // Length > 64 with offset >= 2048: a 3-byte copy (tag 0xee, then the 16-bit
 // offset) takes 60 off the length; the rest is written as a repeat.
10213 MOVB $0xee, (AX)
10214 MOVW DI, 1(AX)
10215 LEAL -60(R11), R11
10216 ADDQ $0x03, AX
10217
10218 // emitRepeat
 // BX = remaining length, R11 = remaining length - 4. A remaining length of
 // 8 or less takes the 2-byte form. The JAE below targets the label that
 // directly follows it, so both of its outcomes continue there.
10219 MOVL R11, BX
10220 LEAL -4(R11), R11
10221 CMPL BX, $0x08
10222 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
10223 CMPL BX, $0x0c
10224 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
10225
10226cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10227 CMPL R11, $0x00000104
10228 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short
10229 LEAL -256(R11), R11
10230 MOVW $0x0019, (AX)
10231 MOVW R11, 2(AX)
10232 ADDQ $0x04, AX
10233 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10234
10235repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10236 LEAL -4(R11), R11
10237 MOVW $0x0015, (AX)
10238 MOVB R11, 2(AX)
10239 ADDQ $0x03, AX
10240 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10241
10242repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10243 SHLL $0x02, R11
10244 ORL $0x01, R11
10245 MOVW R11, (AX)
10246 ADDQ $0x02, AX
10247 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
 // Unreachable: follows an unconditional JMP and carries no label.
10248 XORQ BX, BX
10249 LEAL 1(BX)(R11*4), R11
10250 MOVB DI, 1(AX)
10251 SARL $0x08, DI
10252 SHLL $0x05, DI
10253 ORL DI, R11
10254 MOVB R11, (AX)
10255 ADDQ $0x02, AX
10256 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10257
10258two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B:
 // Length <= 64. Lengths below 12 use the 2-byte form: tag
 // (length<<2) - 15 combined with (offset>>8)<<5, then the low offset byte.
10259 MOVL R11, BX
10260 SHLL $0x02, BX
10261 CMPL R11, $0x0c
10262 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B
10263 LEAL -15(BX), BX
10264 MOVB DI, 1(AX)
10265 SHRL $0x08, DI
10266 SHLL $0x05, DI
10267 ORL DI, BX
10268 MOVB BL, (AX)
10269 ADDQ $0x02, AX
10270 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10271
10272emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
 // 3-byte form: tag (length<<2) - 2, then the 16-bit offset.
10273 LEAL -2(BX), BX
10274 MOVB BL, (AX)
10275 MOVW DI, 1(AX)
10276 ADDQ $0x03, AX
10277 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10278
10279match_is_repeat_encodeBetterBlockAsm8B:
 // Same offset as the previous match. Emit the pending literals between
 // 12(SP) and SI exactly as in the non-repeat path (DI = length, R8 = source
 // pointer), then encode the match as a repeat.
10280 MOVL 12(SP), BX
10281 CMPL BX, SI
10282 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
10283 MOVL SI, DI
10284 MOVL SI, 12(SP)
10285 LEAQ (DX)(BX*1), R8
10286 SUBL BX, DI
10287 LEAL -1(DI), BX
 // The second JB below is never taken; see the identical sequence above.
10288 CMPL BX, $0x3c
10289 JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B
10290 CMPL BX, $0x00000100
10291 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B
10292 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B
10293
10294three_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
10295 MOVB $0xf4, (AX)
10296 MOVW BX, 1(AX)
10297 ADDQ $0x03, AX
10298 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
10299
10300two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
10301 MOVB $0xf0, (AX)
10302 MOVB BL, 1(AX)
10303 ADDQ $0x02, AX
10304 CMPL BX, $0x40
10305 JB memmove_match_emit_repeat_encodeBetterBlockAsm8B
10306 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
10307
10308one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
10309 SHLB $0x02, BL
10310 MOVB BL, (AX)
10311 ADDQ $0x01, AX
10312
10313memmove_match_emit_repeat_encodeBetterBlockAsm8B:
 // BX = dst pointer just past the literals; AX is set to it after the copy.
10314 LEAQ (AX)(DI*1), BX
10315
10316 // genMemMoveShort
10317 CMPQ DI, $0x04
10318 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
10319 CMPQ DI, $0x08
10320 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
10321 CMPQ DI, $0x10
10322 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
10323 CMPQ DI, $0x20
10324 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
10325 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
10326
10327emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4:
10328 MOVL (R8), R9
10329 MOVL R9, (AX)
10330 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10331
10332emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7:
10333 MOVL (R8), R9
10334 MOVL -4(R8)(DI*1), R8
10335 MOVL R9, (AX)
10336 MOVL R8, -4(AX)(DI*1)
10337 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10338
10339emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16:
10340 MOVQ (R8), R9
10341 MOVQ -8(R8)(DI*1), R8
10342 MOVQ R9, (AX)
10343 MOVQ R8, -8(AX)(DI*1)
10344 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10345
10346emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32:
10347 MOVOU (R8), X0
10348 MOVOU -16(R8)(DI*1), X1
10349 MOVOU X0, (AX)
10350 MOVOU X1, -16(AX)(DI*1)
10351 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10352
10353emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64:
10354 MOVOU (R8), X0
10355 MOVOU 16(R8), X1
10356 MOVOU -32(R8)(DI*1), X2
10357 MOVOU -16(R8)(DI*1), X3
10358 MOVOU X0, (AX)
10359 MOVOU X1, 16(AX)
10360 MOVOU X2, -32(AX)(DI*1)
10361 MOVOU X3, -16(AX)(DI*1)
10362
10363memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
10364 MOVQ BX, AX
10365 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
10366
10367memmove_long_match_emit_repeat_encodeBetterBlockAsm8B:
10368 LEAQ (AX)(DI*1), BX
10369
10370 // genMemMoveLong
 // Head and tail (32 bytes each) are saved in X0-X3 and written last; the
 // loop in between uses 32-byte aligned MOVOA stores.
10371 MOVOU (R8), X0
10372 MOVOU 16(R8), X1
10373 MOVOU -32(R8)(DI*1), X2
10374 MOVOU -16(R8)(DI*1), X3
10375 MOVQ DI, R10
10376 SHRQ $0x05, R10
10377 MOVQ AX, R9
10378 ANDL $0x0000001f, R9
10379 MOVQ $0x00000040, R12
10380 SUBQ R9, R12
10381 DECQ R10
10382 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10383 LEAQ -32(R8)(R12*1), R9
10384 LEAQ -32(AX)(R12*1), R13
10385
10386emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
10387 MOVOU (R9), X4
10388 MOVOU 16(R9), X5
10389 MOVOA X4, (R13)
10390 MOVOA X5, 16(R13)
10391 ADDQ $0x20, R13
10392 ADDQ $0x20, R9
10393 ADDQ $0x20, R12
10394 DECQ R10
10395 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back
10396
10397emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
10398 MOVOU -32(R8)(R12*1), X4
10399 MOVOU -16(R8)(R12*1), X5
10400 MOVOA X4, -32(AX)(R12*1)
10401 MOVOA X5, -16(AX)(R12*1)
10402 ADDQ $0x20, R12
10403 CMPQ DI, R12
10404 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10405 MOVOU X0, (AX)
10406 MOVOU X1, 16(AX)
10407 MOVOU X2, -32(AX)(DI*1)
10408 MOVOU X3, -16(AX)(DI*1)
10409 MOVQ BX, AX
10410
10411emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B:
 // Step past the match: CX += R11, R11 becomes the full match length
 // (+4), and the pending literal start moves to CX.
10412 ADDL R11, CX
10413 ADDL $0x04, R11
10414 MOVL CX, 12(SP)
10415
10416 // emitRepeat
 // BX = length, R11 = length - 4. A length of 8 or less takes the 2-byte
 // form. The JAE below targets the label directly after it, so both of its
 // outcomes continue there.
10417 MOVL R11, BX
10418 LEAL -4(R11), R11
10419 CMPL BX, $0x08
10420 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B
10421 CMPL BX, $0x0c
10422 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B
10423
10424cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
10425 CMPL R11, $0x00000104
10426 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B
10427 LEAL -256(R11), R11
10428 MOVW $0x0019, (AX)
10429 MOVW R11, 2(AX)
10430 ADDQ $0x04, AX
10431 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10432
10433repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
10434 LEAL -4(R11), R11
10435 MOVW $0x0015, (AX)
10436 MOVB R11, 2(AX)
10437 ADDQ $0x03, AX
10438 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10439
10440repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
10441 SHLL $0x02, R11
10442 ORL $0x01, R11
10443 MOVW R11, (AX)
10444 ADDQ $0x02, AX
10445 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
 // Unreachable: follows an unconditional JMP and carries no label.
10446 XORQ BX, BX
10447 LEAL 1(BX)(R11*4), R11
10448 MOVB DI, 1(AX)
10449 SARL $0x08, DI
10450 SHLL $0x05, DI
10451 ORL DI, R11
10452 MOVB R11, (AX)
10453 ADDQ $0x02, AX
10454
10455match_nolit_emitcopy_end_encodeBetterBlockAsm8B:
 // Past the search limit: emit the remainder. Otherwise return 0 if the
 // write pointer has reached the dst limit.
10456 CMPL CX, 8(SP)
10457 JAE emit_remainder_encodeBetterBlockAsm8B
10458 CMPQ AX, (SP)
10459 JB match_nolit_dst_ok_encodeBetterBlockAsm8B
10460 MOVQ $0x00000000, ret+48(FP)
10461 RET
10462
10463match_nolit_dst_ok_encodeBetterBlockAsm8B:
 // Index positions around the match just emitted (SI still holds its
 // start): start+1 and CX-2 go into the 6-byte-hash table, start+2 and CX-1
 // into the 4-byte-hash table.
10464 MOVQ $0x0000cf1bbcdcbf9b, BX
10465 MOVQ $0x9e3779b1, DI
10466 LEAQ 1(SI), SI
10467 LEAQ -2(CX), R8
10468 MOVQ (DX)(SI*1), R9
10469 MOVQ 1(DX)(SI*1), R10
10470 MOVQ (DX)(R8*1), R11
10471 MOVQ 1(DX)(R8*1), R12
10472 SHLQ $0x10, R9
10473 IMULQ BX, R9
10474 SHRQ $0x36, R9
10475 SHLQ $0x20, R10
10476 IMULQ DI, R10
10477 SHRQ $0x38, R10
10478 SHLQ $0x10, R11
10479 IMULQ BX, R11
10480 SHRQ $0x36, R11
10481 SHLQ $0x20, R12
10482 IMULQ DI, R12
10483 SHRQ $0x38, R12
10484 LEAQ 1(SI), DI
10485 LEAQ 1(R8), R13
10486 MOVL SI, 24(SP)(R9*4)
10487 MOVL R8, 24(SP)(R11*4)
10488 MOVL DI, 4120(SP)(R10*4)
10489 MOVL R13, 4120(SP)(R12*4)
 // Two cursors for the loop below: SI moves on by one, DI starts at the
 // midpoint (R8 + SI + 1) / 2, and R8 is lowered by one to act as the bound.
10490 LEAQ 1(R8)(SI*1), DI
10491 SHRQ $0x01, DI
10492 ADDQ $0x01, SI
10493 SUBQ $0x01, R8
10494
10495index_loop_encodeBetterBlockAsm8B:
 // Until DI reaches R8, add positions SI and DI to the 6-byte-hash table
 // and advance both by 2; then resume searching.
10496 CMPQ DI, R8
10497 JAE search_loop_encodeBetterBlockAsm8B
10498 MOVQ (DX)(SI*1), R9
10499 MOVQ (DX)(DI*1), R10
10500 SHLQ $0x10, R9
10501 IMULQ BX, R9
10502 SHRQ $0x36, R9
10503 SHLQ $0x10, R10
10504 IMULQ BX, R10
10505 SHRQ $0x36, R10
10506 MOVL SI, 24(SP)(R9*4)
10507 MOVL DI, 24(SP)(R10*4)
10508 ADDQ $0x02, SI
10509 ADDQ $0x02, DI
10510 JMP index_loop_encodeBetterBlockAsm8B
10511
10512emit_remainder_encodeBetterBlockAsm8B:
 // Pending bytes = src_len - 12(SP). Return 0 if they plus 3 bytes would
 // reach the dst limit.
10513 MOVQ src_len+32(FP), CX
10514 SUBL 12(SP), CX
10515 LEAQ 3(AX)(CX*1), CX
10516 CMPQ CX, (SP)
10517 JB emit_remainder_ok_encodeBetterBlockAsm8B
10518 MOVQ $0x00000000, ret+48(FP)
10519 RET
10520
10521emit_remainder_ok_encodeBetterBlockAsm8B:
 // Emit everything from 12(SP) to the end of src as one literal run, if
 // anything is left. From here on CX is the source pointer, SI the length
 // and DX (no longer needed as src base) holds length-1.
10522 MOVQ src_len+32(FP), CX
10523 MOVL 12(SP), BX
10524 CMPL BX, CX
10525 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
10526 MOVL CX, SI
10527 MOVL CX, 12(SP)
10528 LEAQ (DX)(BX*1), CX
10529 SUBL BX, SI
10530 LEAL -1(SI), DX
 // Same literal header selection as above; the second JB is never taken.
10531 CMPL DX, $0x3c
10532 JB one_byte_emit_remainder_encodeBetterBlockAsm8B
10533 CMPL DX, $0x00000100
10534 JB two_bytes_emit_remainder_encodeBetterBlockAsm8B
10535 JB three_bytes_emit_remainder_encodeBetterBlockAsm8B
10536
10537three_bytes_emit_remainder_encodeBetterBlockAsm8B:
10538 MOVB $0xf4, (AX)
10539 MOVW DX, 1(AX)
10540 ADDQ $0x03, AX
10541 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
10542
10543two_bytes_emit_remainder_encodeBetterBlockAsm8B:
10544 MOVB $0xf0, (AX)
10545 MOVB DL, 1(AX)
10546 ADDQ $0x02, AX
10547 CMPL DX, $0x40
10548 JB memmove_emit_remainder_encodeBetterBlockAsm8B
10549 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
10550
10551one_byte_emit_remainder_encodeBetterBlockAsm8B:
10552 SHLB $0x02, DL
10553 MOVB DL, (AX)
10554 ADDQ $0x01, AX
10555
10556memmove_emit_remainder_encodeBetterBlockAsm8B:
 // DX = dst pointer just past the literals, BX = length.
10557 LEAQ (AX)(SI*1), DX
10558 MOVL SI, BX
10559
10560 // genMemMoveShort
 // Unlike the copies above, this one has exact cases for lengths 1-2 and 3.
10561 CMPQ BX, $0x03
10562 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
10563 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
10564 CMPQ BX, $0x08
10565 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
10566 CMPQ BX, $0x10
10567 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
10568 CMPQ BX, $0x20
10569 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
10570 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
10571
10572emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2:
 // Copy the first and the last byte (the same byte when the length is 1).
10573 MOVB (CX), SI
10574 MOVB -1(CX)(BX*1), CL
10575 MOVB SI, (AX)
10576 MOVB CL, -1(AX)(BX*1)
10577 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10578
10579emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3:
10580 MOVW (CX), SI
10581 MOVB 2(CX), CL
10582 MOVW SI, (AX)
10583 MOVB CL, 2(AX)
10584 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10585
10586emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7:
10587 MOVL (CX), SI
10588 MOVL -4(CX)(BX*1), CX
10589 MOVL SI, (AX)
10590 MOVL CX, -4(AX)(BX*1)
10591 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10592
10593emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16:
10594 MOVQ (CX), SI
10595 MOVQ -8(CX)(BX*1), CX
10596 MOVQ SI, (AX)
10597 MOVQ CX, -8(AX)(BX*1)
10598 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10599
10600emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32:
10601 MOVOU (CX), X0
10602 MOVOU -16(CX)(BX*1), X1
10603 MOVOU X0, (AX)
10604 MOVOU X1, -16(AX)(BX*1)
10605 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10606
10607emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64:
10608 MOVOU (CX), X0
10609 MOVOU 16(CX), X1
10610 MOVOU -32(CX)(BX*1), X2
10611 MOVOU -16(CX)(BX*1), X3
10612 MOVOU X0, (AX)
10613 MOVOU X1, 16(AX)
10614 MOVOU X2, -32(AX)(BX*1)
10615 MOVOU X3, -16(AX)(BX*1)
10616
10617memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
10618 MOVQ DX, AX
10619 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
10620
10621memmove_long_emit_remainder_encodeBetterBlockAsm8B:
10622 LEAQ (AX)(SI*1), DX
10623 MOVL SI, BX
10624
10625 // genMemMoveLong
 // Head and tail (32 bytes each) are saved in X0-X3 and written last; the
 // loop in between uses 32-byte aligned MOVOA stores.
10626 MOVOU (CX), X0
10627 MOVOU 16(CX), X1
10628 MOVOU -32(CX)(BX*1), X2
10629 MOVOU -16(CX)(BX*1), X3
10630 MOVQ BX, DI
10631 SHRQ $0x05, DI
10632 MOVQ AX, SI
10633 ANDL $0x0000001f, SI
10634 MOVQ $0x00000040, R8
10635 SUBQ SI, R8
10636 DECQ DI
10637 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10638 LEAQ -32(CX)(R8*1), SI
10639 LEAQ -32(AX)(R8*1), R9
10640
10641emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
10642 MOVOU (SI), X4
10643 MOVOU 16(SI), X5
10644 MOVOA X4, (R9)
10645 MOVOA X5, 16(R9)
10646 ADDQ $0x20, R9
10647 ADDQ $0x20, SI
10648 ADDQ $0x20, R8
10649 DECQ DI
10650 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back
10651
10652emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32:
10653 MOVOU -32(CX)(R8*1), X4
10654 MOVOU -16(CX)(R8*1), X5
10655 MOVOA X4, -32(AX)(R8*1)
10656 MOVOA X5, -16(AX)(R8*1)
10657 ADDQ $0x20, R8
10658 CMPQ BX, R8
10659 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32
10660 MOVOU X0, (AX)
10661 MOVOU X1, 16(AX)
10662 MOVOU X2, -32(AX)(BX*1)
10663 MOVOU X3, -16(AX)(BX*1)
10664 MOVQ DX, AX
10665
10666emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
 // Return the number of bytes written: write pointer minus dst base.
10667 MOVQ dst_base+0(FP), CX
10668 SUBQ CX, AX
10669 MOVQ AX, ret+48(FP)
10670 RET
10671
10672// func encodeSnappyBlockAsm(dst []byte, src []byte) int
10673// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal runs and back-reference
// copies (judging by the name and the tag bytes written, Snappy block format)
// and stores the number of bytes written in ret. It stores 0 instead whenever
// the output would reach the destination limit kept at 0(SP).
//
// Stack frame as used below:
//   0(SP)   8 bytes      destination limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   4 bytes      search limit: src_len - 8
//   12(SP)  4 bytes      "next emit": start of the literal run not yet written
//   16(SP)  4 bytes      current repeat offset, initialised to 1
//   20(SP)  4 bytes      next search position, computed at the top of search_loop
//   24(SP)  65536 bytes  table of 32-bit src positions indexed by a 14-bit hash
//
// Long-lived registers: AX = dst write pointer, CX = current src position,
// DX = src base pointer (reused as scratch once emit_remainder_ok is reached).
// NOTE(review): the code loads up to 8 bytes at a time and the short literal
// copy always moves a full 8 bytes, so callers presumably guarantee suitable
// src/dst sizes - confirm against the Go wrapper.
10674TEXT ·encodeSnappyBlockAsm(SB), $65560-56
10675 MOVQ dst_base+0(FP), AX // AX = dst write pointer
10676 MOVQ $0x00000200, CX // 0x200 iterations x 128 bytes = 65536 bytes to clear
10677 LEAQ 24(SP), DX // DX = start of the position table
10678 PXOR X0, X0 // X0 = 0
10679
10680zero_loop_encodeSnappyBlockAsm:
10681 MOVOU X0, (DX)
10682 MOVOU X0, 16(DX)
10683 MOVOU X0, 32(DX)
10684 MOVOU X0, 48(DX)
10685 MOVOU X0, 64(DX)
10686 MOVOU X0, 80(DX)
10687 MOVOU X0, 96(DX)
10688 MOVOU X0, 112(DX)
10689 ADDQ $0x80, DX
10690 DECQ CX
10691 JNZ zero_loop_encodeSnappyBlockAsm
10692 MOVL $0x00000000, 12(SP) // next emit = 0
10693 MOVQ src_len+32(FP), CX
10694 LEAQ -9(CX), DX
10695 LEAQ -8(CX), BX
10696 MOVL BX, 8(SP) // search limit = src_len - 8
10697 SHRQ $0x05, CX
10698 SUBL CX, DX // DX = src_len - 9 - (src_len >> 5)
10699 LEAQ (AX)(DX*1), DX
10700 MOVQ DX, (SP) // destination limit
10701 MOVL $0x00000001, CX // searching starts at src position 1
10702 MOVL CX, 16(SP) // repeat offset = 1
10703 MOVQ src_base+24(FP), DX // DX = src base pointer
10704
10705search_loop_encodeSnappyBlockAsm:
10706 MOVL CX, BX
10707 SUBL 12(SP), BX
10708 SHRL $0x06, BX
10709 LEAL 4(CX)(BX*1), BX // BX = CX + 4 + ((CX - next emit) >> 6): skip grows with the unmatched run
10710 CMPL BX, 8(SP)
10711 JAE emit_remainder_encodeSnappyBlockAsm // next position is at/past the search limit
10712 MOVQ (DX)(CX*1), SI // SI = 8 bytes at src[CX]
10713 MOVL BX, 20(SP) // remember the next search position
10714 MOVQ $0x0000cf1bbcdcbf9b, R8 // hash multiplier
10715 MOVQ SI, R9
10716 MOVQ SI, R10
10717 SHRQ $0x08, R10
10718 SHLQ $0x10, R9 // discard the top 2 bytes so 6 bytes feed the hash
10719 IMULQ R8, R9
10720 SHRQ $0x32, R9 // keep the top 14 bits: R9 = hash of src[CX:]
10721 SHLQ $0x10, R10
10722 IMULQ R8, R10
10723 SHRQ $0x32, R10 // R10 = hash of src[CX+1:]
10724 MOVL 24(SP)(R9*4), BX // BX = previous position stored for hash(CX)
10725 MOVL 24(SP)(R10*4), DI // DI = previous position stored for hash(CX+1)
10726 MOVL CX, 24(SP)(R9*4) // table[hash(CX)] = CX
10727 LEAL 1(CX), R9
10728 MOVL R9, 24(SP)(R10*4) // table[hash(CX+1)] = CX+1
10729 MOVQ SI, R9
10730 SHRQ $0x10, R9
10731 SHLQ $0x10, R9
10732 IMULQ R8, R9
10733 SHRQ $0x32, R9 // R9 = hash of src[CX+2:], consumed in no_repeat_found
10734 MOVL CX, R8
10735 SUBL 16(SP), R8
10736 MOVL 1(DX)(R8*1), R10 // R10 = 4 bytes at src[CX+1-repeat]
10737 MOVQ SI, R8
10738 SHRQ $0x08, R8
10739 CMPL R8, R10 // against the 4 bytes at src[CX+1]
10740 JNE no_repeat_found_encodeSnappyBlockAsm
10741 LEAL 1(CX), SI // SI = start of the repeat match (CX+1)
10742 MOVL 12(SP), BX // BX = next emit, lower bound for extending backwards
10743 MOVL SI, DI
10744 SUBL 16(SP), DI // DI = position the match refers back to
10745 JZ repeat_extend_back_end_encodeSnappyBlockAsm
10746
10747repeat_extend_back_loop_encodeSnappyBlockAsm:
10748 CMPL SI, BX
10749 JBE repeat_extend_back_end_encodeSnappyBlockAsm // never extend before next emit
10750 MOVB -1(DX)(DI*1), R8
10751 MOVB -1(DX)(SI*1), R9
10752 CMPB R8, R9
10753 JNE repeat_extend_back_end_encodeSnappyBlockAsm
10754 LEAL -1(SI), SI
10755 DECL DI
10756 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm // stop once the reference reaches src position 0
10757
10758repeat_extend_back_end_encodeSnappyBlockAsm:
10759 MOVL 12(SP), BX
10760 CMPL BX, SI
10761 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm // no pending literals
10762 MOVL SI, DI
10763 MOVL SI, 12(SP) // next emit = match start
10764 LEAQ (DX)(BX*1), R8 // R8 = address of the first literal byte
10765 SUBL BX, DI // DI = literal length
10766 LEAL -1(DI), BX // BX = length - 1, the value encoded in the header
10767 CMPL BX, $0x3c
10768 JB one_byte_repeat_emit_encodeSnappyBlockAsm
10769 CMPL BX, $0x00000100
10770 JB two_bytes_repeat_emit_encodeSnappyBlockAsm
10771 CMPL BX, $0x00010000
10772 JB three_bytes_repeat_emit_encodeSnappyBlockAsm
10773 CMPL BX, $0x01000000
10774 JB four_bytes_repeat_emit_encodeSnappyBlockAsm
10775 MOVB $0xfc, (AX) // tag 0xfc + 4 length bytes
10776 MOVL BX, 1(AX)
10777 ADDQ $0x05, AX
10778 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10779
10780four_bytes_repeat_emit_encodeSnappyBlockAsm:
10781 MOVL BX, R9
10782 SHRL $0x10, R9
10783 MOVB $0xf8, (AX) // tag 0xf8 + 3 length bytes
10784 MOVW BX, 1(AX)
10785 MOVB R9, 3(AX)
10786 ADDQ $0x04, AX
10787 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10788
10789three_bytes_repeat_emit_encodeSnappyBlockAsm:
10790 MOVB $0xf4, (AX) // tag 0xf4 + 2 length bytes
10791 MOVW BX, 1(AX)
10792 ADDQ $0x03, AX
10793 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10794
10795two_bytes_repeat_emit_encodeSnappyBlockAsm:
10796 MOVB $0xf0, (AX) // tag 0xf0 + 1 length byte
10797 MOVB BL, 1(AX)
10798 ADDQ $0x02, AX
10799 CMPL BX, $0x40
10800 JB memmove_repeat_emit_encodeSnappyBlockAsm // length <= 64 fits the short copy
10801 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10802
10803one_byte_repeat_emit_encodeSnappyBlockAsm:
10804 SHLB $0x02, BL // tag = (length - 1) << 2
10805 MOVB BL, (AX)
10806 ADDQ $0x01, AX
10807
10808memmove_repeat_emit_encodeSnappyBlockAsm:
10809 LEAQ (AX)(DI*1), BX // BX = dst pointer after the literal bytes
10810
10811 // genMemMoveShort
10812 CMPQ DI, $0x08
10813 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
10814 CMPQ DI, $0x10
10815 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
10816 CMPQ DI, $0x20
10817 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
10818 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
10819
10820emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
10821 MOVQ (R8), R9 // always moves 8 bytes; AX is later set to BX, so only DI bytes count
10822 MOVQ R9, (AX)
10823 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10824
10825emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
10826 MOVQ (R8), R9 // first 8 and last 8 bytes, possibly overlapping
10827 MOVQ -8(R8)(DI*1), R8
10828 MOVQ R9, (AX)
10829 MOVQ R8, -8(AX)(DI*1)
10830 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10831
10832emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
10833 MOVOU (R8), X0 // first 16 and last 16 bytes, possibly overlapping
10834 MOVOU -16(R8)(DI*1), X1
10835 MOVOU X0, (AX)
10836 MOVOU X1, -16(AX)(DI*1)
10837 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10838
10839emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
10840 MOVOU (R8), X0 // first 32 and last 32 bytes, possibly overlapping
10841 MOVOU 16(R8), X1
10842 MOVOU -32(R8)(DI*1), X2
10843 MOVOU -16(R8)(DI*1), X3
10844 MOVOU X0, (AX)
10845 MOVOU X1, 16(AX)
10846 MOVOU X2, -32(AX)(DI*1)
10847 MOVOU X3, -16(AX)(DI*1)
10848
10849memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
10850 MOVQ BX, AX // advance dst past the literals
10851 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
10852
10853memmove_long_repeat_emit_encodeSnappyBlockAsm:
10854 LEAQ (AX)(DI*1), BX // BX = dst pointer after the literal bytes
10855
10856 // genMemMoveLong
10857 MOVOU (R8), X0 // keep the first and last 32 source bytes in X0-X3; stored last
10858 MOVOU 16(R8), X1
10859 MOVOU -32(R8)(DI*1), X2
10860 MOVOU -16(R8)(DI*1), X3
10861 MOVQ DI, R10
10862 SHRQ $0x05, R10 // R10 = number of whole 32-byte chunks
10863 MOVQ AX, R9
10864 ANDL $0x0000001f, R9 // R9 = dst misalignment within 32 bytes
10865 MOVQ $0x00000040, R11
10866 SUBQ R9, R11 // R11 = 64 - misalignment, so AX + R11 - 32 is 32-byte aligned
10867 DECQ R10
10868 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10869 LEAQ -32(R8)(R11*1), R9
10870 LEAQ -32(AX)(R11*1), R12
10871
10872emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
10873 MOVOU (R9), X4
10874 MOVOU 16(R9), X5
10875 MOVOA X4, (R12) // aligned stores
10876 MOVOA X5, 16(R12)
10877 ADDQ $0x20, R12
10878 ADDQ $0x20, R9
10879 ADDQ $0x20, R11
10880 DECQ R10
10881 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
10882
10883emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
10884 MOVOU -32(R8)(R11*1), X4
10885 MOVOU -16(R8)(R11*1), X5
10886 MOVOA X4, -32(AX)(R11*1)
10887 MOVOA X5, -16(AX)(R11*1)
10888 ADDQ $0x20, R11
10889 CMPQ DI, R11
10890 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10891 MOVOU X0, (AX) // finish with unaligned stores of the saved head and tail
10892 MOVOU X1, 16(AX)
10893 MOVOU X2, -32(AX)(DI*1)
10894 MOVOU X3, -16(AX)(DI*1)
10895 MOVQ BX, AX // advance dst past the literals
10896
10897emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
10898 ADDL $0x05, CX // skip to CX+5: the 4 bytes at CX+1 are already known to match
10899 MOVL CX, BX
10900 SUBL 16(SP), BX
10901 MOVQ src_len+32(FP), DI
10902 SUBL CX, DI // DI = bytes remaining in src
10903 LEAQ (DX)(CX*1), R8 // R8 = &src[CX]
10904 LEAQ (DX)(BX*1), BX // BX = &src[CX - repeat]
10905
10906 // matchLen
10907 XORL R10, R10 // R10 = count of matching bytes
10908
10909matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm:
10910 CMPL DI, $0x10
10911 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm
10912 MOVQ (R8)(R10*1), R9
10913 MOVQ 8(R8)(R10*1), R11
10914 XORQ (BX)(R10*1), R9 // non-zero XOR means a difference in the first 8 bytes
10915 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
10916 XORQ 8(BX)(R10*1), R11
10917 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm
10918 LEAL -16(DI), DI
10919 LEAL 16(R10), R10
10920 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm
10921
10922matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm:
10923#ifdef GOAMD64_v3
10924 TZCNTQ R11, R11
10925
10926#else
10927 BSFQ R11, R11
10928
10929#endif
10930 SARQ $0x03, R11 // bit index of first difference -> byte count
10931 LEAL 8(R10)(R11*1), R10
10932 JMP repeat_extend_forward_end_encodeSnappyBlockAsm
10933
10934matchlen_match8_repeat_extend_encodeSnappyBlockAsm:
10935 CMPL DI, $0x08
10936 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm
10937 MOVQ (R8)(R10*1), R9
10938 XORQ (BX)(R10*1), R9
10939 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
10940 LEAL -8(DI), DI
10941 LEAL 8(R10), R10
10942 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm
10943
10944matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm:
10945#ifdef GOAMD64_v3
10946 TZCNTQ R9, R9
10947
10948#else
10949 BSFQ R9, R9
10950
10951#endif
10952 SARQ $0x03, R9 // bit index of first difference -> byte count
10953 LEAL (R10)(R9*1), R10
10954 JMP repeat_extend_forward_end_encodeSnappyBlockAsm
10955
10956matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
10957 CMPL DI, $0x04
10958 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm
10959 MOVL (R8)(R10*1), R9
10960 CMPL (BX)(R10*1), R9
10961 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
10962 LEAL -4(DI), DI
10963 LEAL 4(R10), R10
10964
10965matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
10966 CMPL DI, $0x01
10967 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm // exactly one byte left
10968 JB repeat_extend_forward_end_encodeSnappyBlockAsm // nothing left
10969 MOVW (R8)(R10*1), R9
10970 CMPW (BX)(R10*1), R9
10971 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
10972 LEAL 2(R10), R10
10973 SUBL $0x02, DI
10974 JZ repeat_extend_forward_end_encodeSnappyBlockAsm
10975
10976matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
10977 MOVB (R8)(R10*1), R9
10978 CMPB (BX)(R10*1), R9
10979 JNE repeat_extend_forward_end_encodeSnappyBlockAsm
10980 LEAL 1(R10), R10
10981
10982repeat_extend_forward_end_encodeSnappyBlockAsm:
10983 ADDL R10, CX // CX = end of the match
10984 MOVL CX, BX
10985 SUBL SI, BX // BX = match length (SI is the backward-extended start)
10986 MOVL 16(SP), SI // SI = offset to encode
10987
10988 // emitCopy
10989 CMPL SI, $0x00010000
10990 JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm // offset fits in 16 bits
10991
10992four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
10993 CMPL BX, $0x40
10994 JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
10995 MOVB $0xff, (AX) // tag 0xff + 32-bit offset; length is reduced by 64 below
10996 MOVL SI, 1(AX)
10997 LEAL -64(BX), BX
10998 ADDQ $0x05, AX
10999 CMPL BX, $0x04
11000 JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
11001 JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
11002
11003four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
11004 TESTL BX, BX
11005 JZ repeat_end_emit_encodeSnappyBlockAsm // nothing left to encode
11006 XORL DI, DI
11007 LEAL -1(DI)(BX*4), BX // tag = length*4 - 1 (DI is zero)
11008 MOVB BL, (AX)
11009 MOVL SI, 1(AX)
11010 ADDQ $0x05, AX
11011 JMP repeat_end_emit_encodeSnappyBlockAsm
11012
11013two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
11014 CMPL BX, $0x40
11015 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
11016 MOVB $0xee, (AX) // tag 0xee + 16-bit offset; length is reduced by 60 below
11017 MOVW SI, 1(AX)
11018 LEAL -60(BX), BX
11019 ADDQ $0x03, AX
11020 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
11021
11022two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
11023 MOVL BX, DI
11024 SHLL $0x02, DI // DI = length << 2
11025 CMPL BX, $0x0c
11026 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // length >= 12 uses the 3-byte form
11027 CMPL SI, $0x00000800
11028 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm // offset >= 2048 uses the 3-byte form
11029 LEAL -15(DI), DI // 2-byte form: tag base = length*4 - 15
11030 MOVB SI, 1(AX) // low 8 bits of the offset
11031 SHRL $0x08, SI
11032 SHLL $0x05, SI
11033 ORL SI, DI // remaining offset bits go into tag bits 5 and up
11034 MOVB DI, (AX)
11035 ADDQ $0x02, AX
11036 JMP repeat_end_emit_encodeSnappyBlockAsm
11037
11038emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
11039 LEAL -2(DI), DI // tag = length*4 - 2
11040 MOVB DI, (AX)
11041 MOVW SI, 1(AX)
11042 ADDQ $0x03, AX
11043
11044repeat_end_emit_encodeSnappyBlockAsm:
11045 MOVL CX, 12(SP) // next emit = end of the match
11046 JMP search_loop_encodeSnappyBlockAsm
11047
11048no_repeat_found_encodeSnappyBlockAsm:
11049 CMPL (DX)(BX*1), SI // candidate for CX: compare 4 bytes
11050 JEQ candidate_match_encodeSnappyBlockAsm
11051 SHRQ $0x08, SI // SI now starts at src[CX+1]
11052 MOVL 24(SP)(R9*4), BX // BX = candidate for CX+2
11053 LEAL 2(CX), R8
11054 CMPL (DX)(DI*1), SI // candidate for CX+1
11055 JEQ candidate2_match_encodeSnappyBlockAsm
11056 MOVL R8, 24(SP)(R9*4) // table[hash(CX+2)] = CX+2
11057 SHRQ $0x08, SI // SI now starts at src[CX+2]
11058 CMPL (DX)(BX*1), SI // candidate for CX+2
11059 JEQ candidate3_match_encodeSnappyBlockAsm
11060 MOVL 20(SP), CX // no match: continue at the saved next search position
11061 JMP search_loop_encodeSnappyBlockAsm
11062
11063candidate3_match_encodeSnappyBlockAsm:
11064 ADDL $0x02, CX // match starts at CX+2
11065 JMP candidate_match_encodeSnappyBlockAsm
11066
11067candidate2_match_encodeSnappyBlockAsm:
11068 MOVL R8, 24(SP)(R9*4) // table[hash(CX+2)] = CX+2
11069 INCL CX // match starts at CX+1
11070 MOVL DI, BX // BX = matching candidate position
11071
11072candidate_match_encodeSnappyBlockAsm:
11073 MOVL 12(SP), SI // SI = next emit, lower bound for extending backwards
11074 TESTL BX, BX
11075 JZ match_extend_back_end_encodeSnappyBlockAsm // candidate at position 0 cannot extend back
11076
11077match_extend_back_loop_encodeSnappyBlockAsm:
11078 CMPL CX, SI
11079 JBE match_extend_back_end_encodeSnappyBlockAsm
11080 MOVB -1(DX)(BX*1), DI
11081 MOVB -1(DX)(CX*1), R8
11082 CMPB DI, R8
11083 JNE match_extend_back_end_encodeSnappyBlockAsm
11084 LEAL -1(CX), CX
11085 DECL BX
11086 JZ match_extend_back_end_encodeSnappyBlockAsm
11087 JMP match_extend_back_loop_encodeSnappyBlockAsm
11088
11089match_extend_back_end_encodeSnappyBlockAsm:
11090 MOVL CX, SI
11091 SUBL 12(SP), SI
11092 LEAQ 5(AX)(SI*1), SI // dst pointer after pending literals plus 5 more bytes
11093 CMPQ SI, (SP)
11094 JB match_dst_size_check_encodeSnappyBlockAsm
11095 MOVQ $0x00000000, ret+48(FP) // would reach the destination limit: return 0
11096 RET
11097
11098match_dst_size_check_encodeSnappyBlockAsm:
11099 MOVL CX, SI
11100 MOVL 12(SP), DI
11101 CMPL DI, SI
11102 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm // no pending literals
11103 MOVL SI, R8
11104 MOVL SI, 12(SP) // next emit = match start
11105 LEAQ (DX)(DI*1), SI // SI = address of the first literal byte
11106 SUBL DI, R8 // R8 = literal length
11107 LEAL -1(R8), DI // DI = length - 1, the value encoded in the header
11108 CMPL DI, $0x3c
11109 JB one_byte_match_emit_encodeSnappyBlockAsm
11110 CMPL DI, $0x00000100
11111 JB two_bytes_match_emit_encodeSnappyBlockAsm
11112 CMPL DI, $0x00010000
11113 JB three_bytes_match_emit_encodeSnappyBlockAsm
11114 CMPL DI, $0x01000000
11115 JB four_bytes_match_emit_encodeSnappyBlockAsm
11116 MOVB $0xfc, (AX) // tag 0xfc + 4 length bytes
11117 MOVL DI, 1(AX)
11118 ADDQ $0x05, AX
11119 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11120
11121four_bytes_match_emit_encodeSnappyBlockAsm:
11122 MOVL DI, R9
11123 SHRL $0x10, R9
11124 MOVB $0xf8, (AX) // tag 0xf8 + 3 length bytes
11125 MOVW DI, 1(AX)
11126 MOVB R9, 3(AX)
11127 ADDQ $0x04, AX
11128 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11129
11130three_bytes_match_emit_encodeSnappyBlockAsm:
11131 MOVB $0xf4, (AX) // tag 0xf4 + 2 length bytes
11132 MOVW DI, 1(AX)
11133 ADDQ $0x03, AX
11134 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11135
11136two_bytes_match_emit_encodeSnappyBlockAsm:
11137 MOVB $0xf0, (AX) // tag 0xf0 + 1 length byte
11138 MOVB DI, 1(AX)
11139 ADDQ $0x02, AX
11140 CMPL DI, $0x40
11141 JB memmove_match_emit_encodeSnappyBlockAsm // length <= 64 fits the short copy
11142 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11143
11144one_byte_match_emit_encodeSnappyBlockAsm:
11145 SHLB $0x02, DI // tag = (length - 1) << 2
11146 MOVB DI, (AX)
11147 ADDQ $0x01, AX
11148
11149memmove_match_emit_encodeSnappyBlockAsm:
11150 LEAQ (AX)(R8*1), DI // DI = dst pointer after the literal bytes
11151
11152 // genMemMoveShort
11153 CMPQ R8, $0x08
11154 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
11155 CMPQ R8, $0x10
11156 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
11157 CMPQ R8, $0x20
11158 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
11159 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
11160
11161emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
11162 MOVQ (SI), R9 // always moves 8 bytes; AX is later set to DI, so only R8 bytes count
11163 MOVQ R9, (AX)
11164 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11165
11166emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
11167 MOVQ (SI), R9
11168 MOVQ -8(SI)(R8*1), SI
11169 MOVQ R9, (AX)
11170 MOVQ SI, -8(AX)(R8*1)
11171 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11172
11173emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
11174 MOVOU (SI), X0
11175 MOVOU -16(SI)(R8*1), X1
11176 MOVOU X0, (AX)
11177 MOVOU X1, -16(AX)(R8*1)
11178 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11179
11180emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
11181 MOVOU (SI), X0
11182 MOVOU 16(SI), X1
11183 MOVOU -32(SI)(R8*1), X2
11184 MOVOU -16(SI)(R8*1), X3
11185 MOVOU X0, (AX)
11186 MOVOU X1, 16(AX)
11187 MOVOU X2, -32(AX)(R8*1)
11188 MOVOU X3, -16(AX)(R8*1)
11189
11190memmove_end_copy_match_emit_encodeSnappyBlockAsm:
11191 MOVQ DI, AX // advance dst past the literals
11192 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
11193
11194memmove_long_match_emit_encodeSnappyBlockAsm:
11195 LEAQ (AX)(R8*1), DI // DI = dst pointer after the literal bytes
11196
11197 // genMemMoveLong
11198 MOVOU (SI), X0 // keep the first and last 32 source bytes in X0-X3; stored last
11199 MOVOU 16(SI), X1
11200 MOVOU -32(SI)(R8*1), X2
11201 MOVOU -16(SI)(R8*1), X3
11202 MOVQ R8, R10
11203 SHRQ $0x05, R10 // R10 = number of whole 32-byte chunks
11204 MOVQ AX, R9
11205 ANDL $0x0000001f, R9 // R9 = dst misalignment within 32 bytes
11206 MOVQ $0x00000040, R11
11207 SUBQ R9, R11 // R11 = 64 - misalignment
11208 DECQ R10
11209 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11210 LEAQ -32(SI)(R11*1), R9
11211 LEAQ -32(AX)(R11*1), R12
11212
11213emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
11214 MOVOU (R9), X4
11215 MOVOU 16(R9), X5
11216 MOVOA X4, (R12) // aligned stores
11217 MOVOA X5, 16(R12)
11218 ADDQ $0x20, R12
11219 ADDQ $0x20, R9
11220 ADDQ $0x20, R11
11221 DECQ R10
11222 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
11223
11224emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
11225 MOVOU -32(SI)(R11*1), X4
11226 MOVOU -16(SI)(R11*1), X5
11227 MOVOA X4, -32(AX)(R11*1)
11228 MOVOA X5, -16(AX)(R11*1)
11229 ADDQ $0x20, R11
11230 CMPQ R8, R11
11231 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11232 MOVOU X0, (AX) // finish with unaligned stores of the saved head and tail
11233 MOVOU X1, 16(AX)
11234 MOVOU X2, -32(AX)(R8*1)
11235 MOVOU X3, -16(AX)(R8*1)
11236 MOVQ DI, AX // advance dst past the literals
11237
11238emit_literal_done_match_emit_encodeSnappyBlockAsm:
11239match_nolit_loop_encodeSnappyBlockAsm:
11240 MOVL CX, SI
11241 SUBL BX, SI // SI = offset between current position and candidate
11242 MOVL SI, 16(SP) // becomes the new repeat offset
11243 ADDL $0x04, CX // the first 4 bytes are already known to match
11244 ADDL $0x04, BX
11245 MOVQ src_len+32(FP), SI
11246 SUBL CX, SI // SI = bytes remaining in src
11247 LEAQ (DX)(CX*1), DI // DI = &src[CX]
11248 LEAQ (DX)(BX*1), BX // BX = &src[candidate + 4]
11249
11250 // matchLen
11251 XORL R9, R9 // R9 = count of additional matching bytes
11252
11253matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm:
11254 CMPL SI, $0x10
11255 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm
11256 MOVQ (DI)(R9*1), R8
11257 MOVQ 8(DI)(R9*1), R10
11258 XORQ (BX)(R9*1), R8
11259 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
11260 XORQ 8(BX)(R9*1), R10
11261 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm
11262 LEAL -16(SI), SI
11263 LEAL 16(R9), R9
11264 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm
11265
11266matchlen_bsf_16match_nolit_encodeSnappyBlockAsm:
11267#ifdef GOAMD64_v3
11268 TZCNTQ R10, R10
11269
11270#else
11271 BSFQ R10, R10
11272
11273#endif
11274 SARQ $0x03, R10 // bit index of first difference -> byte count
11275 LEAL 8(R9)(R10*1), R9
11276 JMP match_nolit_end_encodeSnappyBlockAsm
11277
11278matchlen_match8_match_nolit_encodeSnappyBlockAsm:
11279 CMPL SI, $0x08
11280 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm
11281 MOVQ (DI)(R9*1), R8
11282 XORQ (BX)(R9*1), R8
11283 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
11284 LEAL -8(SI), SI
11285 LEAL 8(R9), R9
11286 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm
11287
11288matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm:
11289#ifdef GOAMD64_v3
11290 TZCNTQ R8, R8
11291
11292#else
11293 BSFQ R8, R8
11294
11295#endif
11296 SARQ $0x03, R8 // bit index of first difference -> byte count
11297 LEAL (R9)(R8*1), R9
11298 JMP match_nolit_end_encodeSnappyBlockAsm
11299
11300matchlen_match4_match_nolit_encodeSnappyBlockAsm:
11301 CMPL SI, $0x04
11302 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm
11303 MOVL (DI)(R9*1), R8
11304 CMPL (BX)(R9*1), R8
11305 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
11306 LEAL -4(SI), SI
11307 LEAL 4(R9), R9
11308
11309matchlen_match2_match_nolit_encodeSnappyBlockAsm:
11310 CMPL SI, $0x01
11311 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm // exactly one byte left
11312 JB match_nolit_end_encodeSnappyBlockAsm // nothing left
11313 MOVW (DI)(R9*1), R8
11314 CMPW (BX)(R9*1), R8
11315 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
11316 LEAL 2(R9), R9
11317 SUBL $0x02, SI
11318 JZ match_nolit_end_encodeSnappyBlockAsm
11319
11320matchlen_match1_match_nolit_encodeSnappyBlockAsm:
11321 MOVB (DI)(R9*1), R8
11322 CMPB (BX)(R9*1), R8
11323 JNE match_nolit_end_encodeSnappyBlockAsm
11324 LEAL 1(R9), R9
11325
11326match_nolit_end_encodeSnappyBlockAsm:
11327 ADDL R9, CX // CX = end of the match
11328 MOVL 16(SP), BX // BX = offset to encode
11329 ADDL $0x04, R9 // R9 = total match length including the first 4 bytes
11330 MOVL CX, 12(SP) // next emit = end of the match
11331
11332 // emitCopy
11333 CMPL BX, $0x00010000
11334 JB two_byte_offset_match_nolit_encodeSnappyBlockAsm // offset fits in 16 bits
11335
11336four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
11337 CMPL R9, $0x40
11338 JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
11339 MOVB $0xff, (AX) // tag 0xff + 32-bit offset; length is reduced by 64 below
11340 MOVL BX, 1(AX)
11341 LEAL -64(R9), R9
11342 ADDQ $0x05, AX
11343 CMPL R9, $0x04
11344 JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm
11345 JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
11346
11347four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
11348 TESTL R9, R9
11349 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm // nothing left to encode
11350 XORL SI, SI
11351 LEAL -1(SI)(R9*4), R9 // tag = length*4 - 1 (SI is zero)
11352 MOVB R9, (AX)
11353 MOVL BX, 1(AX)
11354 ADDQ $0x05, AX
11355 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
11356
11357two_byte_offset_match_nolit_encodeSnappyBlockAsm:
11358 CMPL R9, $0x40
11359 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
11360 MOVB $0xee, (AX) // tag 0xee + 16-bit offset; length is reduced by 60 below
11361 MOVW BX, 1(AX)
11362 LEAL -60(R9), R9
11363 ADDQ $0x03, AX
11364 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
11365
11366two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
11367 MOVL R9, SI
11368 SHLL $0x02, SI // SI = length << 2
11369 CMPL R9, $0x0c
11370 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm // length >= 12 uses the 3-byte form
11371 CMPL BX, $0x00000800
11372 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm // offset >= 2048 uses the 3-byte form
11373 LEAL -15(SI), SI // 2-byte form: tag base = length*4 - 15
11374 MOVB BL, 1(AX) // low 8 bits of the offset
11375 SHRL $0x08, BX
11376 SHLL $0x05, BX
11377 ORL BX, SI // remaining offset bits go into tag bits 5 and up
11378 MOVB SI, (AX)
11379 ADDQ $0x02, AX
11380 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
11381
11382emit_copy_three_match_nolit_encodeSnappyBlockAsm:
11383 LEAL -2(SI), SI // tag = length*4 - 2
11384 MOVB SI, (AX)
11385 MOVW BX, 1(AX)
11386 ADDQ $0x03, AX
11387
11388match_nolit_emitcopy_end_encodeSnappyBlockAsm:
11389 CMPL CX, 8(SP)
11390 JAE emit_remainder_encodeSnappyBlockAsm // at/past the search limit
11391 MOVQ -2(DX)(CX*1), SI // SI = 8 bytes at src[CX-2]
11392 CMPQ AX, (SP)
11393 JB match_nolit_dst_ok_encodeSnappyBlockAsm
11394 MOVQ $0x00000000, ret+48(FP) // reached the destination limit: return 0
11395 RET
11396
11397match_nolit_dst_ok_encodeSnappyBlockAsm:
11398 MOVQ $0x0000cf1bbcdcbf9b, R8 // hash multiplier
11399 MOVQ SI, DI
11400 SHRQ $0x10, SI // SI now starts at src[CX]
11401 MOVQ SI, BX
11402 SHLQ $0x10, DI
11403 IMULQ R8, DI
11404 SHRQ $0x32, DI // DI = hash of src[CX-2:]
11405 SHLQ $0x10, BX
11406 IMULQ R8, BX
11407 SHRQ $0x32, BX // BX = hash of src[CX:]
11408 LEAL -2(CX), R8
11409 LEAQ 24(SP)(BX*4), R9 // R9 = address of the table slot for hash(CX)
11410 MOVL (R9), BX // BX = previous position stored for hash(CX)
11411 MOVL R8, 24(SP)(DI*4) // table[hash(CX-2)] = CX-2
11412 MOVL CX, (R9) // table[hash(CX)] = CX
11413 CMPL (DX)(BX*1), SI // does the candidate match the 4 bytes at src[CX]?
11414 JEQ match_nolit_loop_encodeSnappyBlockAsm // yes: encode another copy with no literals in between
11415 INCL CX
11416 JMP search_loop_encodeSnappyBlockAsm
11417
11418emit_remainder_encodeSnappyBlockAsm:
11419 MOVQ src_len+32(FP), CX
11420 SUBL 12(SP), CX // CX = number of trailing bytes not yet emitted
11421 LEAQ 5(AX)(CX*1), CX // dst pointer after those bytes plus 5 more
11422 CMPQ CX, (SP)
11423 JB emit_remainder_ok_encodeSnappyBlockAsm
11424 MOVQ $0x00000000, ret+48(FP) // would reach the destination limit: return 0
11425 RET
11426
11427emit_remainder_ok_encodeSnappyBlockAsm:
11428 MOVQ src_len+32(FP), CX
11429 MOVL 12(SP), BX
11430 CMPL BX, CX
11431 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm // nothing left to emit
11432 MOVL CX, SI
11433 MOVL CX, 12(SP) // next emit = src_len
11434 LEAQ (DX)(BX*1), CX // CX = address of the first literal byte
11435 SUBL BX, SI // SI = literal length
11436 LEAL -1(SI), DX // DX = length - 1 (src base is no longer needed)
11437 CMPL DX, $0x3c
11438 JB one_byte_emit_remainder_encodeSnappyBlockAsm
11439 CMPL DX, $0x00000100
11440 JB two_bytes_emit_remainder_encodeSnappyBlockAsm
11441 CMPL DX, $0x00010000
11442 JB three_bytes_emit_remainder_encodeSnappyBlockAsm
11443 CMPL DX, $0x01000000
11444 JB four_bytes_emit_remainder_encodeSnappyBlockAsm
11445 MOVB $0xfc, (AX) // tag 0xfc + 4 length bytes
11446 MOVL DX, 1(AX)
11447 ADDQ $0x05, AX
11448 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11449
11450four_bytes_emit_remainder_encodeSnappyBlockAsm:
11451 MOVL DX, BX
11452 SHRL $0x10, BX
11453 MOVB $0xf8, (AX) // tag 0xf8 + 3 length bytes
11454 MOVW DX, 1(AX)
11455 MOVB BL, 3(AX)
11456 ADDQ $0x04, AX
11457 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11458
11459three_bytes_emit_remainder_encodeSnappyBlockAsm:
11460 MOVB $0xf4, (AX) // tag 0xf4 + 2 length bytes
11461 MOVW DX, 1(AX)
11462 ADDQ $0x03, AX
11463 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11464
11465two_bytes_emit_remainder_encodeSnappyBlockAsm:
11466 MOVB $0xf0, (AX) // tag 0xf0 + 1 length byte
11467 MOVB DL, 1(AX)
11468 ADDQ $0x02, AX
11469 CMPL DX, $0x40
11470 JB memmove_emit_remainder_encodeSnappyBlockAsm // length <= 64 fits the short copy
11471 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11472
11473one_byte_emit_remainder_encodeSnappyBlockAsm:
11474 SHLB $0x02, DL // tag = (length - 1) << 2
11475 MOVB DL, (AX)
11476 ADDQ $0x01, AX
11477
11478memmove_emit_remainder_encodeSnappyBlockAsm:
11479 LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal bytes
11480 MOVL SI, BX // BX = literal length
11481
11482 // genMemMoveShort
11483 CMPQ BX, $0x03 // unlike the mid-stream copies, this one is size-exact
11484 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
11485 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
11486 CMPQ BX, $0x08
11487 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
11488 CMPQ BX, $0x10
11489 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
11490 CMPQ BX, $0x20
11491 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
11492 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
11493
11494emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
11495 MOVB (CX), SI // first and last byte (the same byte when length is 1)
11496 MOVB -1(CX)(BX*1), CL
11497 MOVB SI, (AX)
11498 MOVB CL, -1(AX)(BX*1)
11499 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11500
11501emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
11502 MOVW (CX), SI
11503 MOVB 2(CX), CL
11504 MOVW SI, (AX)
11505 MOVB CL, 2(AX)
11506 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11507
11508emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
11509 MOVL (CX), SI // first 4 and last 4 bytes, possibly overlapping
11510 MOVL -4(CX)(BX*1), CX
11511 MOVL SI, (AX)
11512 MOVL CX, -4(AX)(BX*1)
11513 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11514
11515emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
11516 MOVQ (CX), SI
11517 MOVQ -8(CX)(BX*1), CX
11518 MOVQ SI, (AX)
11519 MOVQ CX, -8(AX)(BX*1)
11520 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11521
11522emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
11523 MOVOU (CX), X0
11524 MOVOU -16(CX)(BX*1), X1
11525 MOVOU X0, (AX)
11526 MOVOU X1, -16(AX)(BX*1)
11527 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11528
11529emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
11530 MOVOU (CX), X0
11531 MOVOU 16(CX), X1
11532 MOVOU -32(CX)(BX*1), X2
11533 MOVOU -16(CX)(BX*1), X3
11534 MOVOU X0, (AX)
11535 MOVOU X1, 16(AX)
11536 MOVOU X2, -32(AX)(BX*1)
11537 MOVOU X3, -16(AX)(BX*1)
11538
11539memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
11540 MOVQ DX, AX // advance dst past the literals
11541 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
11542
11543memmove_long_emit_remainder_encodeSnappyBlockAsm:
11544 LEAQ (AX)(SI*1), DX // DX = dst pointer after the literal bytes
11545 MOVL SI, BX // BX = literal length
11546
11547 // genMemMoveLong
11548 MOVOU (CX), X0 // keep the first and last 32 source bytes in X0-X3; stored last
11549 MOVOU 16(CX), X1
11550 MOVOU -32(CX)(BX*1), X2
11551 MOVOU -16(CX)(BX*1), X3
11552 MOVQ BX, DI
11553 SHRQ $0x05, DI // DI = number of whole 32-byte chunks
11554 MOVQ AX, SI
11555 ANDL $0x0000001f, SI // SI = dst misalignment within 32 bytes
11556 MOVQ $0x00000040, R8
11557 SUBQ SI, R8 // R8 = 64 - misalignment
11558 DECQ DI
11559 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11560 LEAQ -32(CX)(R8*1), SI
11561 LEAQ -32(AX)(R8*1), R9
11562
11563emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
11564 MOVOU (SI), X4
11565 MOVOU 16(SI), X5
11566 MOVOA X4, (R9) // aligned stores
11567 MOVOA X5, 16(R9)
11568 ADDQ $0x20, R9
11569 ADDQ $0x20, SI
11570 ADDQ $0x20, R8
11571 DECQ DI
11572 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
11573
11574emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
11575 MOVOU -32(CX)(R8*1), X4
11576 MOVOU -16(CX)(R8*1), X5
11577 MOVOA X4, -32(AX)(R8*1)
11578 MOVOA X5, -16(AX)(R8*1)
11579 ADDQ $0x20, R8
11580 CMPQ BX, R8
11581 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11582 MOVOU X0, (AX) // finish with unaligned stores of the saved head and tail
11583 MOVOU X1, 16(AX)
11584 MOVOU X2, -32(AX)(BX*1)
11585 MOVOU X3, -16(AX)(BX*1)
11586 MOVQ DX, AX // advance dst past the literals
11587
11588emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
11589 MOVQ dst_base+0(FP), CX
11590 SUBQ CX, AX // bytes written = dst write pointer - dst base
11591 MOVQ AX, ret+48(FP)
11592 RET
11593
11594// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
11595// Requires: BMI, SSE2
11596TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
11597 MOVQ dst_base+0(FP), AX
11598 MOVQ $0x00000200, CX
11599 LEAQ 24(SP), DX
11600 PXOR X0, X0
11601
11602zero_loop_encodeSnappyBlockAsm64K:
11603 MOVOU X0, (DX)
11604 MOVOU X0, 16(DX)
11605 MOVOU X0, 32(DX)
11606 MOVOU X0, 48(DX)
11607 MOVOU X0, 64(DX)
11608 MOVOU X0, 80(DX)
11609 MOVOU X0, 96(DX)
11610 MOVOU X0, 112(DX)
11611 ADDQ $0x80, DX
11612 DECQ CX
11613 JNZ zero_loop_encodeSnappyBlockAsm64K
11614 MOVL $0x00000000, 12(SP)
11615 MOVQ src_len+32(FP), CX
11616 LEAQ -9(CX), DX
11617 LEAQ -8(CX), BX
11618 MOVL BX, 8(SP)
11619 SHRQ $0x05, CX
11620 SUBL CX, DX
11621 LEAQ (AX)(DX*1), DX
11622 MOVQ DX, (SP)
11623 MOVL $0x00000001, CX
11624 MOVL CX, 16(SP)
11625 MOVQ src_base+24(FP), DX
11626
11627search_loop_encodeSnappyBlockAsm64K:
11628 MOVL CX, BX
11629 SUBL 12(SP), BX
11630 SHRL $0x06, BX
11631 LEAL 4(CX)(BX*1), BX
11632 CMPL BX, 8(SP)
11633 JAE emit_remainder_encodeSnappyBlockAsm64K
11634 MOVQ (DX)(CX*1), SI
11635 MOVL BX, 20(SP)
11636 MOVQ $0x0000cf1bbcdcbf9b, R8
11637 MOVQ SI, R9
11638 MOVQ SI, R10
11639 SHRQ $0x08, R10
11640 SHLQ $0x10, R9
11641 IMULQ R8, R9
11642 SHRQ $0x32, R9
11643 SHLQ $0x10, R10
11644 IMULQ R8, R10
11645 SHRQ $0x32, R10
11646 MOVL 24(SP)(R9*4), BX
11647 MOVL 24(SP)(R10*4), DI
11648 MOVL CX, 24(SP)(R9*4)
11649 LEAL 1(CX), R9
11650 MOVL R9, 24(SP)(R10*4)
11651 MOVQ SI, R9
11652 SHRQ $0x10, R9
11653 SHLQ $0x10, R9
11654 IMULQ R8, R9
11655 SHRQ $0x32, R9
11656 MOVL CX, R8
11657 SUBL 16(SP), R8
11658 MOVL 1(DX)(R8*1), R10
11659 MOVQ SI, R8
11660 SHRQ $0x08, R8
11661 CMPL R8, R10
11662 JNE no_repeat_found_encodeSnappyBlockAsm64K
11663 LEAL 1(CX), SI
11664 MOVL 12(SP), BX
11665 MOVL SI, DI
11666 SUBL 16(SP), DI
11667 JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
11668
11669repeat_extend_back_loop_encodeSnappyBlockAsm64K:
11670 CMPL SI, BX
11671 JBE repeat_extend_back_end_encodeSnappyBlockAsm64K
11672 MOVB -1(DX)(DI*1), R8
11673 MOVB -1(DX)(SI*1), R9
11674 CMPB R8, R9
11675 JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
11676 LEAL -1(SI), SI
11677 DECL DI
11678 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
11679
11680repeat_extend_back_end_encodeSnappyBlockAsm64K:
11681 MOVL 12(SP), BX
11682 CMPL BX, SI
11683 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
11684 MOVL SI, DI
11685 MOVL SI, 12(SP)
11686 LEAQ (DX)(BX*1), R8
11687 SUBL BX, DI
11688 LEAL -1(DI), BX
11689 CMPL BX, $0x3c
11690 JB one_byte_repeat_emit_encodeSnappyBlockAsm64K
11691 CMPL BX, $0x00000100
11692 JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K
11693 JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K
11694
11695three_bytes_repeat_emit_encodeSnappyBlockAsm64K:
11696 MOVB $0xf4, (AX)
11697 MOVW BX, 1(AX)
11698 ADDQ $0x03, AX
11699 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
11700
11701two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
11702 MOVB $0xf0, (AX)
11703 MOVB BL, 1(AX)
11704 ADDQ $0x02, AX
11705 CMPL BX, $0x40
11706 JB memmove_repeat_emit_encodeSnappyBlockAsm64K
11707 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
11708
11709one_byte_repeat_emit_encodeSnappyBlockAsm64K:
11710 SHLB $0x02, BL
11711 MOVB BL, (AX)
11712 ADDQ $0x01, AX
11713
11714memmove_repeat_emit_encodeSnappyBlockAsm64K:
11715 LEAQ (AX)(DI*1), BX
11716
11717 // genMemMoveShort
11718 CMPQ DI, $0x08
11719 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
11720 CMPQ DI, $0x10
11721 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
11722 CMPQ DI, $0x20
11723 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
11724 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
11725
11726emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
11727 MOVQ (R8), R9
11728 MOVQ R9, (AX)
11729 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11730
11731emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
11732 MOVQ (R8), R9
11733 MOVQ -8(R8)(DI*1), R8
11734 MOVQ R9, (AX)
11735 MOVQ R8, -8(AX)(DI*1)
11736 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11737
11738emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
11739 MOVOU (R8), X0
11740 MOVOU -16(R8)(DI*1), X1
11741 MOVOU X0, (AX)
11742 MOVOU X1, -16(AX)(DI*1)
11743 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11744
11745emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
11746 MOVOU (R8), X0
11747 MOVOU 16(R8), X1
11748 MOVOU -32(R8)(DI*1), X2
11749 MOVOU -16(R8)(DI*1), X3
11750 MOVOU X0, (AX)
11751 MOVOU X1, 16(AX)
11752 MOVOU X2, -32(AX)(DI*1)
11753 MOVOU X3, -16(AX)(DI*1)
11754
11755memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
11756 MOVQ BX, AX
11757 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
11758
11759memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
11760 LEAQ (AX)(DI*1), BX
11761
11762 // genMemMoveLong
11763 MOVOU (R8), X0
11764 MOVOU 16(R8), X1
11765 MOVOU -32(R8)(DI*1), X2
11766 MOVOU -16(R8)(DI*1), X3
11767 MOVQ DI, R10
11768 SHRQ $0x05, R10
11769 MOVQ AX, R9
11770 ANDL $0x0000001f, R9
11771 MOVQ $0x00000040, R11
11772 SUBQ R9, R11
11773 DECQ R10
11774 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
11775 LEAQ -32(R8)(R11*1), R9
11776 LEAQ -32(AX)(R11*1), R12
11777
11778emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
11779 MOVOU (R9), X4
11780 MOVOU 16(R9), X5
11781 MOVOA X4, (R12)
11782 MOVOA X5, 16(R12)
11783 ADDQ $0x20, R12
11784 ADDQ $0x20, R9
11785 ADDQ $0x20, R11
11786 DECQ R10
11787 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
11788
11789emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
11790 MOVOU -32(R8)(R11*1), X4
11791 MOVOU -16(R8)(R11*1), X5
11792 MOVOA X4, -32(AX)(R11*1)
11793 MOVOA X5, -16(AX)(R11*1)
11794 ADDQ $0x20, R11
11795 CMPQ DI, R11
11796 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
11797 MOVOU X0, (AX)
11798 MOVOU X1, 16(AX)
11799 MOVOU X2, -32(AX)(DI*1)
11800 MOVOU X3, -16(AX)(DI*1)
11801 MOVQ BX, AX
11802
11803emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
11804 ADDL $0x05, CX
11805 MOVL CX, BX
11806 SUBL 16(SP), BX
11807 MOVQ src_len+32(FP), DI
11808 SUBL CX, DI
11809 LEAQ (DX)(CX*1), R8
11810 LEAQ (DX)(BX*1), BX
11811
11812 // matchLen
11813 XORL R10, R10
11814
11815matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K:
11816 CMPL DI, $0x10
11817 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K
11818 MOVQ (R8)(R10*1), R9
11819 MOVQ 8(R8)(R10*1), R11
11820 XORQ (BX)(R10*1), R9
11821 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
11822 XORQ 8(BX)(R10*1), R11
11823 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K
11824 LEAL -16(DI), DI
11825 LEAL 16(R10), R10
11826 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K
11827
11828matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K:
11829#ifdef GOAMD64_v3
11830 TZCNTQ R11, R11
11831
11832#else
11833 BSFQ R11, R11
11834
11835#endif
11836 SARQ $0x03, R11
11837 LEAL 8(R10)(R11*1), R10
11838 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
11839
11840matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K:
11841 CMPL DI, $0x08
11842 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
11843 MOVQ (R8)(R10*1), R9
11844 XORQ (BX)(R10*1), R9
11845 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
11846 LEAL -8(DI), DI
11847 LEAL 8(R10), R10
11848 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
11849
11850matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K:
11851#ifdef GOAMD64_v3
11852 TZCNTQ R9, R9
11853
11854#else
11855 BSFQ R9, R9
11856
11857#endif
11858 SARQ $0x03, R9
11859 LEAL (R10)(R9*1), R10
11860 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
11861
11862matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
11863 CMPL DI, $0x04
11864 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
11865 MOVL (R8)(R10*1), R9
11866 CMPL (BX)(R10*1), R9
11867 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
11868 LEAL -4(DI), DI
11869 LEAL 4(R10), R10
11870
11871matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
11872 CMPL DI, $0x01
11873 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
11874 JB repeat_extend_forward_end_encodeSnappyBlockAsm64K
11875 MOVW (R8)(R10*1), R9
11876 CMPW (BX)(R10*1), R9
11877 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
11878 LEAL 2(R10), R10
11879 SUBL $0x02, DI
11880 JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
11881
11882matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
11883 MOVB (R8)(R10*1), R9
11884 CMPB (BX)(R10*1), R9
11885 JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
11886 LEAL 1(R10), R10
11887
11888repeat_extend_forward_end_encodeSnappyBlockAsm64K:
11889 ADDL R10, CX
11890 MOVL CX, BX
11891 SUBL SI, BX
11892 MOVL 16(SP), SI
11893
11894 // emitCopy
11895two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
11896 CMPL BX, $0x40
11897 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
11898 MOVB $0xee, (AX)
11899 MOVW SI, 1(AX)
11900 LEAL -60(BX), BX
11901 ADDQ $0x03, AX
11902 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
11903
11904two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
11905 MOVL BX, DI
11906 SHLL $0x02, DI
11907 CMPL BX, $0x0c
11908 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
11909 CMPL SI, $0x00000800
11910 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
11911 LEAL -15(DI), DI
11912 MOVB SI, 1(AX)
11913 SHRL $0x08, SI
11914 SHLL $0x05, SI
11915 ORL SI, DI
11916 MOVB DI, (AX)
11917 ADDQ $0x02, AX
11918 JMP repeat_end_emit_encodeSnappyBlockAsm64K
11919
11920emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
11921 LEAL -2(DI), DI
11922 MOVB DI, (AX)
11923 MOVW SI, 1(AX)
11924 ADDQ $0x03, AX
11925
11926repeat_end_emit_encodeSnappyBlockAsm64K:
11927 MOVL CX, 12(SP)
11928 JMP search_loop_encodeSnappyBlockAsm64K
11929
11930no_repeat_found_encodeSnappyBlockAsm64K:
11931 CMPL (DX)(BX*1), SI
11932 JEQ candidate_match_encodeSnappyBlockAsm64K
11933 SHRQ $0x08, SI
11934 MOVL 24(SP)(R9*4), BX
11935 LEAL 2(CX), R8
11936 CMPL (DX)(DI*1), SI
11937 JEQ candidate2_match_encodeSnappyBlockAsm64K
11938 MOVL R8, 24(SP)(R9*4)
11939 SHRQ $0x08, SI
11940 CMPL (DX)(BX*1), SI
11941 JEQ candidate3_match_encodeSnappyBlockAsm64K
11942 MOVL 20(SP), CX
11943 JMP search_loop_encodeSnappyBlockAsm64K
11944
11945candidate3_match_encodeSnappyBlockAsm64K:
11946 ADDL $0x02, CX
11947 JMP candidate_match_encodeSnappyBlockAsm64K
11948
11949candidate2_match_encodeSnappyBlockAsm64K:
11950 MOVL R8, 24(SP)(R9*4)
11951 INCL CX
11952 MOVL DI, BX
11953
11954candidate_match_encodeSnappyBlockAsm64K:
11955 MOVL 12(SP), SI
11956 TESTL BX, BX
11957 JZ match_extend_back_end_encodeSnappyBlockAsm64K
11958
11959match_extend_back_loop_encodeSnappyBlockAsm64K:
11960 CMPL CX, SI
11961 JBE match_extend_back_end_encodeSnappyBlockAsm64K
11962 MOVB -1(DX)(BX*1), DI
11963 MOVB -1(DX)(CX*1), R8
11964 CMPB DI, R8
11965 JNE match_extend_back_end_encodeSnappyBlockAsm64K
11966 LEAL -1(CX), CX
11967 DECL BX
11968 JZ match_extend_back_end_encodeSnappyBlockAsm64K
11969 JMP match_extend_back_loop_encodeSnappyBlockAsm64K
11970
11971match_extend_back_end_encodeSnappyBlockAsm64K:
11972 MOVL CX, SI
11973 SUBL 12(SP), SI
11974 LEAQ 3(AX)(SI*1), SI
11975 CMPQ SI, (SP)
11976 JB match_dst_size_check_encodeSnappyBlockAsm64K
11977 MOVQ $0x00000000, ret+48(FP)
11978 RET
11979
11980match_dst_size_check_encodeSnappyBlockAsm64K:
11981 MOVL CX, SI
11982 MOVL 12(SP), DI
11983 CMPL DI, SI
11984 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
11985 MOVL SI, R8
11986 MOVL SI, 12(SP)
11987 LEAQ (DX)(DI*1), SI
11988 SUBL DI, R8
11989 LEAL -1(R8), DI
11990 CMPL DI, $0x3c
11991 JB one_byte_match_emit_encodeSnappyBlockAsm64K
11992 CMPL DI, $0x00000100
11993 JB two_bytes_match_emit_encodeSnappyBlockAsm64K
11994 JB three_bytes_match_emit_encodeSnappyBlockAsm64K
11995
11996three_bytes_match_emit_encodeSnappyBlockAsm64K:
11997 MOVB $0xf4, (AX)
11998 MOVW DI, 1(AX)
11999 ADDQ $0x03, AX
12000 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
12001
12002two_bytes_match_emit_encodeSnappyBlockAsm64K:
12003 MOVB $0xf0, (AX)
12004 MOVB DI, 1(AX)
12005 ADDQ $0x02, AX
12006 CMPL DI, $0x40
12007 JB memmove_match_emit_encodeSnappyBlockAsm64K
12008 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
12009
12010one_byte_match_emit_encodeSnappyBlockAsm64K:
12011 SHLB $0x02, DI
12012 MOVB DI, (AX)
12013 ADDQ $0x01, AX
12014
12015memmove_match_emit_encodeSnappyBlockAsm64K:
12016 LEAQ (AX)(R8*1), DI
12017
12018 // genMemMoveShort
12019 CMPQ R8, $0x08
12020 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
12021 CMPQ R8, $0x10
12022 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
12023 CMPQ R8, $0x20
12024 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
12025 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
12026
12027emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
12028 MOVQ (SI), R9
12029 MOVQ R9, (AX)
12030 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12031
12032emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
12033 MOVQ (SI), R9
12034 MOVQ -8(SI)(R8*1), SI
12035 MOVQ R9, (AX)
12036 MOVQ SI, -8(AX)(R8*1)
12037 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12038
12039emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
12040 MOVOU (SI), X0
12041 MOVOU -16(SI)(R8*1), X1
12042 MOVOU X0, (AX)
12043 MOVOU X1, -16(AX)(R8*1)
12044 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12045
12046emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
12047 MOVOU (SI), X0
12048 MOVOU 16(SI), X1
12049 MOVOU -32(SI)(R8*1), X2
12050 MOVOU -16(SI)(R8*1), X3
12051 MOVOU X0, (AX)
12052 MOVOU X1, 16(AX)
12053 MOVOU X2, -32(AX)(R8*1)
12054 MOVOU X3, -16(AX)(R8*1)
12055
12056memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
12057 MOVQ DI, AX
12058 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
12059
12060memmove_long_match_emit_encodeSnappyBlockAsm64K:
12061 LEAQ (AX)(R8*1), DI
12062
12063 // genMemMoveLong
12064 MOVOU (SI), X0
12065 MOVOU 16(SI), X1
12066 MOVOU -32(SI)(R8*1), X2
12067 MOVOU -16(SI)(R8*1), X3
12068 MOVQ R8, R10
12069 SHRQ $0x05, R10
12070 MOVQ AX, R9
12071 ANDL $0x0000001f, R9
12072 MOVQ $0x00000040, R11
12073 SUBQ R9, R11
12074 DECQ R10
12075 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12076 LEAQ -32(SI)(R11*1), R9
12077 LEAQ -32(AX)(R11*1), R12
12078
12079emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
12080 MOVOU (R9), X4
12081 MOVOU 16(R9), X5
12082 MOVOA X4, (R12)
12083 MOVOA X5, 16(R12)
12084 ADDQ $0x20, R12
12085 ADDQ $0x20, R9
12086 ADDQ $0x20, R11
12087 DECQ R10
12088 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
12089
12090emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
12091 MOVOU -32(SI)(R11*1), X4
12092 MOVOU -16(SI)(R11*1), X5
12093 MOVOA X4, -32(AX)(R11*1)
12094 MOVOA X5, -16(AX)(R11*1)
12095 ADDQ $0x20, R11
12096 CMPQ R8, R11
12097 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12098 MOVOU X0, (AX)
12099 MOVOU X1, 16(AX)
12100 MOVOU X2, -32(AX)(R8*1)
12101 MOVOU X3, -16(AX)(R8*1)
12102 MOVQ DI, AX
12103
12104emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
12105match_nolit_loop_encodeSnappyBlockAsm64K:
12106 MOVL CX, SI
12107 SUBL BX, SI
12108 MOVL SI, 16(SP)
12109 ADDL $0x04, CX
12110 ADDL $0x04, BX
12111 MOVQ src_len+32(FP), SI
12112 SUBL CX, SI
12113 LEAQ (DX)(CX*1), DI
12114 LEAQ (DX)(BX*1), BX
12115
12116 // matchLen
12117 XORL R9, R9
12118
12119matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K:
12120 CMPL SI, $0x10
12121 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K
12122 MOVQ (DI)(R9*1), R8
12123 MOVQ 8(DI)(R9*1), R10
12124 XORQ (BX)(R9*1), R8
12125 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
12126 XORQ 8(BX)(R9*1), R10
12127 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K
12128 LEAL -16(SI), SI
12129 LEAL 16(R9), R9
12130 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K
12131
12132matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K:
12133#ifdef GOAMD64_v3
12134 TZCNTQ R10, R10
12135
12136#else
12137 BSFQ R10, R10
12138
12139#endif
12140 SARQ $0x03, R10
12141 LEAL 8(R9)(R10*1), R9
12142 JMP match_nolit_end_encodeSnappyBlockAsm64K
12143
12144matchlen_match8_match_nolit_encodeSnappyBlockAsm64K:
12145 CMPL SI, $0x08
12146 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
12147 MOVQ (DI)(R9*1), R8
12148 XORQ (BX)(R9*1), R8
12149 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
12150 LEAL -8(SI), SI
12151 LEAL 8(R9), R9
12152 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
12153
12154matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K:
12155#ifdef GOAMD64_v3
12156 TZCNTQ R8, R8
12157
12158#else
12159 BSFQ R8, R8
12160
12161#endif
12162 SARQ $0x03, R8
12163 LEAL (R9)(R8*1), R9
12164 JMP match_nolit_end_encodeSnappyBlockAsm64K
12165
12166matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
12167 CMPL SI, $0x04
12168 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
12169 MOVL (DI)(R9*1), R8
12170 CMPL (BX)(R9*1), R8
12171 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
12172 LEAL -4(SI), SI
12173 LEAL 4(R9), R9
12174
12175matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
12176 CMPL SI, $0x01
12177 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
12178 JB match_nolit_end_encodeSnappyBlockAsm64K
12179 MOVW (DI)(R9*1), R8
12180 CMPW (BX)(R9*1), R8
12181 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
12182 LEAL 2(R9), R9
12183 SUBL $0x02, SI
12184 JZ match_nolit_end_encodeSnappyBlockAsm64K
12185
12186matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
12187 MOVB (DI)(R9*1), R8
12188 CMPB (BX)(R9*1), R8
12189 JNE match_nolit_end_encodeSnappyBlockAsm64K
12190 LEAL 1(R9), R9
12191
12192match_nolit_end_encodeSnappyBlockAsm64K:
12193 ADDL R9, CX
12194 MOVL 16(SP), BX
12195 ADDL $0x04, R9
12196 MOVL CX, 12(SP)
12197
12198 // emitCopy
12199two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
12200 CMPL R9, $0x40
12201 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
12202 MOVB $0xee, (AX)
12203 MOVW BX, 1(AX)
12204 LEAL -60(R9), R9
12205 ADDQ $0x03, AX
12206 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
12207
12208two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
12209 MOVL R9, SI
12210 SHLL $0x02, SI
12211 CMPL R9, $0x0c
12212 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
12213 CMPL BX, $0x00000800
12214 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
12215 LEAL -15(SI), SI
12216 MOVB BL, 1(AX)
12217 SHRL $0x08, BX
12218 SHLL $0x05, BX
12219 ORL BX, SI
12220 MOVB SI, (AX)
12221 ADDQ $0x02, AX
12222 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
12223
12224emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
12225 LEAL -2(SI), SI
12226 MOVB SI, (AX)
12227 MOVW BX, 1(AX)
12228 ADDQ $0x03, AX
12229
12230match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
12231 CMPL CX, 8(SP)
12232 JAE emit_remainder_encodeSnappyBlockAsm64K
12233 MOVQ -2(DX)(CX*1), SI
12234 CMPQ AX, (SP)
12235 JB match_nolit_dst_ok_encodeSnappyBlockAsm64K
12236 MOVQ $0x00000000, ret+48(FP)
12237 RET
12238
12239match_nolit_dst_ok_encodeSnappyBlockAsm64K:
12240 MOVQ $0x0000cf1bbcdcbf9b, R8
12241 MOVQ SI, DI
12242 SHRQ $0x10, SI
12243 MOVQ SI, BX
12244 SHLQ $0x10, DI
12245 IMULQ R8, DI
12246 SHRQ $0x32, DI
12247 SHLQ $0x10, BX
12248 IMULQ R8, BX
12249 SHRQ $0x32, BX
12250 LEAL -2(CX), R8
12251 LEAQ 24(SP)(BX*4), R9
12252 MOVL (R9), BX
12253 MOVL R8, 24(SP)(DI*4)
12254 MOVL CX, (R9)
12255 CMPL (DX)(BX*1), SI
12256 JEQ match_nolit_loop_encodeSnappyBlockAsm64K
12257 INCL CX
12258 JMP search_loop_encodeSnappyBlockAsm64K
12259
12260emit_remainder_encodeSnappyBlockAsm64K:
12261 MOVQ src_len+32(FP), CX
12262 SUBL 12(SP), CX
12263 LEAQ 3(AX)(CX*1), CX
12264 CMPQ CX, (SP)
12265 JB emit_remainder_ok_encodeSnappyBlockAsm64K
12266 MOVQ $0x00000000, ret+48(FP)
12267 RET
12268
12269emit_remainder_ok_encodeSnappyBlockAsm64K:
12270 MOVQ src_len+32(FP), CX
12271 MOVL 12(SP), BX
12272 CMPL BX, CX
12273 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
12274 MOVL CX, SI
12275 MOVL CX, 12(SP)
12276 LEAQ (DX)(BX*1), CX
12277 SUBL BX, SI
12278 LEAL -1(SI), DX
12279 CMPL DX, $0x3c
12280 JB one_byte_emit_remainder_encodeSnappyBlockAsm64K
12281 CMPL DX, $0x00000100
12282 JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K
12283 JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K
12284
12285three_bytes_emit_remainder_encodeSnappyBlockAsm64K:
12286 MOVB $0xf4, (AX)
12287 MOVW DX, 1(AX)
12288 ADDQ $0x03, AX
12289 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
12290
12291two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
12292 MOVB $0xf0, (AX)
12293 MOVB DL, 1(AX)
12294 ADDQ $0x02, AX
12295 CMPL DX, $0x40
12296 JB memmove_emit_remainder_encodeSnappyBlockAsm64K
12297 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
12298
12299one_byte_emit_remainder_encodeSnappyBlockAsm64K:
12300 SHLB $0x02, DL
12301 MOVB DL, (AX)
12302 ADDQ $0x01, AX
12303
12304memmove_emit_remainder_encodeSnappyBlockAsm64K:
12305 LEAQ (AX)(SI*1), DX
12306 MOVL SI, BX
12307
12308 // genMemMoveShort
12309 CMPQ BX, $0x03
12310 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
12311 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
12312 CMPQ BX, $0x08
12313 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
12314 CMPQ BX, $0x10
12315 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
12316 CMPQ BX, $0x20
12317 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
12318 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
12319
12320emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
12321 MOVB (CX), SI
12322 MOVB -1(CX)(BX*1), CL
12323 MOVB SI, (AX)
12324 MOVB CL, -1(AX)(BX*1)
12325 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12326
12327emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
12328 MOVW (CX), SI
12329 MOVB 2(CX), CL
12330 MOVW SI, (AX)
12331 MOVB CL, 2(AX)
12332 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12333
12334emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
12335 MOVL (CX), SI
12336 MOVL -4(CX)(BX*1), CX
12337 MOVL SI, (AX)
12338 MOVL CX, -4(AX)(BX*1)
12339 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12340
12341emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
12342 MOVQ (CX), SI
12343 MOVQ -8(CX)(BX*1), CX
12344 MOVQ SI, (AX)
12345 MOVQ CX, -8(AX)(BX*1)
12346 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12347
12348emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
12349 MOVOU (CX), X0
12350 MOVOU -16(CX)(BX*1), X1
12351 MOVOU X0, (AX)
12352 MOVOU X1, -16(AX)(BX*1)
12353 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12354
12355emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
12356 MOVOU (CX), X0
12357 MOVOU 16(CX), X1
12358 MOVOU -32(CX)(BX*1), X2
12359 MOVOU -16(CX)(BX*1), X3
12360 MOVOU X0, (AX)
12361 MOVOU X1, 16(AX)
12362 MOVOU X2, -32(AX)(BX*1)
12363 MOVOU X3, -16(AX)(BX*1)
12364
12365memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
12366 MOVQ DX, AX
12367 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
12368
12369memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
12370 LEAQ (AX)(SI*1), DX
12371 MOVL SI, BX
12372
12373 // genMemMoveLong
12374 MOVOU (CX), X0
12375 MOVOU 16(CX), X1
12376 MOVOU -32(CX)(BX*1), X2
12377 MOVOU -16(CX)(BX*1), X3
12378 MOVQ BX, DI
12379 SHRQ $0x05, DI
12380 MOVQ AX, SI
12381 ANDL $0x0000001f, SI
12382 MOVQ $0x00000040, R8
12383 SUBQ SI, R8
12384 DECQ DI
12385 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12386 LEAQ -32(CX)(R8*1), SI
12387 LEAQ -32(AX)(R8*1), R9
12388
12389emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
12390 MOVOU (SI), X4
12391 MOVOU 16(SI), X5
12392 MOVOA X4, (R9)
12393 MOVOA X5, 16(R9)
12394 ADDQ $0x20, R9
12395 ADDQ $0x20, SI
12396 ADDQ $0x20, R8
12397 DECQ DI
12398 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
12399
12400emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
12401 MOVOU -32(CX)(R8*1), X4
12402 MOVOU -16(CX)(R8*1), X5
12403 MOVOA X4, -32(AX)(R8*1)
12404 MOVOA X5, -16(AX)(R8*1)
12405 ADDQ $0x20, R8
12406 CMPQ BX, R8
12407 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12408 MOVOU X0, (AX)
12409 MOVOU X1, 16(AX)
12410 MOVOU X2, -32(AX)(BX*1)
12411 MOVOU X3, -16(AX)(BX*1)
12412 MOVQ DX, AX
12413
12414emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
12415 MOVQ dst_base+0(FP), CX
12416 SUBQ CX, AX
12417 MOVQ AX, ret+48(FP)
12418 RET
12419
12420// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int
12421// Requires: BMI, SSE2
12422TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56
12423 MOVQ dst_base+0(FP), AX
12424 MOVQ $0x00000080, CX
12425 LEAQ 24(SP), DX
12426 PXOR X0, X0
12427
12428zero_loop_encodeSnappyBlockAsm12B:
12429 MOVOU X0, (DX)
12430 MOVOU X0, 16(DX)
12431 MOVOU X0, 32(DX)
12432 MOVOU X0, 48(DX)
12433 MOVOU X0, 64(DX)
12434 MOVOU X0, 80(DX)
12435 MOVOU X0, 96(DX)
12436 MOVOU X0, 112(DX)
12437 ADDQ $0x80, DX
12438 DECQ CX
12439 JNZ zero_loop_encodeSnappyBlockAsm12B
12440 MOVL $0x00000000, 12(SP)
12441 MOVQ src_len+32(FP), CX
12442 LEAQ -9(CX), DX
12443 LEAQ -8(CX), BX
12444 MOVL BX, 8(SP)
12445 SHRQ $0x05, CX
12446 SUBL CX, DX
12447 LEAQ (AX)(DX*1), DX
12448 MOVQ DX, (SP)
12449 MOVL $0x00000001, CX
12450 MOVL CX, 16(SP)
12451 MOVQ src_base+24(FP), DX
12452
12453search_loop_encodeSnappyBlockAsm12B:
12454 MOVL CX, BX
12455 SUBL 12(SP), BX
12456 SHRL $0x05, BX
12457 LEAL 4(CX)(BX*1), BX
12458 CMPL BX, 8(SP)
12459 JAE emit_remainder_encodeSnappyBlockAsm12B
12460 MOVQ (DX)(CX*1), SI
12461 MOVL BX, 20(SP)
12462 MOVQ $0x000000cf1bbcdcbb, R8
12463 MOVQ SI, R9
12464 MOVQ SI, R10
12465 SHRQ $0x08, R10
12466 SHLQ $0x18, R9
12467 IMULQ R8, R9
12468 SHRQ $0x34, R9
12469 SHLQ $0x18, R10
12470 IMULQ R8, R10
12471 SHRQ $0x34, R10
12472 MOVL 24(SP)(R9*4), BX
12473 MOVL 24(SP)(R10*4), DI
12474 MOVL CX, 24(SP)(R9*4)
12475 LEAL 1(CX), R9
12476 MOVL R9, 24(SP)(R10*4)
12477 MOVQ SI, R9
12478 SHRQ $0x10, R9
12479 SHLQ $0x18, R9
12480 IMULQ R8, R9
12481 SHRQ $0x34, R9
12482 MOVL CX, R8
12483 SUBL 16(SP), R8
12484 MOVL 1(DX)(R8*1), R10
12485 MOVQ SI, R8
12486 SHRQ $0x08, R8
12487 CMPL R8, R10
12488 JNE no_repeat_found_encodeSnappyBlockAsm12B
12489 LEAL 1(CX), SI
12490 MOVL 12(SP), BX
12491 MOVL SI, DI
12492 SUBL 16(SP), DI
12493 JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
12494
12495repeat_extend_back_loop_encodeSnappyBlockAsm12B:
12496 CMPL SI, BX
12497 JBE repeat_extend_back_end_encodeSnappyBlockAsm12B
12498 MOVB -1(DX)(DI*1), R8
12499 MOVB -1(DX)(SI*1), R9
12500 CMPB R8, R9
12501 JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
12502 LEAL -1(SI), SI
12503 DECL DI
12504 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
12505
12506repeat_extend_back_end_encodeSnappyBlockAsm12B:
12507 MOVL 12(SP), BX
12508 CMPL BX, SI
12509 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
12510 MOVL SI, DI
12511 MOVL SI, 12(SP)
12512 LEAQ (DX)(BX*1), R8
12513 SUBL BX, DI
12514 LEAL -1(DI), BX
12515 CMPL BX, $0x3c
12516 JB one_byte_repeat_emit_encodeSnappyBlockAsm12B
12517 CMPL BX, $0x00000100
12518 JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B
12519 JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B
12520
12521three_bytes_repeat_emit_encodeSnappyBlockAsm12B:
12522 MOVB $0xf4, (AX)
12523 MOVW BX, 1(AX)
12524 ADDQ $0x03, AX
12525 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
12526
12527two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
12528 MOVB $0xf0, (AX)
12529 MOVB BL, 1(AX)
12530 ADDQ $0x02, AX
12531 CMPL BX, $0x40
12532 JB memmove_repeat_emit_encodeSnappyBlockAsm12B
12533 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
12534
12535one_byte_repeat_emit_encodeSnappyBlockAsm12B:
12536 SHLB $0x02, BL
12537 MOVB BL, (AX)
12538 ADDQ $0x01, AX
12539
12540memmove_repeat_emit_encodeSnappyBlockAsm12B:
12541 LEAQ (AX)(DI*1), BX
12542
12543 // genMemMoveShort
12544 CMPQ DI, $0x08
12545 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
12546 CMPQ DI, $0x10
12547 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
12548 CMPQ DI, $0x20
12549 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
12550 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
12551
12552emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
12553 MOVQ (R8), R9
12554 MOVQ R9, (AX)
12555 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12556
12557emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
12558 MOVQ (R8), R9
12559 MOVQ -8(R8)(DI*1), R8
12560 MOVQ R9, (AX)
12561 MOVQ R8, -8(AX)(DI*1)
12562 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12563
12564emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
12565 MOVOU (R8), X0
12566 MOVOU -16(R8)(DI*1), X1
12567 MOVOU X0, (AX)
12568 MOVOU X1, -16(AX)(DI*1)
12569 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12570
12571emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
12572 MOVOU (R8), X0
12573 MOVOU 16(R8), X1
12574 MOVOU -32(R8)(DI*1), X2
12575 MOVOU -16(R8)(DI*1), X3
12576 MOVOU X0, (AX)
12577 MOVOU X1, 16(AX)
12578 MOVOU X2, -32(AX)(DI*1)
12579 MOVOU X3, -16(AX)(DI*1)
12580
12581memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
12582 MOVQ BX, AX
12583 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
12584
12585memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
12586 LEAQ (AX)(DI*1), BX
12587
12588 // genMemMoveLong
12589 MOVOU (R8), X0
12590 MOVOU 16(R8), X1
12591 MOVOU -32(R8)(DI*1), X2
12592 MOVOU -16(R8)(DI*1), X3
12593 MOVQ DI, R10
12594 SHRQ $0x05, R10
12595 MOVQ AX, R9
12596 ANDL $0x0000001f, R9
12597 MOVQ $0x00000040, R11
12598 SUBQ R9, R11
12599 DECQ R10
12600 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12601 LEAQ -32(R8)(R11*1), R9
12602 LEAQ -32(AX)(R11*1), R12
12603
12604emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
12605 MOVOU (R9), X4
12606 MOVOU 16(R9), X5
12607 MOVOA X4, (R12)
12608 MOVOA X5, 16(R12)
12609 ADDQ $0x20, R12
12610 ADDQ $0x20, R9
12611 ADDQ $0x20, R11
12612 DECQ R10
12613 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
12614
12615emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
12616 MOVOU -32(R8)(R11*1), X4
12617 MOVOU -16(R8)(R11*1), X5
12618 MOVOA X4, -32(AX)(R11*1)
12619 MOVOA X5, -16(AX)(R11*1)
12620 ADDQ $0x20, R11
12621 CMPQ DI, R11
12622 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12623 MOVOU X0, (AX)
12624 MOVOU X1, 16(AX)
12625 MOVOU X2, -32(AX)(DI*1)
12626 MOVOU X3, -16(AX)(DI*1)
12627 MOVQ BX, AX
12628
12629emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
12630 ADDL $0x05, CX
12631 MOVL CX, BX
12632 SUBL 16(SP), BX
12633 MOVQ src_len+32(FP), DI
12634 SUBL CX, DI
12635 LEAQ (DX)(CX*1), R8
12636 LEAQ (DX)(BX*1), BX
12637
12638 // matchLen
12639 XORL R10, R10
12640
12641matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B:
12642 CMPL DI, $0x10
12643 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B
12644 MOVQ (R8)(R10*1), R9
12645 MOVQ 8(R8)(R10*1), R11
12646 XORQ (BX)(R10*1), R9
12647 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
12648 XORQ 8(BX)(R10*1), R11
12649 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B
12650 LEAL -16(DI), DI
12651 LEAL 16(R10), R10
12652 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B
12653
12654matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B:
12655#ifdef GOAMD64_v3
12656 TZCNTQ R11, R11
12657
12658#else
12659 BSFQ R11, R11
12660
12661#endif
12662 SARQ $0x03, R11
12663 LEAL 8(R10)(R11*1), R10
12664 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
12665
12666matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B:
12667 CMPL DI, $0x08
12668 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
12669 MOVQ (R8)(R10*1), R9
12670 XORQ (BX)(R10*1), R9
12671 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
12672 LEAL -8(DI), DI
12673 LEAL 8(R10), R10
12674 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
12675
12676matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B:
12677#ifdef GOAMD64_v3
12678 TZCNTQ R9, R9
12679
12680#else
12681 BSFQ R9, R9
12682
12683#endif
12684 SARQ $0x03, R9
12685 LEAL (R10)(R9*1), R10
12686 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
12687
12688matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
12689 CMPL DI, $0x04
12690 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
12691 MOVL (R8)(R10*1), R9
12692 CMPL (BX)(R10*1), R9
12693 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
12694 LEAL -4(DI), DI
12695 LEAL 4(R10), R10
12696
12697matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
12698 CMPL DI, $0x01
12699 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
12700 JB repeat_extend_forward_end_encodeSnappyBlockAsm12B
12701 MOVW (R8)(R10*1), R9
12702 CMPW (BX)(R10*1), R9
12703 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
12704 LEAL 2(R10), R10
12705 SUBL $0x02, DI
12706 JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
12707
12708matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
12709 MOVB (R8)(R10*1), R9
12710 CMPB (BX)(R10*1), R9
12711 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
12712 LEAL 1(R10), R10
12713
12714repeat_extend_forward_end_encodeSnappyBlockAsm12B:
12715 ADDL R10, CX
12716 MOVL CX, BX
12717 SUBL SI, BX
12718 MOVL 16(SP), SI
12719
12720 // emitCopy
12721two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
12722 CMPL BX, $0x40
12723 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
12724 MOVB $0xee, (AX)
12725 MOVW SI, 1(AX)
12726 LEAL -60(BX), BX
12727 ADDQ $0x03, AX
12728 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
12729
12730two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
12731 MOVL BX, DI
12732 SHLL $0x02, DI
12733 CMPL BX, $0x0c
12734 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
12735 CMPL SI, $0x00000800
12736 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
12737 LEAL -15(DI), DI
12738 MOVB SI, 1(AX)
12739 SHRL $0x08, SI
12740 SHLL $0x05, SI
12741 ORL SI, DI
12742 MOVB DI, (AX)
12743 ADDQ $0x02, AX
12744 JMP repeat_end_emit_encodeSnappyBlockAsm12B
12745
// emitCopy, 3-byte form. On entry DI = length<<2 (set by the block that jumps
// here) and SI is the offset. DI - 2 == (length-1)<<2 | 2, the tag byte; the
// low 16 bits of SI follow it. Falls through to repeat_end_emit afterwards.
12746emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
12747 LEAL -2(DI), DI
12748 MOVB DI, (AX)
12749 MOVW SI, 1(AX)
12750 ADDQ $0x03, AX
12751
12752repeat_end_emit_encodeSnappyBlockAsm12B:
12753 MOVL CX, 12(SP)
12754 JMP search_loop_encodeSnappyBlockAsm12B
12755
12756no_repeat_found_encodeSnappyBlockAsm12B:
12757 CMPL (DX)(BX*1), SI
12758 JEQ candidate_match_encodeSnappyBlockAsm12B
12759 SHRQ $0x08, SI
12760 MOVL 24(SP)(R9*4), BX
12761 LEAL 2(CX), R8
12762 CMPL (DX)(DI*1), SI
12763 JEQ candidate2_match_encodeSnappyBlockAsm12B
12764 MOVL R8, 24(SP)(R9*4)
12765 SHRQ $0x08, SI
12766 CMPL (DX)(BX*1), SI
12767 JEQ candidate3_match_encodeSnappyBlockAsm12B
12768 MOVL 20(SP), CX
12769 JMP search_loop_encodeSnappyBlockAsm12B
12770
12771candidate3_match_encodeSnappyBlockAsm12B:
12772 ADDL $0x02, CX
12773 JMP candidate_match_encodeSnappyBlockAsm12B
12774
12775candidate2_match_encodeSnappyBlockAsm12B:
12776 MOVL R8, 24(SP)(R9*4)
12777 INCL CX
12778 MOVL DI, BX
12779
12780candidate_match_encodeSnappyBlockAsm12B:
12781 MOVL 12(SP), SI
12782 TESTL BX, BX
12783 JZ match_extend_back_end_encodeSnappyBlockAsm12B
12784
12785match_extend_back_loop_encodeSnappyBlockAsm12B:
12786 CMPL CX, SI
12787 JBE match_extend_back_end_encodeSnappyBlockAsm12B
12788 MOVB -1(DX)(BX*1), DI
12789 MOVB -1(DX)(CX*1), R8
12790 CMPB DI, R8
12791 JNE match_extend_back_end_encodeSnappyBlockAsm12B
12792 LEAL -1(CX), CX
12793 DECL BX
12794 JZ match_extend_back_end_encodeSnappyBlockAsm12B
12795 JMP match_extend_back_loop_encodeSnappyBlockAsm12B
12796
12797match_extend_back_end_encodeSnappyBlockAsm12B:
12798 MOVL CX, SI
12799 SUBL 12(SP), SI
12800 LEAQ 3(AX)(SI*1), SI
12801 CMPQ SI, (SP)
12802 JB match_dst_size_check_encodeSnappyBlockAsm12B
12803 MOVQ $0x00000000, ret+48(FP)
12804 RET
12805
12806match_dst_size_check_encodeSnappyBlockAsm12B:
12807 MOVL CX, SI
12808 MOVL 12(SP), DI
12809 CMPL DI, SI
12810 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
12811 MOVL SI, R8
12812 MOVL SI, 12(SP)
12813 LEAQ (DX)(DI*1), SI
12814 SUBL DI, R8
12815 LEAL -1(R8), DI
12816 CMPL DI, $0x3c
12817 JB one_byte_match_emit_encodeSnappyBlockAsm12B
12818 CMPL DI, $0x00000100
12819 JB two_bytes_match_emit_encodeSnappyBlockAsm12B
12820 JB three_bytes_match_emit_encodeSnappyBlockAsm12B
12821
12822three_bytes_match_emit_encodeSnappyBlockAsm12B:
12823 MOVB $0xf4, (AX)
12824 MOVW DI, 1(AX)
12825 ADDQ $0x03, AX
12826 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
12827
12828two_bytes_match_emit_encodeSnappyBlockAsm12B:
12829 MOVB $0xf0, (AX)
12830 MOVB DI, 1(AX)
12831 ADDQ $0x02, AX
12832 CMPL DI, $0x40
12833 JB memmove_match_emit_encodeSnappyBlockAsm12B
12834 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
12835
12836one_byte_match_emit_encodeSnappyBlockAsm12B:
12837 SHLB $0x02, DI
12838 MOVB DI, (AX)
12839 ADDQ $0x01, AX
12840
12841memmove_match_emit_encodeSnappyBlockAsm12B:
12842 LEAQ (AX)(R8*1), DI
12843
12844 // genMemMoveShort
12845 CMPQ R8, $0x08
12846 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
12847 CMPQ R8, $0x10
12848 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
12849 CMPQ R8, $0x20
12850 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
12851 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
12852
12853emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
12854 MOVQ (SI), R9
12855 MOVQ R9, (AX)
12856 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12857
12858emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
12859 MOVQ (SI), R9
12860 MOVQ -8(SI)(R8*1), SI
12861 MOVQ R9, (AX)
12862 MOVQ SI, -8(AX)(R8*1)
12863 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12864
12865emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
12866 MOVOU (SI), X0
12867 MOVOU -16(SI)(R8*1), X1
12868 MOVOU X0, (AX)
12869 MOVOU X1, -16(AX)(R8*1)
12870 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12871
12872emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
12873 MOVOU (SI), X0
12874 MOVOU 16(SI), X1
12875 MOVOU -32(SI)(R8*1), X2
12876 MOVOU -16(SI)(R8*1), X3
12877 MOVOU X0, (AX)
12878 MOVOU X1, 16(AX)
12879 MOVOU X2, -32(AX)(R8*1)
12880 MOVOU X3, -16(AX)(R8*1)
12881
12882memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
12883 MOVQ DI, AX
12884 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
12885
12886memmove_long_match_emit_encodeSnappyBlockAsm12B:
12887 LEAQ (AX)(R8*1), DI
12888
12889 // genMemMoveLong
12890 MOVOU (SI), X0
12891 MOVOU 16(SI), X1
12892 MOVOU -32(SI)(R8*1), X2
12893 MOVOU -16(SI)(R8*1), X3
12894 MOVQ R8, R10
12895 SHRQ $0x05, R10
12896 MOVQ AX, R9
12897 ANDL $0x0000001f, R9
12898 MOVQ $0x00000040, R11
12899 SUBQ R9, R11
12900 DECQ R10
12901 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12902 LEAQ -32(SI)(R11*1), R9
12903 LEAQ -32(AX)(R11*1), R12
12904
12905emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
12906 MOVOU (R9), X4
12907 MOVOU 16(R9), X5
12908 MOVOA X4, (R12)
12909 MOVOA X5, 16(R12)
12910 ADDQ $0x20, R12
12911 ADDQ $0x20, R9
12912 ADDQ $0x20, R11
12913 DECQ R10
12914 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
12915
12916emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
12917 MOVOU -32(SI)(R11*1), X4
12918 MOVOU -16(SI)(R11*1), X5
12919 MOVOA X4, -32(AX)(R11*1)
12920 MOVOA X5, -16(AX)(R11*1)
12921 ADDQ $0x20, R11
12922 CMPQ R8, R11
12923 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12924 MOVOU X0, (AX)
12925 MOVOU X1, 16(AX)
12926 MOVOU X2, -32(AX)(R8*1)
12927 MOVOU X3, -16(AX)(R8*1)
12928 MOVQ DI, AX
12929
// Start of match handling once any pending literal has been written.
// CX is the current position and BX the candidate position, both used as
// indices relative to DX. Their difference (the match offset) is saved in
// 16(SP). Both positions are then advanced by 4 before extending the match
// (presumably the 4 bytes already compared by the CMPL that led here - confirm).
// SI becomes src_len - CX (bytes left), and DI / BX are turned into pointers
// for the matchLen code that follows.
12930emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
12931match_nolit_loop_encodeSnappyBlockAsm12B:
12932 MOVL CX, SI
12933 SUBL BX, SI
12934 MOVL SI, 16(SP)
12935 ADDL $0x04, CX
12936 ADDL $0x04, BX
12937 MOVQ src_len+32(FP), SI
12938 SUBL CX, SI
12939 LEAQ (DX)(CX*1), DI
12940 LEAQ (DX)(BX*1), BX
12941
12942 // matchLen
12943 XORL R9, R9
12944
// matchLen, 16-bytes-per-iteration loop. R9 is the number of bytes matched so
// far (zeroed just above), SI the number of bytes left to compare.
// Each pass XORs two 8-byte words from DI+R9 against BX+R9. A non-zero result
// means the word contains the first mismatch, and control goes to the matching
// bsf handler, which turns the lowest set bit into a byte count. If both words
// are equal, 16 is moved from SI to R9 and the loop repeats; with fewer than
// 16 bytes left it hands over to the 8-byte step.
12945matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B:
12946 CMPL SI, $0x10
12947 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B
12948 MOVQ (DI)(R9*1), R8
12949 MOVQ 8(DI)(R9*1), R10
12950 XORQ (BX)(R9*1), R8
12951 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
12952 XORQ 8(BX)(R9*1), R10
12953 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B
12954 LEAL -16(SI), SI
12955 LEAL 16(R9), R9
12956 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B
12957
12958matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B:
12959#ifdef GOAMD64_v3
12960 TZCNTQ R10, R10
12961
12962#else
12963 BSFQ R10, R10
12964
12965#endif
12966 SARQ $0x03, R10
12967 LEAL 8(R9)(R10*1), R9
12968 JMP match_nolit_end_encodeSnappyBlockAsm12B
12969
12970matchlen_match8_match_nolit_encodeSnappyBlockAsm12B:
12971 CMPL SI, $0x08
12972 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
12973 MOVQ (DI)(R9*1), R8
12974 XORQ (BX)(R9*1), R8
12975 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
12976 LEAL -8(SI), SI
12977 LEAL 8(R9), R9
12978 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
12979
12980matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B:
12981#ifdef GOAMD64_v3
12982 TZCNTQ R8, R8
12983
12984#else
12985 BSFQ R8, R8
12986
12987#endif
12988 SARQ $0x03, R8
12989 LEAL (R9)(R8*1), R9
12990 JMP match_nolit_end_encodeSnappyBlockAsm12B
12991
12992matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
12993 CMPL SI, $0x04
12994 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
12995 MOVL (DI)(R9*1), R8
12996 CMPL (BX)(R9*1), R8
12997 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
12998 LEAL -4(SI), SI
12999 LEAL 4(R9), R9
13000
13001matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
13002 CMPL SI, $0x01
13003 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
13004 JB match_nolit_end_encodeSnappyBlockAsm12B
13005 MOVW (DI)(R9*1), R8
13006 CMPW (BX)(R9*1), R8
13007 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
13008 LEAL 2(R9), R9
13009 SUBL $0x02, SI
13010 JZ match_nolit_end_encodeSnappyBlockAsm12B
13011
13012matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
13013 MOVB (DI)(R9*1), R8
13014 CMPB (BX)(R9*1), R8
13015 JNE match_nolit_end_encodeSnappyBlockAsm12B
13016 LEAL 1(R9), R9
13017
// matchLen finished with R9 = number of additional matching bytes.
// Advance CX past them, reload the match offset from 16(SP) into BX, add 4 to
// R9 so it also counts the 4 bytes skipped before matchLen started, and record
// CX in 12(SP) as the new start of not-yet-emitted input.
13018match_nolit_end_encodeSnappyBlockAsm12B:
13019 ADDL R9, CX
13020 MOVL 16(SP), BX
13021 ADDL $0x04, R9
13022 MOVL CX, 12(SP)
13023
13024 // emitCopy
13025two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
13026 CMPL R9, $0x40
13027 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
13028 MOVB $0xee, (AX)
13029 MOVW BX, 1(AX)
13030 LEAL -60(R9), R9
13031 ADDQ $0x03, AX
13032 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
13033
13034two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
13035 MOVL R9, SI
13036 SHLL $0x02, SI
13037 CMPL R9, $0x0c
13038 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
13039 CMPL BX, $0x00000800
13040 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
13041 LEAL -15(SI), SI
13042 MOVB BL, 1(AX)
13043 SHRL $0x08, BX
13044 SHLL $0x05, BX
13045 ORL BX, SI
13046 MOVB SI, (AX)
13047 ADDQ $0x02, AX
13048 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
13049
13050emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
13051 LEAL -2(SI), SI
13052 MOVB SI, (AX)
13053 MOVW BX, 1(AX)
13054 ADDQ $0x03, AX
13055
// After a copy has been emitted: if CX has reached the bound kept in 8(SP),
// finish with emit_remainder. Otherwise load the 8 bytes starting at CX-2 into
// SI (used for the table update that follows) and make sure the output cursor
// AX is still below the limit kept in (SP); if it is not, return 0.
13056match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
13057 CMPL CX, 8(SP)
13058 JAE emit_remainder_encodeSnappyBlockAsm12B
13059 MOVQ -2(DX)(CX*1), SI
13060 CMPQ AX, (SP)
13061 JB match_nolit_dst_ok_encodeSnappyBlockAsm12B
13062 MOVQ $0x00000000, ret+48(FP)
13063 RET
13064
// Table update after a match, then an immediate re-match attempt at CX.
// SI holds the 8 bytes loaded from position CX-2. Two hashes are computed with
// the multiplier in R8: the value is shifted left by 24 (discarding its top
// three bytes), multiplied, and shifted right by 52, leaving a 12-bit index.
//   DI = hash of the bytes at CX-2
//   BX = hash of SI>>16, i.e. of the bytes at CX
// The table of 4-byte entries starts at 24(SP). The old entry for the CX hash
// is read into BX (the new candidate) before table[DI] = CX-2 and
// table[hash(CX)] = CX are stored. If the 4 bytes at the candidate equal the
// low 32 bits of SI (the bytes at CX), loop straight back into match handling;
// otherwise step CX by one and resume the search loop.
13065match_nolit_dst_ok_encodeSnappyBlockAsm12B:
13066 MOVQ $0x000000cf1bbcdcbb, R8
13067 MOVQ SI, DI
13068 SHRQ $0x10, SI
13069 MOVQ SI, BX
13070 SHLQ $0x18, DI
13071 IMULQ R8, DI
13072 SHRQ $0x34, DI
13073 SHLQ $0x18, BX
13074 IMULQ R8, BX
13075 SHRQ $0x34, BX
13076 LEAL -2(CX), R8
13077 LEAQ 24(SP)(BX*4), R9
13078 MOVL (R9), BX
13079 MOVL R8, 24(SP)(DI*4)
13080 MOVL CX, (R9)
13081 CMPL (DX)(BX*1), SI
13082 JEQ match_nolit_loop_encodeSnappyBlockAsm12B
13083 INCL CX
13084 JMP search_loop_encodeSnappyBlockAsm12B
13085
// Tail handling: everything from 12(SP) to src_len is still un-emitted.
// CX = src_len - 12(SP) is that byte count; AX + CX + 3 (the extra 3 matches
// the largest literal header written below) must stay below the limit in (SP),
// otherwise the function returns 0.
13086emit_remainder_encodeSnappyBlockAsm12B:
13087 MOVQ src_len+32(FP), CX
13088 SUBL 12(SP), CX
13089 LEAQ 3(AX)(CX*1), CX
13090 CMPQ CX, (SP)
13091 JB emit_remainder_ok_encodeSnappyBlockAsm12B
13092 MOVQ $0x00000000, ret+48(FP)
13093 RET
13094
// Emit the trailing literal, if any. With BX = 12(SP) and CX = src_len, an
// equal pair means nothing is left and the function goes straight to the exit.
// Otherwise: SI = literal length, CX = pointer to its first byte (DX + BX),
// 12(SP) = src_len, and DX is reused for length-1 (overwriting the base
// pointer it held). The header size is picked from length-1:
//   < 60  -> one-byte header
//   < 256 -> two-byte header
//   else  -> three-byte header. The second JB can never be taken (the flags are
//            unchanged since the JB before it fell through); execution simply
//            falls into the three-byte label that follows.
13095emit_remainder_ok_encodeSnappyBlockAsm12B:
13096 MOVQ src_len+32(FP), CX
13097 MOVL 12(SP), BX
13098 CMPL BX, CX
13099 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
13100 MOVL CX, SI
13101 MOVL CX, 12(SP)
13102 LEAQ (DX)(BX*1), CX
13103 SUBL BX, SI
13104 LEAL -1(SI), DX
13105 CMPL DX, $0x3c
13106 JB one_byte_emit_remainder_encodeSnappyBlockAsm12B
13107 CMPL DX, $0x00000100
13108 JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B
13109 JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B
13110
13111three_bytes_emit_remainder_encodeSnappyBlockAsm12B:
13112 MOVB $0xf4, (AX)
13113 MOVW DX, 1(AX)
13114 ADDQ $0x03, AX
13115 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
13116
13117two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
13118 MOVB $0xf0, (AX)
13119 MOVB DL, 1(AX)
13120 ADDQ $0x02, AX
13121 CMPL DX, $0x40
13122 JB memmove_emit_remainder_encodeSnappyBlockAsm12B
13123 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
13124
13125one_byte_emit_remainder_encodeSnappyBlockAsm12B:
13126 SHLB $0x02, DL
13127 MOVB DL, (AX)
13128 ADDQ $0x01, AX
13129
13130memmove_emit_remainder_encodeSnappyBlockAsm12B:
13131 LEAQ (AX)(SI*1), DX
13132 MOVL SI, BX
13133
13134 // genMemMoveShort
13135 CMPQ BX, $0x03
13136 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
13137 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
13138 CMPQ BX, $0x08
13139 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
13140 CMPQ BX, $0x10
13141 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
13142 CMPQ BX, $0x20
13143 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
13144 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
13145
13146emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
13147 MOVB (CX), SI
13148 MOVB -1(CX)(BX*1), CL
13149 MOVB SI, (AX)
13150 MOVB CL, -1(AX)(BX*1)
13151 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13152
13153emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
13154 MOVW (CX), SI
13155 MOVB 2(CX), CL
13156 MOVW SI, (AX)
13157 MOVB CL, 2(AX)
13158 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13159
13160emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
13161 MOVL (CX), SI
13162 MOVL -4(CX)(BX*1), CX
13163 MOVL SI, (AX)
13164 MOVL CX, -4(AX)(BX*1)
13165 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13166
13167emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
13168 MOVQ (CX), SI
13169 MOVQ -8(CX)(BX*1), CX
13170 MOVQ SI, (AX)
13171 MOVQ CX, -8(AX)(BX*1)
13172 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13173
13174emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
13175 MOVOU (CX), X0
13176 MOVOU -16(CX)(BX*1), X1
13177 MOVOU X0, (AX)
13178 MOVOU X1, -16(AX)(BX*1)
13179 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13180
13181emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
13182 MOVOU (CX), X0
13183 MOVOU 16(CX), X1
13184 MOVOU -32(CX)(BX*1), X2
13185 MOVOU -16(CX)(BX*1), X3
13186 MOVOU X0, (AX)
13187 MOVOU X1, 16(AX)
13188 MOVOU X2, -32(AX)(BX*1)
13189 MOVOU X3, -16(AX)(BX*1)
13190
13191memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
13192 MOVQ DX, AX
13193 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
13194
13195memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
13196 LEAQ (AX)(SI*1), DX
13197 MOVL SI, BX
13198
13199 // genMemMoveLong
13200 MOVOU (CX), X0
13201 MOVOU 16(CX), X1
13202 MOVOU -32(CX)(BX*1), X2
13203 MOVOU -16(CX)(BX*1), X3
13204 MOVQ BX, DI
13205 SHRQ $0x05, DI
13206 MOVQ AX, SI
13207 ANDL $0x0000001f, SI
13208 MOVQ $0x00000040, R8
13209 SUBQ SI, R8
13210 DECQ DI
13211 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
13212 LEAQ -32(CX)(R8*1), SI
13213 LEAQ -32(AX)(R8*1), R9
13214
13215emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
13216 MOVOU (SI), X4
13217 MOVOU 16(SI), X5
13218 MOVOA X4, (R9)
13219 MOVOA X5, 16(R9)
13220 ADDQ $0x20, R9
13221 ADDQ $0x20, SI
13222 ADDQ $0x20, R8
13223 DECQ DI
13224 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
13225
13226emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
13227 MOVOU -32(CX)(R8*1), X4
13228 MOVOU -16(CX)(R8*1), X5
13229 MOVOA X4, -32(AX)(R8*1)
13230 MOVOA X5, -16(AX)(R8*1)
13231 ADDQ $0x20, R8
13232 CMPQ BX, R8
13233 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
13234 MOVOU X0, (AX)
13235 MOVOU X1, 16(AX)
13236 MOVOU X2, -32(AX)(BX*1)
13237 MOVOU X3, -16(AX)(BX*1)
13238 MOVQ DX, AX
13239
// Function exit on success: the result is the number of bytes written,
// i.e. the output cursor AX minus dst_base.
13240emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
13241 MOVQ dst_base+0(FP), CX
13242 SUBQ CX, AX
13243 MOVQ AX, ret+48(FP)
13244 RET
13245
13246// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
13247// Requires: BMI, SSE2
13248TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
13249 MOVQ dst_base+0(FP), AX
13250 MOVQ $0x00000020, CX
13251 LEAQ 24(SP), DX
13252 PXOR X0, X0
13253
// Clear the on-stack table. X0 is zero, DX starts at 24(SP) and CX at 0x20
// (all set just above), and each iteration stores 8 x 16 = 128 zero bytes, so
// 32 iterations clear 4096 bytes (1024 four-byte entries).
13254zero_loop_encodeSnappyBlockAsm10B:
13255 MOVOU X0, (DX)
13256 MOVOU X0, 16(DX)
13257 MOVOU X0, 32(DX)
13258 MOVOU X0, 48(DX)
13259 MOVOU X0, 64(DX)
13260 MOVOU X0, 80(DX)
13261 MOVOU X0, 96(DX)
13262 MOVOU X0, 112(DX)
13263 ADDQ $0x80, DX
13264 DECQ CX
13265 JNZ zero_loop_encodeSnappyBlockAsm10B
// Frame slots initialised here:
//   12(SP) = 0                         start of not-yet-emitted input
//   8(SP)  = src_len - 8               bound tested by the search loop
//   (SP)   = dst_base + (src_len - 9 - src_len>>5)
//                                      output limit; reaching it makes the
//                                      function return 0
//   16(SP) = 1                         initial repeat offset
// CX (current position) starts at 1 and DX is left holding src_base.
13266 MOVL $0x00000000, 12(SP)
13267 MOVQ src_len+32(FP), CX
13268 LEAQ -9(CX), DX
13269 LEAQ -8(CX), BX
13270 MOVL BX, 8(SP)
13271 SHRQ $0x05, CX
13272 SUBL CX, DX
13273 LEAQ (AX)(DX*1), DX
13274 MOVQ DX, (SP)
13275 MOVL $0x00000001, CX
13276 MOVL CX, 16(SP)
13277 MOVQ src_base+24(FP), DX
13278
// One probe of the match search at position CX (an index relative to DX).
13279search_loop_encodeSnappyBlockAsm10B:
// Next position to try if nothing matches: CX + 4 + ((CX - 12(SP)) >> 5), so
// the step grows by one for every 32 bytes passed since the last emit point.
// If that would reach the bound in 8(SP), stop searching and emit the tail.
// The value is parked in 20(SP) for the no-match path.
13280 MOVL CX, BX
13281 SUBL 12(SP), BX
13282 SHRL $0x05, BX
13283 LEAL 4(CX)(BX*1), BX
13284 CMPL BX, 8(SP)
13285 JAE emit_remainder_encodeSnappyBlockAsm10B
13286 MOVQ (DX)(CX*1), SI
13287 MOVL BX, 20(SP)
// Hash the 4 bytes at CX (R9) and at CX+1 (R10): keep the low 32 bits by
// shifting left 32, multiply by 0x9e3779b1, and take the top 10 bits (>> 54).
13288 MOVQ $0x9e3779b1, R8
13289 MOVQ SI, R9
13290 MOVQ SI, R10
13291 SHRQ $0x08, R10
13292 SHLQ $0x20, R9
13293 IMULQ R8, R9
13294 SHRQ $0x36, R9
13295 SHLQ $0x20, R10
13296 IMULQ R8, R10
13297 SHRQ $0x36, R10
// Fetch the previous table entries as candidates (BX for CX, DI for CX+1),
// then record CX and CX+1 in their slots.
13298 MOVL 24(SP)(R9*4), BX
13299 MOVL 24(SP)(R10*4), DI
13300 MOVL CX, 24(SP)(R9*4)
13301 LEAL 1(CX), R9
13302 MOVL R9, 24(SP)(R10*4)
// R9 = hash of the 4 bytes at CX+2, consumed on the no_repeat_found path.
13303 MOVQ SI, R9
13304 SHRQ $0x10, R9
13305 SHLQ $0x20, R9
13306 IMULQ R8, R9
13307 SHRQ $0x36, R9
// Repeat check: compare the 4 bytes at CX+1 (SI>>8) with the 4 bytes at
// CX+1-16(SP), i.e. one repeat offset back.
13308 MOVL CX, R8
13309 SUBL 16(SP), R8
13310 MOVL 1(DX)(R8*1), R10
13311 MOVQ SI, R8
13312 SHRQ $0x08, R8
13313 CMPL R8, R10
13314 JNE no_repeat_found_encodeSnappyBlockAsm10B
// Repeat found. Set up backward extension: SI = CX+1 (match start),
// BX = 12(SP) (lower bound), DI = SI - offset (source of the repeat).
// If DI is already 0 there is nothing before it to compare.
13315 LEAL 1(CX), SI
13316 MOVL 12(SP), BX
13317 MOVL SI, DI
13318 SUBL 16(SP), DI
13319 JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
13320
// Extend the repeat match backwards. SI is the match start, DI the matching
// position one offset earlier, BX the lower bound (12(SP)).
// While SI > BX and the bytes just before DI and SI are equal, move both back
// by one; the loop also ends when DI reaches 0.
13321repeat_extend_back_loop_encodeSnappyBlockAsm10B:
13322 CMPL SI, BX
13323 JBE repeat_extend_back_end_encodeSnappyBlockAsm10B
13324 MOVB -1(DX)(DI*1), R8
13325 MOVB -1(DX)(SI*1), R9
13326 CMPB R8, R9
13327 JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
13328 LEAL -1(SI), SI
13329 DECL DI
13330 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
13331
13332repeat_extend_back_end_encodeSnappyBlockAsm10B:
13333 MOVL 12(SP), BX
13334 CMPL BX, SI
13335 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
13336 MOVL SI, DI
13337 MOVL SI, 12(SP)
13338 LEAQ (DX)(BX*1), R8
13339 SUBL BX, DI
13340 LEAL -1(DI), BX
13341 CMPL BX, $0x3c
13342 JB one_byte_repeat_emit_encodeSnappyBlockAsm10B
13343 CMPL BX, $0x00000100
13344 JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B
13345 JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B
13346
13347three_bytes_repeat_emit_encodeSnappyBlockAsm10B:
13348 MOVB $0xf4, (AX)
13349 MOVW BX, 1(AX)
13350 ADDQ $0x03, AX
13351 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
13352
13353two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
13354 MOVB $0xf0, (AX)
13355 MOVB BL, 1(AX)
13356 ADDQ $0x02, AX
13357 CMPL BX, $0x40
13358 JB memmove_repeat_emit_encodeSnappyBlockAsm10B
13359 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
13360
13361one_byte_repeat_emit_encodeSnappyBlockAsm10B:
13362 SHLB $0x02, BL
13363 MOVB BL, (AX)
13364 ADDQ $0x01, AX
13365
13366memmove_repeat_emit_encodeSnappyBlockAsm10B:
13367 LEAQ (AX)(DI*1), BX
13368
13369 // genMemMoveShort
13370 CMPQ DI, $0x08
13371 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
13372 CMPQ DI, $0x10
13373 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
13374 CMPQ DI, $0x20
13375 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
13376 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
13377
13378emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
13379 MOVQ (R8), R9
13380 MOVQ R9, (AX)
13381 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13382
13383emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
13384 MOVQ (R8), R9
13385 MOVQ -8(R8)(DI*1), R8
13386 MOVQ R9, (AX)
13387 MOVQ R8, -8(AX)(DI*1)
13388 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13389
13390emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
13391 MOVOU (R8), X0
13392 MOVOU -16(R8)(DI*1), X1
13393 MOVOU X0, (AX)
13394 MOVOU X1, -16(AX)(DI*1)
13395 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13396
13397emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
13398 MOVOU (R8), X0
13399 MOVOU 16(R8), X1
13400 MOVOU -32(R8)(DI*1), X2
13401 MOVOU -16(R8)(DI*1), X3
13402 MOVOU X0, (AX)
13403 MOVOU X1, 16(AX)
13404 MOVOU X2, -32(AX)(DI*1)
13405 MOVOU X3, -16(AX)(DI*1)
13406
13407memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
13408 MOVQ BX, AX
13409 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
13410
13411memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
13412 LEAQ (AX)(DI*1), BX
13413
13414 // genMemMoveLong
13415 MOVOU (R8), X0
13416 MOVOU 16(R8), X1
13417 MOVOU -32(R8)(DI*1), X2
13418 MOVOU -16(R8)(DI*1), X3
13419 MOVQ DI, R10
13420 SHRQ $0x05, R10
13421 MOVQ AX, R9
13422 ANDL $0x0000001f, R9
13423 MOVQ $0x00000040, R11
13424 SUBQ R9, R11
13425 DECQ R10
13426 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13427 LEAQ -32(R8)(R11*1), R9
13428 LEAQ -32(AX)(R11*1), R12
13429
13430emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
13431 MOVOU (R9), X4
13432 MOVOU 16(R9), X5
13433 MOVOA X4, (R12)
13434 MOVOA X5, 16(R12)
13435 ADDQ $0x20, R12
13436 ADDQ $0x20, R9
13437 ADDQ $0x20, R11
13438 DECQ R10
13439 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
13440
13441emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
13442 MOVOU -32(R8)(R11*1), X4
13443 MOVOU -16(R8)(R11*1), X5
13444 MOVOA X4, -32(AX)(R11*1)
13445 MOVOA X5, -16(AX)(R11*1)
13446 ADDQ $0x20, R11
13447 CMPQ DI, R11
13448 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13449 MOVOU X0, (AX)
13450 MOVOU X1, 16(AX)
13451 MOVOU X2, -32(AX)(DI*1)
13452 MOVOU X3, -16(AX)(DI*1)
13453 MOVQ BX, AX
13454
13455emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
13456 ADDL $0x05, CX
13457 MOVL CX, BX
13458 SUBL 16(SP), BX
13459 MOVQ src_len+32(FP), DI
13460 SUBL CX, DI
13461 LEAQ (DX)(CX*1), R8
13462 LEAQ (DX)(BX*1), BX
13463
13464 // matchLen
13465 XORL R10, R10
13466
13467matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B:
13468 CMPL DI, $0x10
13469 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B
13470 MOVQ (R8)(R10*1), R9
13471 MOVQ 8(R8)(R10*1), R11
13472 XORQ (BX)(R10*1), R9
13473 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
13474 XORQ 8(BX)(R10*1), R11
13475 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B
13476 LEAL -16(DI), DI
13477 LEAL 16(R10), R10
13478 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B
13479
13480matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B:
13481#ifdef GOAMD64_v3
13482 TZCNTQ R11, R11
13483
13484#else
13485 BSFQ R11, R11
13486
13487#endif
13488 SARQ $0x03, R11
13489 LEAL 8(R10)(R11*1), R10
13490 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
13491
13492matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B:
13493 CMPL DI, $0x08
13494 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
13495 MOVQ (R8)(R10*1), R9
13496 XORQ (BX)(R10*1), R9
13497 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
13498 LEAL -8(DI), DI
13499 LEAL 8(R10), R10
13500 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
13501
13502matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B:
13503#ifdef GOAMD64_v3
13504 TZCNTQ R9, R9
13505
13506#else
13507 BSFQ R9, R9
13508
13509#endif
13510 SARQ $0x03, R9
13511 LEAL (R10)(R9*1), R10
13512 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
13513
13514matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
13515 CMPL DI, $0x04
13516 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
13517 MOVL (R8)(R10*1), R9
13518 CMPL (BX)(R10*1), R9
13519 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
13520 LEAL -4(DI), DI
13521 LEAL 4(R10), R10
13522
13523matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
13524 CMPL DI, $0x01
13525 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
13526 JB repeat_extend_forward_end_encodeSnappyBlockAsm10B
13527 MOVW (R8)(R10*1), R9
13528 CMPW (BX)(R10*1), R9
13529 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
13530 LEAL 2(R10), R10
13531 SUBL $0x02, DI
13532 JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
13533
13534matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
13535 MOVB (R8)(R10*1), R9
13536 CMPB (BX)(R10*1), R9
13537 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
13538 LEAL 1(R10), R10
13539
13540repeat_extend_forward_end_encodeSnappyBlockAsm10B:
13541 ADDL R10, CX
13542 MOVL CX, BX
13543 SUBL SI, BX
13544 MOVL 16(SP), SI
13545
13546 // emitCopy
13547two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
13548 CMPL BX, $0x40
13549 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
13550 MOVB $0xee, (AX)
13551 MOVW SI, 1(AX)
13552 LEAL -60(BX), BX
13553 ADDQ $0x03, AX
13554 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
13555
13556two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
13557 MOVL BX, DI
13558 SHLL $0x02, DI
13559 CMPL BX, $0x0c
13560 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
13561 CMPL SI, $0x00000800
13562 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
13563 LEAL -15(DI), DI
13564 MOVB SI, 1(AX)
13565 SHRL $0x08, SI
13566 SHLL $0x05, SI
13567 ORL SI, DI
13568 MOVB DI, (AX)
13569 ADDQ $0x02, AX
13570 JMP repeat_end_emit_encodeSnappyBlockAsm10B
13571
13572emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
13573 LEAL -2(DI), DI
13574 MOVB DI, (AX)
13575 MOVW SI, 1(AX)
13576 ADDQ $0x03, AX
13577
13578repeat_end_emit_encodeSnappyBlockAsm10B:
13579 MOVL CX, 12(SP)
13580 JMP search_loop_encodeSnappyBlockAsm10B
13581
// No repeat at CX+1: test the table candidates gathered in the search loop.
// On entry SI = 8 bytes loaded at CX, BX = candidate for CX, DI = candidate
// for CX+1, R9 = hash of the bytes at CX+2.
//   1. 4 bytes at candidate BX == bytes at CX      -> candidate_match
//   2. SI >>= 8 (bytes at CX+1); BX is reloaded with the table entry for the
//      CX+2 hash and R8 = CX+2.
//      4 bytes at candidate DI == bytes at CX+1    -> candidate2_match
//   3. table[R9] = CX+2; SI >>= 8 (bytes at CX+2).
//      4 bytes at candidate BX == bytes at CX+2    -> candidate3_match
//   4. Nothing matched: continue the search at the position saved in 20(SP).
13582no_repeat_found_encodeSnappyBlockAsm10B:
13583 CMPL (DX)(BX*1), SI
13584 JEQ candidate_match_encodeSnappyBlockAsm10B
13585 SHRQ $0x08, SI
13586 MOVL 24(SP)(R9*4), BX
13587 LEAL 2(CX), R8
13588 CMPL (DX)(DI*1), SI
13589 JEQ candidate2_match_encodeSnappyBlockAsm10B
13590 MOVL R8, 24(SP)(R9*4)
13591 SHRQ $0x08, SI
13592 CMPL (DX)(BX*1), SI
13593 JEQ candidate3_match_encodeSnappyBlockAsm10B
13594 MOVL 20(SP), CX
13595 JMP search_loop_encodeSnappyBlockAsm10B
13596
// The match was found at CX+2: move CX there; BX already holds its candidate.
13597candidate3_match_encodeSnappyBlockAsm10B:
13598 ADDL $0x02, CX
13599 JMP candidate_match_encodeSnappyBlockAsm10B
13600
// The match was found at CX+1: store the pending table entry (R8 = CX+2 into
// table[R9]), move CX forward by one and make DI the candidate.
13601candidate2_match_encodeSnappyBlockAsm10B:
13602 MOVL R8, 24(SP)(R9*4)
13603 INCL CX
13604 MOVL DI, BX
13605
// Common entry with CX = match position and BX = candidate position.
// SI = 12(SP) is the lower bound for backward extension; a candidate at
// index 0 has no preceding byte, so the extension loop is skipped.
13606candidate_match_encodeSnappyBlockAsm10B:
13607 MOVL 12(SP), SI
13608 TESTL BX, BX
13609 JZ match_extend_back_end_encodeSnappyBlockAsm10B
13610
// Extend the match backwards: while CX is above the lower bound SI and the
// bytes just before the candidate (BX) and the current position (CX) are
// equal, step both back by one. Stops as soon as BX reaches 0.
13611match_extend_back_loop_encodeSnappyBlockAsm10B:
13612 CMPL CX, SI
13613 JBE match_extend_back_end_encodeSnappyBlockAsm10B
13614 MOVB -1(DX)(BX*1), DI
13615 MOVB -1(DX)(CX*1), R8
13616 CMPB DI, R8
13617 JNE match_extend_back_end_encodeSnappyBlockAsm10B
13618 LEAL -1(CX), CX
13619 DECL BX
13620 JZ match_extend_back_end_encodeSnappyBlockAsm10B
13621 JMP match_extend_back_loop_encodeSnappyBlockAsm10B
13622
// Output space check before writing the pending literal: CX - 12(SP) is the
// literal length, and AX + length + 3 (3 being the largest literal header
// written below) must stay below the limit in (SP). Otherwise return 0.
13623match_extend_back_end_encodeSnappyBlockAsm10B:
13624 MOVL CX, SI
13625 SUBL 12(SP), SI
13626 LEAQ 3(AX)(SI*1), SI
13627 CMPQ SI, (SP)
13628 JB match_dst_size_check_encodeSnappyBlockAsm10B
13629 MOVQ $0x00000000, ret+48(FP)
13630 RET
13631
// Emit the literal that precedes the match, if there is one.
// If 12(SP) == CX nothing is pending. Otherwise R8 = literal length
// (CX - 12(SP)), SI = pointer to its first byte, DI = length-1, and 12(SP) is
// advanced to CX. The header size is picked from length-1:
//   < 60  -> one-byte header
//   < 256 -> two-byte header
//   else  -> three-byte header. The second JB can never be taken (the flags are
//            unchanged since the JB before it fell through); execution simply
//            falls into the three-byte label that follows.
13632match_dst_size_check_encodeSnappyBlockAsm10B:
13633 MOVL CX, SI
13634 MOVL 12(SP), DI
13635 CMPL DI, SI
13636 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
13637 MOVL SI, R8
13638 MOVL SI, 12(SP)
13639 LEAQ (DX)(DI*1), SI
13640 SUBL DI, R8
13641 LEAL -1(R8), DI
13642 CMPL DI, $0x3c
13643 JB one_byte_match_emit_encodeSnappyBlockAsm10B
13644 CMPL DI, $0x00000100
13645 JB two_bytes_match_emit_encodeSnappyBlockAsm10B
13646 JB three_bytes_match_emit_encodeSnappyBlockAsm10B
13647
// Three-byte literal header: tag 0xf4 (== 61<<2, which the Snappy block format
// defines as "literal, length-1 follows in 2 bytes") and the 16-bit value of
// DI (length-1). The payload is then copied by the long memmove path.
13648three_bytes_match_emit_encodeSnappyBlockAsm10B:
13649 MOVB $0xf4, (AX)
13650 MOVW DI, 1(AX)
13651 ADDQ $0x03, AX
13652 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
13653
// Two-byte literal header: tag 0xf0 (== 60<<2, "literal, length-1 follows in
// 1 byte" in the Snappy block format) and the low byte of DI (length-1).
// Payloads with length-1 < 64 use the short memmove; longer ones the long one.
13654two_bytes_match_emit_encodeSnappyBlockAsm10B:
13655 MOVB $0xf0, (AX)
13656 MOVB DI, 1(AX)
13657 ADDQ $0x02, AX
13658 CMPL DI, $0x40
13659 JB memmove_match_emit_encodeSnappyBlockAsm10B
13660 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
13661
// One-byte literal header: the tag is (length-1)<<2 (DI < 60 on this path, so
// the shift fits in a byte). Falls through to the short memmove.
13662one_byte_match_emit_encodeSnappyBlockAsm10B:
13663 SHLB $0x02, DI
13664 MOVB DI, (AX)
13665 ADDQ $0x01, AX
13666
13667memmove_match_emit_encodeSnappyBlockAsm10B:
13668 LEAQ (AX)(R8*1), DI
13669
13670 // genMemMoveShort
13671 CMPQ R8, $0x08
13672 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
13673 CMPQ R8, $0x10
13674 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
13675 CMPQ R8, $0x20
13676 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
13677 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
13678
// Short copy for R8 <= 8: always moves a full 8 bytes from (SI) to (AX),
// whatever R8 is. The cursor is set afterwards to DI (= AX + R8, computed
// before the dispatch), so only R8 bytes count as output.
// NOTE(review): for R8 < 8 this reads and writes beyond the literal;
// presumably the margins enforced by the 8(SP) / (SP) bounds make that safe -
// confirm before reusing this pattern elsewhere.
13679emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
13680 MOVQ (SI), R9
13681 MOVQ R9, (AX)
13682 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13683
13684emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
13685 MOVQ (SI), R9
13686 MOVQ -8(SI)(R8*1), SI
13687 MOVQ R9, (AX)
13688 MOVQ SI, -8(AX)(R8*1)
13689 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13690
13691emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
13692 MOVOU (SI), X0
13693 MOVOU -16(SI)(R8*1), X1
13694 MOVOU X0, (AX)
13695 MOVOU X1, -16(AX)(R8*1)
13696 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13697
13698emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
13699 MOVOU (SI), X0
13700 MOVOU 16(SI), X1
13701 MOVOU -32(SI)(R8*1), X2
13702 MOVOU -16(SI)(R8*1), X3
13703 MOVOU X0, (AX)
13704 MOVOU X1, 16(AX)
13705 MOVOU X2, -32(AX)(R8*1)
13706 MOVOU X3, -16(AX)(R8*1)
13707
13708memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
13709 MOVQ DI, AX
13710 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
13711
13712memmove_long_match_emit_encodeSnappyBlockAsm10B:
13713 LEAQ (AX)(R8*1), DI
13714
13715 // genMemMoveLong
13716 MOVOU (SI), X0
13717 MOVOU 16(SI), X1
13718 MOVOU -32(SI)(R8*1), X2
13719 MOVOU -16(SI)(R8*1), X3
13720 MOVQ R8, R10
13721 SHRQ $0x05, R10
13722 MOVQ AX, R9
13723 ANDL $0x0000001f, R9
13724 MOVQ $0x00000040, R11
13725 SUBQ R9, R11
13726 DECQ R10
13727 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13728 LEAQ -32(SI)(R11*1), R9
13729 LEAQ -32(AX)(R11*1), R12
13730
13731emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
13732 MOVOU (R9), X4
13733 MOVOU 16(R9), X5
13734 MOVOA X4, (R12)
13735 MOVOA X5, 16(R12)
13736 ADDQ $0x20, R12
13737 ADDQ $0x20, R9
13738 ADDQ $0x20, R11
13739 DECQ R10
13740 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
13741
// Main loop of the long copy: 32 bytes per iteration from SI to AX at running
// offset R11, using unaligned loads (MOVOU) and aligned stores (MOVOA).
// R11 was initialised above as 64 - (AX & 31) and only ever grows by 32, so
// the store address AX + R11 - 32 stays 32-byte aligned. Runs while R8 >= R11.
13742emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
13743 MOVOU -32(SI)(R11*1), X4
13744 MOVOU -16(SI)(R11*1), X5
13745 MOVOA X4, -32(AX)(R11*1)
13746 MOVOA X5, -16(AX)(R11*1)
13747 ADDQ $0x20, R11
13748 CMPQ R8, R11
13749 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
// X0/X1 and X2/X3 hold the first and last 32 source bytes, loaded before the
// loop. Writing them with unaligned stores fills the first and last 32 bytes
// of the destination range. Finally AX = DI, the cursor past the R8 bytes.
13750 MOVOU X0, (AX)
13751 MOVOU X1, 16(AX)
13752 MOVOU X2, -32(AX)(R8*1)
13753 MOVOU X3, -16(AX)(R8*1)
13754 MOVQ DI, AX
13755
13756emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
13757match_nolit_loop_encodeSnappyBlockAsm10B:
13758 MOVL CX, SI
13759 SUBL BX, SI
13760 MOVL SI, 16(SP)
13761 ADDL $0x04, CX
13762 ADDL $0x04, BX
13763 MOVQ src_len+32(FP), SI
13764 SUBL CX, SI
13765 LEAQ (DX)(CX*1), DI
13766 LEAQ (DX)(BX*1), BX
13767
13768 // matchLen
13769 XORL R9, R9
13770
13771matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B:
13772 CMPL SI, $0x10
13773 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B
13774 MOVQ (DI)(R9*1), R8
13775 MOVQ 8(DI)(R9*1), R10
13776 XORQ (BX)(R9*1), R8
13777 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
13778 XORQ 8(BX)(R9*1), R10
13779 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B
13780 LEAL -16(SI), SI
13781 LEAL 16(R9), R9
13782 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B
13783
13784matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B:
13785#ifdef GOAMD64_v3
13786 TZCNTQ R10, R10
13787
13788#else
13789 BSFQ R10, R10
13790
13791#endif
13792 SARQ $0x03, R10
13793 LEAL 8(R9)(R10*1), R9
13794 JMP match_nolit_end_encodeSnappyBlockAsm10B
13795
13796matchlen_match8_match_nolit_encodeSnappyBlockAsm10B:
13797 CMPL SI, $0x08
13798 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
13799 MOVQ (DI)(R9*1), R8
13800 XORQ (BX)(R9*1), R8
13801 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
13802 LEAL -8(SI), SI
13803 LEAL 8(R9), R9
13804 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
13805
13806matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B:
13807#ifdef GOAMD64_v3
13808 TZCNTQ R8, R8
13809
13810#else
13811 BSFQ R8, R8
13812
13813#endif
13814 SARQ $0x03, R8
13815 LEAL (R9)(R8*1), R9
13816 JMP match_nolit_end_encodeSnappyBlockAsm10B
13817
13818matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
13819 CMPL SI, $0x04
13820 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
13821 MOVL (DI)(R9*1), R8
13822 CMPL (BX)(R9*1), R8
13823 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
13824 LEAL -4(SI), SI
13825 LEAL 4(R9), R9
13826
13827matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
13828 CMPL SI, $0x01
13829 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
13830 JB match_nolit_end_encodeSnappyBlockAsm10B
13831 MOVW (DI)(R9*1), R8
13832 CMPW (BX)(R9*1), R8
13833 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
13834 LEAL 2(R9), R9
13835 SUBL $0x02, SI
13836 JZ match_nolit_end_encodeSnappyBlockAsm10B
13837
13838matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
13839 MOVB (DI)(R9*1), R8
13840 CMPB (BX)(R9*1), R8
13841 JNE match_nolit_end_encodeSnappyBlockAsm10B
13842 LEAL 1(R9), R9
13843
13844match_nolit_end_encodeSnappyBlockAsm10B:
13845 ADDL R9, CX
13846 MOVL 16(SP), BX
13847 ADDL $0x04, R9
13848 MOVL CX, 12(SP)
13849
13850 // emitCopy
13851two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
13852 CMPL R9, $0x40
13853 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
13854 MOVB $0xee, (AX)
13855 MOVW BX, 1(AX)
13856 LEAL -60(R9), R9
13857 ADDQ $0x03, AX
13858 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
13859
13860two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
13861 MOVL R9, SI
13862 SHLL $0x02, SI
13863 CMPL R9, $0x0c
13864 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
13865 CMPL BX, $0x00000800
13866 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
13867 LEAL -15(SI), SI
13868 MOVB BL, 1(AX)
13869 SHRL $0x08, BX
13870 SHLL $0x05, BX
13871 ORL BX, SI
13872 MOVB SI, (AX)
13873 ADDQ $0x02, AX
13874 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
13875
13876emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
13877 LEAL -2(SI), SI
13878 MOVB SI, (AX)
13879 MOVW BX, 1(AX)
13880 ADDQ $0x03, AX
13881
13882match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
13883 CMPL CX, 8(SP)
13884 JAE emit_remainder_encodeSnappyBlockAsm10B
13885 MOVQ -2(DX)(CX*1), SI
13886 CMPQ AX, (SP)
13887 JB match_nolit_dst_ok_encodeSnappyBlockAsm10B
13888 MOVQ $0x00000000, ret+48(FP)
13889 RET
13890
13891match_nolit_dst_ok_encodeSnappyBlockAsm10B:
13892 MOVQ $0x9e3779b1, R8
13893 MOVQ SI, DI
13894 SHRQ $0x10, SI
13895 MOVQ SI, BX
13896 SHLQ $0x20, DI
13897 IMULQ R8, DI
13898 SHRQ $0x36, DI
13899 SHLQ $0x20, BX
13900 IMULQ R8, BX
13901 SHRQ $0x36, BX
13902 LEAL -2(CX), R8
13903 LEAQ 24(SP)(BX*4), R9
13904 MOVL (R9), BX
13905 MOVL R8, 24(SP)(DI*4)
13906 MOVL CX, (R9)
13907 CMPL (DX)(BX*1), SI
13908 JEQ match_nolit_loop_encodeSnappyBlockAsm10B
13909 INCL CX
13910 JMP search_loop_encodeSnappyBlockAsm10B
13911
13912emit_remainder_encodeSnappyBlockAsm10B:
13913 MOVQ src_len+32(FP), CX
13914 SUBL 12(SP), CX
13915 LEAQ 3(AX)(CX*1), CX
13916 CMPQ CX, (SP)
13917 JB emit_remainder_ok_encodeSnappyBlockAsm10B
13918 MOVQ $0x00000000, ret+48(FP)
13919 RET
13920
13921emit_remainder_ok_encodeSnappyBlockAsm10B:
13922 MOVQ src_len+32(FP), CX
13923 MOVL 12(SP), BX
13924 CMPL BX, CX
13925 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
13926 MOVL CX, SI
13927 MOVL CX, 12(SP)
13928 LEAQ (DX)(BX*1), CX
13929 SUBL BX, SI
13930 LEAL -1(SI), DX
13931 CMPL DX, $0x3c
13932 JB one_byte_emit_remainder_encodeSnappyBlockAsm10B
13933 CMPL DX, $0x00000100
13934 JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B
13935 JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B
13936
13937three_bytes_emit_remainder_encodeSnappyBlockAsm10B:
13938 MOVB $0xf4, (AX)
13939 MOVW DX, 1(AX)
13940 ADDQ $0x03, AX
13941 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
13942
13943two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
13944 MOVB $0xf0, (AX)
13945 MOVB DL, 1(AX)
13946 ADDQ $0x02, AX
13947 CMPL DX, $0x40
13948 JB memmove_emit_remainder_encodeSnappyBlockAsm10B
13949 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
13950
13951one_byte_emit_remainder_encodeSnappyBlockAsm10B:
13952 SHLB $0x02, DL
13953 MOVB DL, (AX)
13954 ADDQ $0x01, AX
13955
13956memmove_emit_remainder_encodeSnappyBlockAsm10B:
13957 LEAQ (AX)(SI*1), DX
13958 MOVL SI, BX
13959
13960 // genMemMoveShort
13961 CMPQ BX, $0x03
13962 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
13963 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
13964 CMPQ BX, $0x08
13965 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
13966 CMPQ BX, $0x10
13967 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
13968 CMPQ BX, $0x20
13969 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
13970 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
13971
13972emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
13973 MOVB (CX), SI
13974 MOVB -1(CX)(BX*1), CL
13975 MOVB SI, (AX)
13976 MOVB CL, -1(AX)(BX*1)
13977 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
13978
13979emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
13980 MOVW (CX), SI
13981 MOVB 2(CX), CL
13982 MOVW SI, (AX)
13983 MOVB CL, 2(AX)
13984 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
13985
13986emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
13987 MOVL (CX), SI
13988 MOVL -4(CX)(BX*1), CX
13989 MOVL SI, (AX)
13990 MOVL CX, -4(AX)(BX*1)
13991 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
13992
13993emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
13994 MOVQ (CX), SI
13995 MOVQ -8(CX)(BX*1), CX
13996 MOVQ SI, (AX)
13997 MOVQ CX, -8(AX)(BX*1)
13998 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
13999
14000emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
14001 MOVOU (CX), X0
14002 MOVOU -16(CX)(BX*1), X1
14003 MOVOU X0, (AX)
14004 MOVOU X1, -16(AX)(BX*1)
14005 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14006
14007emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
14008 MOVOU (CX), X0
14009 MOVOU 16(CX), X1
14010 MOVOU -32(CX)(BX*1), X2
14011 MOVOU -16(CX)(BX*1), X3
14012 MOVOU X0, (AX)
14013 MOVOU X1, 16(AX)
14014 MOVOU X2, -32(AX)(BX*1)
14015 MOVOU X3, -16(AX)(BX*1)
14016
14017memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
14018 MOVQ DX, AX
14019 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
14020
14021memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
14022 LEAQ (AX)(SI*1), DX
14023 MOVL SI, BX
14024
14025 // genMemMoveLong
14026 MOVOU (CX), X0
14027 MOVOU 16(CX), X1
14028 MOVOU -32(CX)(BX*1), X2
14029 MOVOU -16(CX)(BX*1), X3
14030 MOVQ BX, DI
14031 SHRQ $0x05, DI
14032 MOVQ AX, SI
14033 ANDL $0x0000001f, SI
14034 MOVQ $0x00000040, R8
14035 SUBQ SI, R8
14036 DECQ DI
14037 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
14038 LEAQ -32(CX)(R8*1), SI
14039 LEAQ -32(AX)(R8*1), R9
14040
14041emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
14042 MOVOU (SI), X4
14043 MOVOU 16(SI), X5
14044 MOVOA X4, (R9)
14045 MOVOA X5, 16(R9)
14046 ADDQ $0x20, R9
14047 ADDQ $0x20, SI
14048 ADDQ $0x20, R8
14049 DECQ DI
14050 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
14051
14052emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
14053 MOVOU -32(CX)(R8*1), X4
14054 MOVOU -16(CX)(R8*1), X5
14055 MOVOA X4, -32(AX)(R8*1)
14056 MOVOA X5, -16(AX)(R8*1)
14057 ADDQ $0x20, R8
14058 CMPQ BX, R8
14059 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
14060 MOVOU X0, (AX)
14061 MOVOU X1, 16(AX)
14062 MOVOU X2, -32(AX)(BX*1)
14063 MOVOU X3, -16(AX)(BX*1)
14064 MOVQ DX, AX
14065
14066emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
14067 MOVQ dst_base+0(FP), CX
14068 SUBQ CX, AX
14069 MOVQ AX, ret+48(FP)
14070 RET
14071
14072// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
14073// Requires: BMI, SSE2
14074TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
14075 MOVQ dst_base+0(FP), AX
14076 MOVQ $0x00000008, CX
14077 LEAQ 24(SP), DX
14078 PXOR X0, X0
14079
14080zero_loop_encodeSnappyBlockAsm8B:
14081 MOVOU X0, (DX)
14082 MOVOU X0, 16(DX)
14083 MOVOU X0, 32(DX)
14084 MOVOU X0, 48(DX)
14085 MOVOU X0, 64(DX)
14086 MOVOU X0, 80(DX)
14087 MOVOU X0, 96(DX)
14088 MOVOU X0, 112(DX)
14089 ADDQ $0x80, DX
14090 DECQ CX
14091 JNZ zero_loop_encodeSnappyBlockAsm8B
14092 MOVL $0x00000000, 12(SP)
14093 MOVQ src_len+32(FP), CX
14094 LEAQ -9(CX), DX
14095 LEAQ -8(CX), BX
14096 MOVL BX, 8(SP)
14097 SHRQ $0x05, CX
14098 SUBL CX, DX
14099 LEAQ (AX)(DX*1), DX
14100 MOVQ DX, (SP)
14101 MOVL $0x00000001, CX
14102 MOVL CX, 16(SP)
14103 MOVQ src_base+24(FP), DX
14104
14105search_loop_encodeSnappyBlockAsm8B:
14106 MOVL CX, BX
14107 SUBL 12(SP), BX
14108 SHRL $0x04, BX
14109 LEAL 4(CX)(BX*1), BX
14110 CMPL BX, 8(SP)
14111 JAE emit_remainder_encodeSnappyBlockAsm8B
14112 MOVQ (DX)(CX*1), SI
14113 MOVL BX, 20(SP)
14114 MOVQ $0x9e3779b1, R8
14115 MOVQ SI, R9
14116 MOVQ SI, R10
14117 SHRQ $0x08, R10
14118 SHLQ $0x20, R9
14119 IMULQ R8, R9
14120 SHRQ $0x38, R9
14121 SHLQ $0x20, R10
14122 IMULQ R8, R10
14123 SHRQ $0x38, R10
14124 MOVL 24(SP)(R9*4), BX
14125 MOVL 24(SP)(R10*4), DI
14126 MOVL CX, 24(SP)(R9*4)
14127 LEAL 1(CX), R9
14128 MOVL R9, 24(SP)(R10*4)
14129 MOVQ SI, R9
14130 SHRQ $0x10, R9
14131 SHLQ $0x20, R9
14132 IMULQ R8, R9
14133 SHRQ $0x38, R9
14134 MOVL CX, R8
14135 SUBL 16(SP), R8
14136 MOVL 1(DX)(R8*1), R10
14137 MOVQ SI, R8
14138 SHRQ $0x08, R8
14139 CMPL R8, R10
14140 JNE no_repeat_found_encodeSnappyBlockAsm8B
14141 LEAL 1(CX), SI
14142 MOVL 12(SP), BX
14143 MOVL SI, DI
14144 SUBL 16(SP), DI
14145 JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
14146
14147repeat_extend_back_loop_encodeSnappyBlockAsm8B:
14148 CMPL SI, BX
14149 JBE repeat_extend_back_end_encodeSnappyBlockAsm8B
14150 MOVB -1(DX)(DI*1), R8
14151 MOVB -1(DX)(SI*1), R9
14152 CMPB R8, R9
14153 JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
14154 LEAL -1(SI), SI
14155 DECL DI
14156 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
14157
14158repeat_extend_back_end_encodeSnappyBlockAsm8B:
14159 MOVL 12(SP), BX
14160 CMPL BX, SI
14161 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
14162 MOVL SI, DI
14163 MOVL SI, 12(SP)
14164 LEAQ (DX)(BX*1), R8
14165 SUBL BX, DI
14166 LEAL -1(DI), BX
14167 CMPL BX, $0x3c
14168 JB one_byte_repeat_emit_encodeSnappyBlockAsm8B
14169 CMPL BX, $0x00000100
14170 JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B
14171 JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B
14172
14173three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
14174 MOVB $0xf4, (AX)
14175 MOVW BX, 1(AX)
14176 ADDQ $0x03, AX
14177 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
14178
14179two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
14180 MOVB $0xf0, (AX)
14181 MOVB BL, 1(AX)
14182 ADDQ $0x02, AX
14183 CMPL BX, $0x40
14184 JB memmove_repeat_emit_encodeSnappyBlockAsm8B
14185 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
14186
14187one_byte_repeat_emit_encodeSnappyBlockAsm8B:
14188 SHLB $0x02, BL
14189 MOVB BL, (AX)
14190 ADDQ $0x01, AX
14191
14192memmove_repeat_emit_encodeSnappyBlockAsm8B:
14193 LEAQ (AX)(DI*1), BX
14194
14195 // genMemMoveShort
14196 CMPQ DI, $0x08
14197 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
14198 CMPQ DI, $0x10
14199 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
14200 CMPQ DI, $0x20
14201 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
14202 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
14203
14204emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
14205 MOVQ (R8), R9
14206 MOVQ R9, (AX)
14207 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14208
14209emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
14210 MOVQ (R8), R9
14211 MOVQ -8(R8)(DI*1), R8
14212 MOVQ R9, (AX)
14213 MOVQ R8, -8(AX)(DI*1)
14214 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14215
14216emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
14217 MOVOU (R8), X0
14218 MOVOU -16(R8)(DI*1), X1
14219 MOVOU X0, (AX)
14220 MOVOU X1, -16(AX)(DI*1)
14221 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14222
14223emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
14224 MOVOU (R8), X0
14225 MOVOU 16(R8), X1
14226 MOVOU -32(R8)(DI*1), X2
14227 MOVOU -16(R8)(DI*1), X3
14228 MOVOU X0, (AX)
14229 MOVOU X1, 16(AX)
14230 MOVOU X2, -32(AX)(DI*1)
14231 MOVOU X3, -16(AX)(DI*1)
14232
14233memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
14234 MOVQ BX, AX
14235 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
14236
14237memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
14238 LEAQ (AX)(DI*1), BX
14239
14240 // genMemMoveLong
14241 MOVOU (R8), X0
14242 MOVOU 16(R8), X1
14243 MOVOU -32(R8)(DI*1), X2
14244 MOVOU -16(R8)(DI*1), X3
14245 MOVQ DI, R10
14246 SHRQ $0x05, R10
14247 MOVQ AX, R9
14248 ANDL $0x0000001f, R9
14249 MOVQ $0x00000040, R11
14250 SUBQ R9, R11
14251 DECQ R10
14252 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14253 LEAQ -32(R8)(R11*1), R9
14254 LEAQ -32(AX)(R11*1), R12
14255
14256emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
14257 MOVOU (R9), X4
14258 MOVOU 16(R9), X5
14259 MOVOA X4, (R12)
14260 MOVOA X5, 16(R12)
14261 ADDQ $0x20, R12
14262 ADDQ $0x20, R9
14263 ADDQ $0x20, R11
14264 DECQ R10
14265 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
14266
14267emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14268 MOVOU -32(R8)(R11*1), X4
14269 MOVOU -16(R8)(R11*1), X5
14270 MOVOA X4, -32(AX)(R11*1)
14271 MOVOA X5, -16(AX)(R11*1)
14272 ADDQ $0x20, R11
14273 CMPQ DI, R11
14274 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14275 MOVOU X0, (AX)
14276 MOVOU X1, 16(AX)
14277 MOVOU X2, -32(AX)(DI*1)
14278 MOVOU X3, -16(AX)(DI*1)
14279 MOVQ BX, AX
14280
14281emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
14282 ADDL $0x05, CX
14283 MOVL CX, BX
14284 SUBL 16(SP), BX
14285 MOVQ src_len+32(FP), DI
14286 SUBL CX, DI
14287 LEAQ (DX)(CX*1), R8
14288 LEAQ (DX)(BX*1), BX
14289
14290 // matchLen
14291 XORL R10, R10
14292
14293matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B:
14294 CMPL DI, $0x10
14295 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B
14296 MOVQ (R8)(R10*1), R9
14297 MOVQ 8(R8)(R10*1), R11
14298 XORQ (BX)(R10*1), R9
14299 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
14300 XORQ 8(BX)(R10*1), R11
14301 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B
14302 LEAL -16(DI), DI
14303 LEAL 16(R10), R10
14304 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B
14305
14306matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B:
14307#ifdef GOAMD64_v3
14308 TZCNTQ R11, R11
14309
14310#else
14311 BSFQ R11, R11
14312
14313#endif
14314 SARQ $0x03, R11
14315 LEAL 8(R10)(R11*1), R10
14316 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
14317
14318matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B:
14319 CMPL DI, $0x08
14320 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
14321 MOVQ (R8)(R10*1), R9
14322 XORQ (BX)(R10*1), R9
14323 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
14324 LEAL -8(DI), DI
14325 LEAL 8(R10), R10
14326 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
14327
14328matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B:
14329#ifdef GOAMD64_v3
14330 TZCNTQ R9, R9
14331
14332#else
14333 BSFQ R9, R9
14334
14335#endif
14336 SARQ $0x03, R9
14337 LEAL (R10)(R9*1), R10
14338 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
14339
14340matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
14341 CMPL DI, $0x04
14342 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
14343 MOVL (R8)(R10*1), R9
14344 CMPL (BX)(R10*1), R9
14345 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
14346 LEAL -4(DI), DI
14347 LEAL 4(R10), R10
14348
14349matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
14350 CMPL DI, $0x01
14351 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
14352 JB repeat_extend_forward_end_encodeSnappyBlockAsm8B
14353 MOVW (R8)(R10*1), R9
14354 CMPW (BX)(R10*1), R9
14355 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
14356 LEAL 2(R10), R10
14357 SUBL $0x02, DI
14358 JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
14359
14360matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
14361 MOVB (R8)(R10*1), R9
14362 CMPB (BX)(R10*1), R9
14363 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
14364 LEAL 1(R10), R10
14365
14366repeat_extend_forward_end_encodeSnappyBlockAsm8B:
14367 ADDL R10, CX
14368 MOVL CX, BX
14369 SUBL SI, BX
14370 MOVL 16(SP), SI
14371
14372 // emitCopy
14373two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
14374 CMPL BX, $0x40
14375 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
14376 MOVB $0xee, (AX)
14377 MOVW SI, 1(AX)
14378 LEAL -60(BX), BX
14379 ADDQ $0x03, AX
14380 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
14381
14382two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
14383 MOVL BX, DI
14384 SHLL $0x02, DI
14385 CMPL BX, $0x0c
14386 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
14387 LEAL -15(DI), DI
14388 MOVB SI, 1(AX)
14389 SHRL $0x08, SI
14390 SHLL $0x05, SI
14391 ORL SI, DI
14392 MOVB DI, (AX)
14393 ADDQ $0x02, AX
14394 JMP repeat_end_emit_encodeSnappyBlockAsm8B
14395
14396emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
14397 LEAL -2(DI), DI
14398 MOVB DI, (AX)
14399 MOVW SI, 1(AX)
14400 ADDQ $0x03, AX
14401
14402repeat_end_emit_encodeSnappyBlockAsm8B:
14403 MOVL CX, 12(SP)
14404 JMP search_loop_encodeSnappyBlockAsm8B
14405
14406no_repeat_found_encodeSnappyBlockAsm8B:
14407 CMPL (DX)(BX*1), SI
14408 JEQ candidate_match_encodeSnappyBlockAsm8B
14409 SHRQ $0x08, SI
14410 MOVL 24(SP)(R9*4), BX
14411 LEAL 2(CX), R8
14412 CMPL (DX)(DI*1), SI
14413 JEQ candidate2_match_encodeSnappyBlockAsm8B
14414 MOVL R8, 24(SP)(R9*4)
14415 SHRQ $0x08, SI
14416 CMPL (DX)(BX*1), SI
14417 JEQ candidate3_match_encodeSnappyBlockAsm8B
14418 MOVL 20(SP), CX
14419 JMP search_loop_encodeSnappyBlockAsm8B
14420
14421candidate3_match_encodeSnappyBlockAsm8B:
14422 ADDL $0x02, CX
14423 JMP candidate_match_encodeSnappyBlockAsm8B
14424
14425candidate2_match_encodeSnappyBlockAsm8B:
14426 MOVL R8, 24(SP)(R9*4)
14427 INCL CX
14428 MOVL DI, BX
14429
14430candidate_match_encodeSnappyBlockAsm8B:
14431 MOVL 12(SP), SI
14432 TESTL BX, BX
14433 JZ match_extend_back_end_encodeSnappyBlockAsm8B
14434
14435match_extend_back_loop_encodeSnappyBlockAsm8B:
14436 CMPL CX, SI
14437 JBE match_extend_back_end_encodeSnappyBlockAsm8B
14438 MOVB -1(DX)(BX*1), DI
14439 MOVB -1(DX)(CX*1), R8
14440 CMPB DI, R8
14441 JNE match_extend_back_end_encodeSnappyBlockAsm8B
14442 LEAL -1(CX), CX
14443 DECL BX
14444 JZ match_extend_back_end_encodeSnappyBlockAsm8B
14445 JMP match_extend_back_loop_encodeSnappyBlockAsm8B
14446
14447match_extend_back_end_encodeSnappyBlockAsm8B:
14448 MOVL CX, SI
14449 SUBL 12(SP), SI
14450 LEAQ 3(AX)(SI*1), SI
14451 CMPQ SI, (SP)
14452 JB match_dst_size_check_encodeSnappyBlockAsm8B
14453 MOVQ $0x00000000, ret+48(FP)
14454 RET
14455
14456match_dst_size_check_encodeSnappyBlockAsm8B:
14457 MOVL CX, SI
14458 MOVL 12(SP), DI
14459 CMPL DI, SI
14460 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
14461 MOVL SI, R8
14462 MOVL SI, 12(SP)
14463 LEAQ (DX)(DI*1), SI
14464 SUBL DI, R8
14465 LEAL -1(R8), DI
14466 CMPL DI, $0x3c
14467 JB one_byte_match_emit_encodeSnappyBlockAsm8B
14468 CMPL DI, $0x00000100
14469 JB two_bytes_match_emit_encodeSnappyBlockAsm8B
14470 JB three_bytes_match_emit_encodeSnappyBlockAsm8B
14471
14472three_bytes_match_emit_encodeSnappyBlockAsm8B:
14473 MOVB $0xf4, (AX)
14474 MOVW DI, 1(AX)
14475 ADDQ $0x03, AX
14476 JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
14477
14478two_bytes_match_emit_encodeSnappyBlockAsm8B:
14479 MOVB $0xf0, (AX)
14480 MOVB DI, 1(AX)
14481 ADDQ $0x02, AX
14482 CMPL DI, $0x40
14483 JB memmove_match_emit_encodeSnappyBlockAsm8B
14484 JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
14485
14486one_byte_match_emit_encodeSnappyBlockAsm8B:
14487 SHLB $0x02, DI
14488 MOVB DI, (AX)
14489 ADDQ $0x01, AX
14490
14491memmove_match_emit_encodeSnappyBlockAsm8B:
14492 LEAQ (AX)(R8*1), DI
14493
14494 // genMemMoveShort
14495 CMPQ R8, $0x08
14496 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
14497 CMPQ R8, $0x10
14498 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
14499 CMPQ R8, $0x20
14500 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
14501 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
14502
14503emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
14504 MOVQ (SI), R9
14505 MOVQ R9, (AX)
14506 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14507
14508emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
14509 MOVQ (SI), R9
14510 MOVQ -8(SI)(R8*1), SI
14511 MOVQ R9, (AX)
14512 MOVQ SI, -8(AX)(R8*1)
14513 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14514
14515emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
14516 MOVOU (SI), X0
14517 MOVOU -16(SI)(R8*1), X1
14518 MOVOU X0, (AX)
14519 MOVOU X1, -16(AX)(R8*1)
14520 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14521
14522emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
14523 MOVOU (SI), X0
14524 MOVOU 16(SI), X1
14525 MOVOU -32(SI)(R8*1), X2
14526 MOVOU -16(SI)(R8*1), X3
14527 MOVOU X0, (AX)
14528 MOVOU X1, 16(AX)
14529 MOVOU X2, -32(AX)(R8*1)
14530 MOVOU X3, -16(AX)(R8*1)
14531
14532memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
14533 MOVQ DI, AX
14534 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
14535
14536memmove_long_match_emit_encodeSnappyBlockAsm8B:
14537 LEAQ (AX)(R8*1), DI
14538
14539 // genMemMoveLong
14540 MOVOU (SI), X0
14541 MOVOU 16(SI), X1
14542 MOVOU -32(SI)(R8*1), X2
14543 MOVOU -16(SI)(R8*1), X3
14544 MOVQ R8, R10
14545 SHRQ $0x05, R10
14546 MOVQ AX, R9
14547 ANDL $0x0000001f, R9
14548 MOVQ $0x00000040, R11
14549 SUBQ R9, R11
14550 DECQ R10
14551 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14552 LEAQ -32(SI)(R11*1), R9
14553 LEAQ -32(AX)(R11*1), R12
14554
14555emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
14556 MOVOU (R9), X4
14557 MOVOU 16(R9), X5
14558 MOVOA X4, (R12)
14559 MOVOA X5, 16(R12)
14560 ADDQ $0x20, R12
14561 ADDQ $0x20, R9
14562 ADDQ $0x20, R11
14563 DECQ R10
14564 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
14565
14566emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14567 MOVOU -32(SI)(R11*1), X4
14568 MOVOU -16(SI)(R11*1), X5
14569 MOVOA X4, -32(AX)(R11*1)
14570 MOVOA X5, -16(AX)(R11*1)
14571 ADDQ $0x20, R11
14572 CMPQ R8, R11
14573 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14574 MOVOU X0, (AX)
14575 MOVOU X1, 16(AX)
14576 MOVOU X2, -32(AX)(R8*1)
14577 MOVOU X3, -16(AX)(R8*1)
14578 MOVQ DI, AX
14579
14580emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
14581match_nolit_loop_encodeSnappyBlockAsm8B:
14582 MOVL CX, SI
14583 SUBL BX, SI
14584 MOVL SI, 16(SP)
14585 ADDL $0x04, CX
14586 ADDL $0x04, BX
14587 MOVQ src_len+32(FP), SI
14588 SUBL CX, SI
14589 LEAQ (DX)(CX*1), DI
14590 LEAQ (DX)(BX*1), BX
14591
14592 // matchLen
14593 XORL R9, R9
14594
14595matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B:
14596 CMPL SI, $0x10
14597 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B
14598 MOVQ (DI)(R9*1), R8
14599 MOVQ 8(DI)(R9*1), R10
14600 XORQ (BX)(R9*1), R8
14601 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
14602 XORQ 8(BX)(R9*1), R10
14603 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B
14604 LEAL -16(SI), SI
14605 LEAL 16(R9), R9
14606 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B
14607
14608matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B:
14609#ifdef GOAMD64_v3
14610 TZCNTQ R10, R10
14611
14612#else
14613 BSFQ R10, R10
14614
14615#endif
14616 SARQ $0x03, R10
14617 LEAL 8(R9)(R10*1), R9
14618 JMP match_nolit_end_encodeSnappyBlockAsm8B
14619
14620matchlen_match8_match_nolit_encodeSnappyBlockAsm8B:
14621 CMPL SI, $0x08
14622 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
14623 MOVQ (DI)(R9*1), R8
14624 XORQ (BX)(R9*1), R8
14625 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
14626 LEAL -8(SI), SI
14627 LEAL 8(R9), R9
14628 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
14629
14630matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B:
14631#ifdef GOAMD64_v3
14632 TZCNTQ R8, R8
14633
14634#else
14635 BSFQ R8, R8
14636
14637#endif
14638 SARQ $0x03, R8
14639 LEAL (R9)(R8*1), R9
14640 JMP match_nolit_end_encodeSnappyBlockAsm8B
14641
14642matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
14643 CMPL SI, $0x04
14644 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
14645 MOVL (DI)(R9*1), R8
14646 CMPL (BX)(R9*1), R8
14647 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
14648 LEAL -4(SI), SI
14649 LEAL 4(R9), R9
14650
14651matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
14652 CMPL SI, $0x01
14653 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
14654 JB match_nolit_end_encodeSnappyBlockAsm8B
14655 MOVW (DI)(R9*1), R8
14656 CMPW (BX)(R9*1), R8
14657 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
14658 LEAL 2(R9), R9
14659 SUBL $0x02, SI
14660 JZ match_nolit_end_encodeSnappyBlockAsm8B
14661
14662matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
14663 MOVB (DI)(R9*1), R8
14664 CMPB (BX)(R9*1), R8
14665 JNE match_nolit_end_encodeSnappyBlockAsm8B
14666 LEAL 1(R9), R9
14667
14668match_nolit_end_encodeSnappyBlockAsm8B:
14669 ADDL R9, CX
14670 MOVL 16(SP), BX
14671 ADDL $0x04, R9
14672 MOVL CX, 12(SP)
14673
14674 // emitCopy
14675two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
14676 CMPL R9, $0x40
14677 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
14678 MOVB $0xee, (AX)
14679 MOVW BX, 1(AX)
14680 LEAL -60(R9), R9
14681 ADDQ $0x03, AX
14682 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
14683
14684two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
14685 MOVL R9, SI
14686 SHLL $0x02, SI
14687 CMPL R9, $0x0c
14688 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
14689 LEAL -15(SI), SI
14690 MOVB BL, 1(AX)
14691 SHRL $0x08, BX
14692 SHLL $0x05, BX
14693 ORL BX, SI
14694 MOVB SI, (AX)
14695 ADDQ $0x02, AX
14696 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
14697
14698emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
14699 LEAL -2(SI), SI
14700 MOVB SI, (AX)
14701 MOVW BX, 1(AX)
14702 ADDQ $0x03, AX
14703
14704match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
14705 CMPL CX, 8(SP)
14706 JAE emit_remainder_encodeSnappyBlockAsm8B
14707 MOVQ -2(DX)(CX*1), SI
14708 CMPQ AX, (SP)
14709 JB match_nolit_dst_ok_encodeSnappyBlockAsm8B
14710 MOVQ $0x00000000, ret+48(FP)
14711 RET
14712
14713match_nolit_dst_ok_encodeSnappyBlockAsm8B:
14714 MOVQ $0x9e3779b1, R8
14715 MOVQ SI, DI
14716 SHRQ $0x10, SI
14717 MOVQ SI, BX
14718 SHLQ $0x20, DI
14719 IMULQ R8, DI
14720 SHRQ $0x38, DI
14721 SHLQ $0x20, BX
14722 IMULQ R8, BX
14723 SHRQ $0x38, BX
14724 LEAL -2(CX), R8
14725 LEAQ 24(SP)(BX*4), R9
14726 MOVL (R9), BX
14727 MOVL R8, 24(SP)(DI*4)
14728 MOVL CX, (R9)
14729 CMPL (DX)(BX*1), SI
14730 JEQ match_nolit_loop_encodeSnappyBlockAsm8B
14731 INCL CX
14732 JMP search_loop_encodeSnappyBlockAsm8B
14733
14734emit_remainder_encodeSnappyBlockAsm8B:
14735 MOVQ src_len+32(FP), CX
14736 SUBL 12(SP), CX
14737 LEAQ 3(AX)(CX*1), CX
14738 CMPQ CX, (SP)
14739 JB emit_remainder_ok_encodeSnappyBlockAsm8B
14740 MOVQ $0x00000000, ret+48(FP)
14741 RET
14742
14743emit_remainder_ok_encodeSnappyBlockAsm8B:
14744 MOVQ src_len+32(FP), CX
14745 MOVL 12(SP), BX
14746 CMPL BX, CX
14747 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
14748 MOVL CX, SI
14749 MOVL CX, 12(SP)
14750 LEAQ (DX)(BX*1), CX
14751 SUBL BX, SI
14752 LEAL -1(SI), DX
14753 CMPL DX, $0x3c
14754 JB one_byte_emit_remainder_encodeSnappyBlockAsm8B
14755 CMPL DX, $0x00000100
14756 JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B
14757 JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B
14758
14759three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
14760 MOVB $0xf4, (AX)
14761 MOVW DX, 1(AX)
14762 ADDQ $0x03, AX
14763 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
14764
14765two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
14766 MOVB $0xf0, (AX)
14767 MOVB DL, 1(AX)
14768 ADDQ $0x02, AX
14769 CMPL DX, $0x40
14770 JB memmove_emit_remainder_encodeSnappyBlockAsm8B
14771 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
14772
14773one_byte_emit_remainder_encodeSnappyBlockAsm8B:
14774 SHLB $0x02, DL
14775 MOVB DL, (AX)
14776 ADDQ $0x01, AX
14777
14778memmove_emit_remainder_encodeSnappyBlockAsm8B:
14779 LEAQ (AX)(SI*1), DX
14780 MOVL SI, BX
14781
14782 // genMemMoveShort
14783 CMPQ BX, $0x03
14784 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
14785 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
14786 CMPQ BX, $0x08
14787 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
14788 CMPQ BX, $0x10
14789 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
14790 CMPQ BX, $0x20
14791 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
14792 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
14793
14794emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
14795 MOVB (CX), SI
14796 MOVB -1(CX)(BX*1), CL
14797 MOVB SI, (AX)
14798 MOVB CL, -1(AX)(BX*1)
14799 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14800
14801emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
14802 MOVW (CX), SI
14803 MOVB 2(CX), CL
14804 MOVW SI, (AX)
14805 MOVB CL, 2(AX)
14806 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14807
14808emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
14809 MOVL (CX), SI
14810 MOVL -4(CX)(BX*1), CX
14811 MOVL SI, (AX)
14812 MOVL CX, -4(AX)(BX*1)
14813 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14814
14815emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
14816 MOVQ (CX), SI
14817 MOVQ -8(CX)(BX*1), CX
14818 MOVQ SI, (AX)
14819 MOVQ CX, -8(AX)(BX*1)
14820 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14821
14822emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
14823 MOVOU (CX), X0
14824 MOVOU -16(CX)(BX*1), X1
14825 MOVOU X0, (AX)
14826 MOVOU X1, -16(AX)(BX*1)
14827 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14828
14829emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
14830 MOVOU (CX), X0
14831 MOVOU 16(CX), X1
14832 MOVOU -32(CX)(BX*1), X2
14833 MOVOU -16(CX)(BX*1), X3
14834 MOVOU X0, (AX)
14835 MOVOU X1, 16(AX)
14836 MOVOU X2, -32(AX)(BX*1)
14837 MOVOU X3, -16(AX)(BX*1)
14838
14839memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
14840 MOVQ DX, AX
14841 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
14842
14843memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
14844 LEAQ (AX)(SI*1), DX
14845 MOVL SI, BX
14846
14847 // genMemMoveLong
14848 MOVOU (CX), X0
14849 MOVOU 16(CX), X1
14850 MOVOU -32(CX)(BX*1), X2
14851 MOVOU -16(CX)(BX*1), X3
14852 MOVQ BX, DI
14853 SHRQ $0x05, DI
14854 MOVQ AX, SI
14855 ANDL $0x0000001f, SI
14856 MOVQ $0x00000040, R8
14857 SUBQ SI, R8
14858 DECQ DI
14859 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14860 LEAQ -32(CX)(R8*1), SI
14861 LEAQ -32(AX)(R8*1), R9
14862
14863emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
14864 MOVOU (SI), X4
14865 MOVOU 16(SI), X5
14866 MOVOA X4, (R9)
14867 MOVOA X5, 16(R9)
14868 ADDQ $0x20, R9
14869 ADDQ $0x20, SI
14870 ADDQ $0x20, R8
14871 DECQ DI
14872 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
14873
14874emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14875 MOVOU -32(CX)(R8*1), X4
14876 MOVOU -16(CX)(R8*1), X5
14877 MOVOA X4, -32(AX)(R8*1)
14878 MOVOA X5, -16(AX)(R8*1)
14879 ADDQ $0x20, R8
14880 CMPQ BX, R8
14881 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14882 MOVOU X0, (AX)
14883 MOVOU X1, 16(AX)
14884 MOVOU X2, -32(AX)(BX*1)
14885 MOVOU X3, -16(AX)(BX*1)
14886 MOVQ DX, AX
14887
14888emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
14889 MOVQ dst_base+0(FP), CX
14890 SUBQ CX, AX
14891 MOVQ AX, ret+48(FP)
14892 RET
14893
14894// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
14895// Requires: BMI, SSE2
//
// Compresses src into dst and returns the number of bytes written to dst.
// Returns 0 when the output would reach the limit pointer kept at (SP),
// which is dst_base + len(src) - 9 - len(src)>>5.
//
// Output tags emitted below:
//   literals: (n-1)<<2 for n <= 60, otherwise 0xf0/0xf4/0xf8/0xfc followed
//             by n-1 stored in 1/2/3/4 bytes, then the n literal bytes.
//   copies:   2 bytes (length 4-11, offset < 2048), 3 bytes (16-bit offset)
//             or 5 bytes (32-bit offset).
//
// Register use until emit_remainder:
//   AX  dst write cursor
//   CX  s, the current index into src
//   DX  src base pointer
//
// Stack frame:
//   (SP)        dst limit pointer (64-bit)
//   8(SP)       sLimit = len(src) - 8
//   12(SP)      nextEmit: index of the first src byte not yet written out
//   16(SP)      offset of the last accepted match (written, never read here)
//   20(SP)      nextS: position to continue from after a failed probe
//   24(SP)      long table, 1<<17 uint32 entries, indexed by a 7-byte hash
//   524312(SP)  short table, 1<<14 uint32 entries, indexed by a 4-byte hash
14896TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
14897 MOVQ dst_base+0(FP), AX // AX = dst write cursor
14898 MOVQ $0x00001200, CX // 0x1200 iterations * 128 bytes = 589824 bytes (both tables)
14899 LEAQ 24(SP), DX // tables begin after the 24 bytes of scalar slots
14900 PXOR X0, X0
14901

// Zero both hash tables, 128 bytes per iteration.
14902zero_loop_encodeSnappyBetterBlockAsm:
14903 MOVOU X0, (DX)
14904 MOVOU X0, 16(DX)
14905 MOVOU X0, 32(DX)
14906 MOVOU X0, 48(DX)
14907 MOVOU X0, 64(DX)
14908 MOVOU X0, 80(DX)
14909 MOVOU X0, 96(DX)
14910 MOVOU X0, 112(DX)
14911 ADDQ $0x80, DX
14912 DECQ CX
14913 JNZ zero_loop_encodeSnappyBetterBlockAsm
14914 MOVL $0x00000000, 12(SP) // nextEmit = 0
14915 MOVQ src_len+32(FP), CX
14916 LEAQ -9(CX), DX
14917 LEAQ -8(CX), BX
14918 MOVL BX, 8(SP) // sLimit = len(src) - 8
14919 SHRQ $0x05, CX
14920 SUBL CX, DX // DX = len(src) - 9 - len(src)>>5
14921 LEAQ (AX)(DX*1), DX
14922 MOVQ DX, (SP) // dst limit pointer
14923 MOVL $0x00000001, CX // s = 1
14924 MOVL $0x00000000, 16(SP)
14925 MOVQ src_base+24(FP), DX // DX = src base
14926

// Main loop: probe both tables at s. The step to the next probe position
// grows with the distance from nextEmit: (s-nextEmit)>>7, capped at 99.
14927search_loop_encodeSnappyBetterBlockAsm:
14928 MOVL CX, BX
14929 SUBL 12(SP), BX
14930 SHRL $0x07, BX // BX = (s - nextEmit) >> 7
14931 CMPL BX, $0x63
14932 JBE check_maxskip_ok_encodeSnappyBetterBlockAsm
14933 LEAL 100(CX), BX // skip capped: nextS = s + 100
14934 JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
14935
14936check_maxskip_ok_encodeSnappyBetterBlockAsm:
14937 LEAL 1(CX)(BX*1), BX // nextS = s + 1 + skip
14938
14939check_maxskip_cont_encodeSnappyBetterBlockAsm:
14940 CMPL BX, 8(SP)
14941 JAE emit_remainder_encodeSnappyBetterBlockAsm // nextS >= sLimit: stop searching
14942 MOVQ (DX)(CX*1), SI // SI = 8 bytes at src[s]
14943 MOVL BX, 20(SP) // save nextS
14944 MOVQ $0x00cf1bbcdcbfa563, R8 // multiplier for the 7-byte (long) hash
14945 MOVQ $0x9e3779b1, BX // multiplier for the 4-byte (short) hash
14946 MOVQ SI, R9
14947 MOVQ SI, R10
14948 SHLQ $0x08, R9 // drop the top byte, keeping the low 7 bytes
14949 IMULQ R8, R9
14950 SHRQ $0x2f, R9 // keep the top 17 bits: long table index
14951 SHLQ $0x20, R10 // keep only the low 4 bytes
14952 IMULQ BX, R10
14953 SHRQ $0x32, R10 // keep the top 14 bits: short table index
14954 MOVL 24(SP)(R9*4), BX // BX = long candidate
14955 MOVL 524312(SP)(R10*4), DI // DI = short candidate
14956 MOVL CX, 24(SP)(R9*4) // record s in both tables
14957 MOVL CX, 524312(SP)(R10*4)
14958 MOVQ (DX)(BX*1), R9 // 8 bytes at the long candidate
14959 MOVQ (DX)(DI*1), R10 // 8 bytes at the short candidate
14960 CMPQ R9, SI
14961 JEQ candidate_match_encodeSnappyBetterBlockAsm // long candidate matches all 8 bytes
14962 CMPQ R10, SI
14963 JNE no_short_found_encodeSnappyBetterBlockAsm
14964 MOVL DI, BX // short candidate matches all 8 bytes: use it
14965 JMP candidate_match_encodeSnappyBetterBlockAsm
14966

// No 8-byte match: fall back to 4-byte comparisons.
14967no_short_found_encodeSnappyBetterBlockAsm:
14968 CMPL R9, SI
14969 JEQ candidate_match_encodeSnappyBetterBlockAsm
14970 CMPL R10, SI
14971 JEQ candidateS_match_encodeSnappyBetterBlockAsm
14972 MOVL 20(SP), CX // nothing found: s = nextS
14973 JMP search_loop_encodeSnappyBetterBlockAsm
14974

// The short candidate matched 4 bytes. Before accepting it, probe the long
// table at s+1 and prefer that candidate if its first 4 bytes match.
14975candidateS_match_encodeSnappyBetterBlockAsm:
14976 SHRQ $0x08, SI // SI = bytes starting at src[s+1]
14977 MOVQ SI, R9
14978 SHLQ $0x08, R9
14979 IMULQ R8, R9
14980 SHRQ $0x2f, R9
14981 MOVL 24(SP)(R9*4), BX
14982 INCL CX // s++
14983 MOVL CX, 24(SP)(R9*4)
14984 CMPL (DX)(BX*1), SI
14985 JEQ candidate_match_encodeSnappyBetterBlockAsm
14986 DECL CX // no long match at s+1: restore s and use the short candidate
14987 MOVL DI, BX
14988

// BX = candidate index, CX = s. Extend the match backwards while the
// preceding bytes are equal, s > nextEmit and the candidate index is > 0.
14989candidate_match_encodeSnappyBetterBlockAsm:
14990 MOVL 12(SP), SI // SI = nextEmit
14991 TESTL BX, BX
14992 JZ match_extend_back_end_encodeSnappyBetterBlockAsm
14993
14994match_extend_back_loop_encodeSnappyBetterBlockAsm:
14995 CMPL CX, SI
14996 JBE match_extend_back_end_encodeSnappyBetterBlockAsm
14997 MOVB -1(DX)(BX*1), DI
14998 MOVB -1(DX)(CX*1), R8
14999 CMPB DI, R8
15000 JNE match_extend_back_end_encodeSnappyBetterBlockAsm
15001 LEAL -1(CX), CX
15002 DECL BX
15003 JZ match_extend_back_end_encodeSnappyBetterBlockAsm
15004 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
15005

// Bail out with 0 if the pending literals plus 5 bytes would reach the
// dst limit.
15006match_extend_back_end_encodeSnappyBetterBlockAsm:
15007 MOVL CX, SI
15008 SUBL 12(SP), SI
15009 LEAQ 5(AX)(SI*1), SI // AX + 5 + (s - nextEmit)
15010 CMPQ SI, (SP)
15011 JB match_dst_size_check_encodeSnappyBetterBlockAsm
15012 MOVQ $0x00000000, ret+48(FP)
15013 RET
15014
15015match_dst_size_check_encodeSnappyBetterBlockAsm:
15016 MOVL CX, SI // SI = start of the match in src
15017 ADDL $0x04, CX // skip the 4 bytes already compared
15018 ADDL $0x04, BX
15019 MOVQ src_len+32(FP), DI
15020 SUBL CX, DI // DI = bytes left in src after s
15021 LEAQ (DX)(CX*1), R8 // R8 = &src[s]
15022 LEAQ (DX)(BX*1), R9 // R9 = &src[candidate]
15023
15024 // matchLen
// R11 counts equal bytes at R8/R9, comparing 16, 8, 4, 2 and 1 bytes at a
// time while DI bytes remain.
15025 XORL R11, R11
15026
15027matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm:
15028 CMPL DI, $0x10
15029 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm
15030 MOVQ (R8)(R11*1), R10
15031 MOVQ 8(R8)(R11*1), R12
15032 XORQ (R9)(R11*1), R10 // non-zero when the first 8 bytes differ
15033 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
15034 XORQ 8(R9)(R11*1), R12 // non-zero when the second 8 bytes differ
15035 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm
15036 LEAL -16(DI), DI
15037 LEAL 16(R11), R11
15038 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm
15039

// Mismatch in the second qword: lowest set bit / 8 = number of equal bytes.
15040matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm:
15041#ifdef GOAMD64_v3
15042 TZCNTQ R12, R12
15043
15044#else
15045 BSFQ R12, R12
15046
15047#endif
15048 SARQ $0x03, R12
15049 LEAL 8(R11)(R12*1), R11
15050 JMP match_nolit_end_encodeSnappyBetterBlockAsm
15051
15052matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm:
15053 CMPL DI, $0x08
15054 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
15055 MOVQ (R8)(R11*1), R10
15056 XORQ (R9)(R11*1), R10
15057 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
15058 LEAL -8(DI), DI
15059 LEAL 8(R11), R11
15060 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
15061

// Mismatch in the first qword: lowest set bit / 8 = number of equal bytes.
15062matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm:
15063#ifdef GOAMD64_v3
15064 TZCNTQ R10, R10
15065
15066#else
15067 BSFQ R10, R10
15068
15069#endif
15070 SARQ $0x03, R10
15071 LEAL (R11)(R10*1), R11
15072 JMP match_nolit_end_encodeSnappyBetterBlockAsm
15073
15074matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
15075 CMPL DI, $0x04
15076 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
15077 MOVL (R8)(R11*1), R10
15078 CMPL (R9)(R11*1), R10
15079 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
15080 LEAL -4(DI), DI
15081 LEAL 4(R11), R11
15082
15083matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
15084 CMPL DI, $0x01
15085 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
15086 JB match_nolit_end_encodeSnappyBetterBlockAsm // no bytes left
15087 MOVW (R8)(R11*1), R10
15088 CMPW (R9)(R11*1), R10
15089 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
15090 LEAL 2(R11), R11
15091 SUBL $0x02, DI
15092 JZ match_nolit_end_encodeSnappyBetterBlockAsm
15093
15094matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
15095 MOVB (R8)(R11*1), R10
15096 CMPB (R9)(R11*1), R10
15097 JNE match_nolit_end_encodeSnappyBetterBlockAsm
15098 LEAL 1(R11), R11
15099
15100match_nolit_end_encodeSnappyBetterBlockAsm:
15101 MOVL CX, DI
15102 SUBL BX, DI // DI = match offset (s - candidate)
15103
15104 // Check if repeat
// What the code below does: a match that extended by at most 1 byte past
// the first 4 and has an offset above 0xffff is dropped, and the search
// resumes at nextS + 1.
15105 CMPL R11, $0x01
15106 JA match_length_ok_encodeSnappyBetterBlockAsm
15107 CMPL DI, $0x0000ffff
15108 JBE match_length_ok_encodeSnappyBetterBlockAsm
15109 MOVL 20(SP), CX
15110 INCL CX
15111 JMP search_loop_encodeSnappyBetterBlockAsm
15112

// Match accepted. First write src[nextEmit:matchStart] as a literal run
// (skipped when empty), then the copy.
15113match_length_ok_encodeSnappyBetterBlockAsm:
15114 MOVL DI, 16(SP) // remember the offset
15115 MOVL 12(SP), BX
15116 CMPL BX, SI
15117 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm // no pending literals
15118 MOVL SI, R8
15119 MOVL SI, 12(SP)
15120 LEAQ (DX)(BX*1), R9 // R9 = first literal byte
15121 SUBL BX, R8 // R8 = literal length n
15122 LEAL -1(R8), BX // BX = n - 1, the value stored in the tag
15123 CMPL BX, $0x3c
15124 JB one_byte_match_emit_encodeSnappyBetterBlockAsm
15125 CMPL BX, $0x00000100
15126 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm
15127 CMPL BX, $0x00010000
15128 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm
15129 CMPL BX, $0x01000000
15130 JB four_bytes_match_emit_encodeSnappyBetterBlockAsm
15131 MOVB $0xfc, (AX) // tag 63<<2, followed by a 32-bit n-1
15132 MOVL BX, 1(AX)
15133 ADDQ $0x05, AX
15134 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15135
15136four_bytes_match_emit_encodeSnappyBetterBlockAsm:
15137 MOVL BX, R10
15138 SHRL $0x10, R10
15139 MOVB $0xf8, (AX) // tag 62<<2, followed by a 24-bit n-1
15140 MOVW BX, 1(AX)
15141 MOVB R10, 3(AX)
15142 ADDQ $0x04, AX
15143 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15144
15145three_bytes_match_emit_encodeSnappyBetterBlockAsm:
15146 MOVB $0xf4, (AX) // tag 61<<2, followed by a 16-bit n-1
15147 MOVW BX, 1(AX)
15148 ADDQ $0x03, AX
15149 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15150
15151two_bytes_match_emit_encodeSnappyBetterBlockAsm:
15152 MOVB $0xf0, (AX) // tag 60<<2, followed by an 8-bit n-1
15153 MOVB BL, 1(AX)
15154 ADDQ $0x02, AX
15155 CMPL BX, $0x40
15156 JB memmove_match_emit_encodeSnappyBetterBlockAsm // n <= 64: short copy
15157 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15158
15159one_byte_match_emit_encodeSnappyBetterBlockAsm:
15160 SHLB $0x02, BL // tag = (n-1)<<2
15161 MOVB BL, (AX)
15162 ADDQ $0x01, AX
15163

// Short literal copy, n <= 64. BX holds the exact end of the literals and
// becomes the new AX afterwards.
15164memmove_match_emit_encodeSnappyBetterBlockAsm:
15165 LEAQ (AX)(R8*1), BX
15166
15167 // genMemMoveShort
15168 CMPQ R8, $0x08
15169 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
15170 CMPQ R8, $0x10
15171 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
15172 CMPQ R8, $0x20
15173 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
15174 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
15175

// n <= 8: always loads and stores a full 8 bytes, even when n < 8.
// NOTE(review): presumably safe because later output overwrites the extra
// bytes and src has at least 8 readable bytes here; confirm against the generator.
15176emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
15177 MOVQ (R9), R10
15178 MOVQ R10, (AX)
15179 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15180

// 9-16 bytes: two 8-byte moves, the second anchored at the end.
15181emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
15182 MOVQ (R9), R10
15183 MOVQ -8(R9)(R8*1), R9
15184 MOVQ R10, (AX)
15185 MOVQ R9, -8(AX)(R8*1)
15186 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15187

// 17-32 bytes: two 16-byte moves, the second anchored at the end.
15188emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
15189 MOVOU (R9), X0
15190 MOVOU -16(R9)(R8*1), X1
15191 MOVOU X0, (AX)
15192 MOVOU X1, -16(AX)(R8*1)
15193 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15194

// 33-64 bytes: first 32 and last 32 bytes, overlapping in the middle.
15195emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
15196 MOVOU (R9), X0
15197 MOVOU 16(R9), X1
15198 MOVOU -32(R9)(R8*1), X2
15199 MOVOU -16(R9)(R8*1), X3
15200 MOVOU X0, (AX)
15201 MOVOU X1, 16(AX)
15202 MOVOU X2, -32(AX)(R8*1)
15203 MOVOU X3, -16(AX)(R8*1)
15204
15205memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
15206 MOVQ BX, AX // advance dst past the literals
15207 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
15208

// Long literal copy. The first and last 32 bytes are loaded up front
// (X0-X3) and stored last with unaligned moves; the middle is copied in
// 32-byte steps using aligned stores (MOVOA).
15209memmove_long_match_emit_encodeSnappyBetterBlockAsm:
15210 LEAQ (AX)(R8*1), BX // BX = end of literals in dst
15211
15212 // genMemMoveLong
15213 MOVOU (R9), X0
15214 MOVOU 16(R9), X1
15215 MOVOU -32(R9)(R8*1), X2
15216 MOVOU -16(R9)(R8*1), X3
15217 MOVQ R8, R12
15218 SHRQ $0x05, R12 // R12 = n / 32
15219 MOVQ AX, R10
15220 ANDL $0x0000001f, R10 // R10 = AX mod 32
15221 MOVQ $0x00000040, R13
15222 SUBQ R10, R13 // R13 = 64 - (AX mod 32), so AX+R13-32 is 32-byte aligned
15223 DECQ R12
// NOTE(review): DECQ does not modify CF, so JA/JNA below depend on CF left
// by the preceding SUBQ/ADDQ plus ZF from DECQ; confirm intent against the generator.
15224 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15225 LEAQ -32(R9)(R13*1), R10
15226 LEAQ -32(AX)(R13*1), R14
15227
15228emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
15229 MOVOU (R10), X4
15230 MOVOU 16(R10), X5
15231 MOVOA X4, (R14)
15232 MOVOA X5, 16(R14)
15233 ADDQ $0x20, R14
15234 ADDQ $0x20, R10
15235 ADDQ $0x20, R13
15236 DECQ R12
15237 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
15238

// Copy 32 bytes per iteration at offset R13-32 while n >= R13.
15239emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
15240 MOVOU -32(R9)(R13*1), X4
15241 MOVOU -16(R9)(R13*1), X5
15242 MOVOA X4, -32(AX)(R13*1)
15243 MOVOA X5, -16(AX)(R13*1)
15244 ADDQ $0x20, R13
15245 CMPQ R8, R13
15246 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15247 MOVOU X0, (AX) // finally store the saved head and tail
15248 MOVOU X1, 16(AX)
15249 MOVOU X2, -32(AX)(R8*1)
15250 MOVOU X3, -16(AX)(R8*1)
15251 MOVQ BX, AX
15252

// Literals done. Advance s to the end of the match and emit the copy.
15253emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
15254 ADDL R11, CX // s += extra matched bytes
15255 ADDL $0x04, R11 // R11 = full match length
15256 MOVL CX, 12(SP) // nextEmit = s
15257
15258 // emitCopy
15259 CMPL DI, $0x00010000
15260 JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
15261

// Offset >= 65536: 5-byte copies (tag + 32-bit offset), 64 bytes at a time.
15262four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
15263 CMPL R11, $0x40
15264 JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
15265 MOVB $0xff, (AX) // (64-1)<<2 | 3
15266 MOVL DI, 1(AX)
15267 LEAL -64(R11), R11
15268 ADDQ $0x05, AX
15269 CMPL R11, $0x04
15270 JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
15271 JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
15272
15273four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
15274 TESTL R11, R11
15275 JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15276 XORL BX, BX
15277 LEAL -1(BX)(R11*4), R11 // tag = 4*length - 1 = (length-1)<<2 | 3
15278 MOVB R11, (AX)
15279 MOVL DI, 1(AX)
15280 ADDQ $0x05, AX
15281 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15282

// Offset < 65536: while more than 64 bytes remain, emit 3-byte copies of
// 60 bytes each.
15283two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
15284 CMPL R11, $0x40
15285 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
15286 MOVB $0xee, (AX) // (60-1)<<2 | 2
15287 MOVW DI, 1(AX)
15288 LEAL -60(R11), R11
15289 ADDQ $0x03, AX
15290 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
15291

// Final piece (<= 64 bytes): 2-byte form when length < 12 and offset < 2048,
// otherwise the 3-byte form.
15292two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
15293 MOVL R11, BX
15294 SHLL $0x02, BX // BX = 4*length
15295 CMPL R11, $0x0c
15296 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
15297 CMPL DI, $0x00000800
15298 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
15299 LEAL -15(BX), BX // 4*length - 15 = (length-4)<<2 | 1
15300 MOVB DI, 1(AX) // low 8 bits of the offset
15301 SHRL $0x08, DI
15302 SHLL $0x05, DI // high offset bits go into tag bits 5-7
15303 ORL DI, BX
15304 MOVB BL, (AX)
15305 ADDQ $0x02, AX
15306 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15307
15308emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
15309 LEAL -2(BX), BX // 4*length - 2 = (length-1)<<2 | 2
15310 MOVB BL, (AX)
15311 MOVW DI, 1(AX)
15312 ADDQ $0x03, AX
15313

// After the copy: finish if s reached sLimit, fail with 0 if dst reached
// its limit, otherwise index positions inside the match and keep searching.
15314match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
15315 CMPL CX, 8(SP)
15316 JAE emit_remainder_encodeSnappyBetterBlockAsm
15317 CMPQ AX, (SP)
15318 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm
15319 MOVQ $0x00000000, ret+48(FP)
15320 RET
15321

// Index matchStart+1 and s-2 in the long table, and matchStart+2 and s-1
// in the short table.
15322match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
15323 MOVQ $0x00cf1bbcdcbfa563, BX
15324 MOVQ $0x9e3779b1, DI
15325 LEAQ 1(SI), SI // SI = matchStart + 1
15326 LEAQ -2(CX), R8 // R8 = s - 2
15327 MOVQ (DX)(SI*1), R9
15328 MOVQ 1(DX)(SI*1), R10
15329 MOVQ (DX)(R8*1), R11
15330 MOVQ 1(DX)(R8*1), R12
15331 SHLQ $0x08, R9
15332 IMULQ BX, R9
15333 SHRQ $0x2f, R9
15334 SHLQ $0x20, R10
15335 IMULQ DI, R10
15336 SHRQ $0x32, R10
15337 SHLQ $0x08, R11
15338 IMULQ BX, R11
15339 SHRQ $0x2f, R11
15340 SHLQ $0x20, R12
15341 IMULQ DI, R12
15342 SHRQ $0x32, R12
15343 LEAQ 1(SI), DI
15344 LEAQ 1(R8), R13
15345 MOVL SI, 24(SP)(R9*4)
15346 MOVL R8, 24(SP)(R11*4)
15347 MOVL DI, 524312(SP)(R10*4)
15348 MOVL R13, 524312(SP)(R12*4)
15349 LEAQ 1(R8)(SI*1), DI
15350 SHRQ $0x01, DI // DI = midpoint of the two indexed positions
15351 ADDQ $0x01, SI
15352 SUBQ $0x01, R8
15353

// Add further long-table entries from two cursors (SI and DI), each
// stepping by 2, until DI reaches R8.
15354index_loop_encodeSnappyBetterBlockAsm:
15355 CMPQ DI, R8
15356 JAE search_loop_encodeSnappyBetterBlockAsm
15357 MOVQ (DX)(SI*1), R9
15358 MOVQ (DX)(DI*1), R10
15359 SHLQ $0x08, R9
15360 IMULQ BX, R9
15361 SHRQ $0x2f, R9
15362 SHLQ $0x08, R10
15363 IMULQ BX, R10
15364 SHRQ $0x2f, R10
15365 MOVL SI, 24(SP)(R9*4)
15366 MOVL DI, 24(SP)(R10*4)
15367 ADDQ $0x02, SI
15368 ADDQ $0x02, DI
15369 JMP index_loop_encodeSnappyBetterBlockAsm
15370

// Write src[nextEmit:] as a final literal run, or return 0 if that would
// reach the dst limit.
15371emit_remainder_encodeSnappyBetterBlockAsm:
15372 MOVQ src_len+32(FP), CX
15373 SUBL 12(SP), CX
15374 LEAQ 5(AX)(CX*1), CX // AX + 5 + remaining literal bytes
15375 CMPQ CX, (SP)
15376 JB emit_remainder_ok_encodeSnappyBetterBlockAsm
15377 MOVQ $0x00000000, ret+48(FP)
15378 RET
15379
15380emit_remainder_ok_encodeSnappyBetterBlockAsm:
15381 MOVQ src_len+32(FP), CX
15382 MOVL 12(SP), BX
15383 CMPL BX, CX
15384 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm // nothing left to emit
15385 MOVL CX, SI
15386 MOVL CX, 12(SP)
15387 LEAQ (DX)(BX*1), CX // CX = first literal byte
15388 SUBL BX, SI // SI = literal length n
15389 LEAL -1(SI), DX // DX = n - 1 (DX no longer holds the src base)
15390 CMPL DX, $0x3c
15391 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm
15392 CMPL DX, $0x00000100
15393 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15394 CMPL DX, $0x00010000
15395 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15396 CMPL DX, $0x01000000
15397 JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15398 MOVB $0xfc, (AX) // tag 63<<2, followed by a 32-bit n-1
15399 MOVL DX, 1(AX)
15400 ADDQ $0x05, AX
15401 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15402
15403four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15404 MOVL DX, BX
15405 SHRL $0x10, BX
15406 MOVB $0xf8, (AX) // tag 62<<2, followed by a 24-bit n-1
15407 MOVW DX, 1(AX)
15408 MOVB BL, 3(AX)
15409 ADDQ $0x04, AX
15410 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15411
15412three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15413 MOVB $0xf4, (AX) // tag 61<<2, followed by a 16-bit n-1
15414 MOVW DX, 1(AX)
15415 ADDQ $0x03, AX
15416 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15417
15418two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15419 MOVB $0xf0, (AX) // tag 60<<2, followed by an 8-bit n-1
15420 MOVB DL, 1(AX)
15421 ADDQ $0x02, AX
15422 CMPL DX, $0x40
15423 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm // n <= 64: short copy
15424 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15425
15426one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
15427 SHLB $0x02, DL // tag = (n-1)<<2
15428 MOVB DL, (AX)
15429 ADDQ $0x01, AX
15430

// Short copy of the trailing literals, n <= 64. Every size class here
// moves exactly n bytes: 1-2, 3, 4-7, 8-16, 17-32, 33-64.
15431memmove_emit_remainder_encodeSnappyBetterBlockAsm:
15432 LEAQ (AX)(SI*1), DX // DX = end of literals in dst
15433 MOVL SI, BX // BX = n
15434
15435 // genMemMoveShort
15436 CMPQ BX, $0x03
15437 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
15438 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
15439 CMPQ BX, $0x08
15440 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
15441 CMPQ BX, $0x10
15442 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
15443 CMPQ BX, $0x20
15444 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
15445 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
15446
15447emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
15448 MOVB (CX), SI
15449 MOVB -1(CX)(BX*1), CL
15450 MOVB SI, (AX)
15451 MOVB CL, -1(AX)(BX*1)
15452 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15453
15454emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
15455 MOVW (CX), SI
15456 MOVB 2(CX), CL
15457 MOVW SI, (AX)
15458 MOVB CL, 2(AX)
15459 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15460
15461emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
15462 MOVL (CX), SI
15463 MOVL -4(CX)(BX*1), CX
15464 MOVL SI, (AX)
15465 MOVL CX, -4(AX)(BX*1)
15466 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15467
15468emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
15469 MOVQ (CX), SI
15470 MOVQ -8(CX)(BX*1), CX
15471 MOVQ SI, (AX)
15472 MOVQ CX, -8(AX)(BX*1)
15473 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15474
15475emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
15476 MOVOU (CX), X0
15477 MOVOU -16(CX)(BX*1), X1
15478 MOVOU X0, (AX)
15479 MOVOU X1, -16(AX)(BX*1)
15480 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15481
15482emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
15483 MOVOU (CX), X0
15484 MOVOU 16(CX), X1
15485 MOVOU -32(CX)(BX*1), X2
15486 MOVOU -16(CX)(BX*1), X3
15487 MOVOU X0, (AX)
15488 MOVOU X1, 16(AX)
15489 MOVOU X2, -32(AX)(BX*1)
15490 MOVOU X3, -16(AX)(BX*1)
15491
15492memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
15493 MOVQ DX, AX // advance dst past the literals
15494 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
15495

// Long copy of the trailing literals; same scheme as the long literal copy
// in the match path (head/tail saved in X0-X3, aligned 32-byte stores for
// the middle).
15496memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
15497 LEAQ (AX)(SI*1), DX // DX = end of literals in dst
15498 MOVL SI, BX // BX = n
15499
15500 // genMemMoveLong
15501 MOVOU (CX), X0
15502 MOVOU 16(CX), X1
15503 MOVOU -32(CX)(BX*1), X2
15504 MOVOU -16(CX)(BX*1), X3
15505 MOVQ BX, DI
15506 SHRQ $0x05, DI // DI = n / 32
15507 MOVQ AX, SI
15508 ANDL $0x0000001f, SI // SI = AX mod 32
15509 MOVQ $0x00000040, R8
15510 SUBQ SI, R8 // R8 = 64 - (AX mod 32), so AX+R8-32 is 32-byte aligned
15511 DECQ DI
15512 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15513 LEAQ -32(CX)(R8*1), SI
15514 LEAQ -32(AX)(R8*1), R9
15515
15516emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
15517 MOVOU (SI), X4
15518 MOVOU 16(SI), X5
15519 MOVOA X4, (R9)
15520 MOVOA X5, 16(R9)
15521 ADDQ $0x20, R9
15522 ADDQ $0x20, SI
15523 ADDQ $0x20, R8
15524 DECQ DI
15525 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
15526
15527emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
15528 MOVOU -32(CX)(R8*1), X4
15529 MOVOU -16(CX)(R8*1), X5
15530 MOVOA X4, -32(AX)(R8*1)
15531 MOVOA X5, -16(AX)(R8*1)
15532 ADDQ $0x20, R8
15533 CMPQ BX, R8
15534 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15535 MOVOU X0, (AX) // finally store the saved head and tail
15536 MOVOU X1, 16(AX)
15537 MOVOU X2, -32(AX)(BX*1)
15538 MOVOU X3, -16(AX)(BX*1)
15539 MOVQ DX, AX
15540

// Return the number of bytes written: AX - dst_base.
15541emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
15542 MOVQ dst_base+0(FP), CX
15543 SUBQ CX, AX
15544 MOVQ AX, ret+48(FP)
15545 RET
15546
15547// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
15548// Requires: BMI, SSE2
15549TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
15550 MOVQ dst_base+0(FP), AX
15551 MOVQ $0x00000a00, CX
15552 LEAQ 24(SP), DX
15553 PXOR X0, X0
15554
15555zero_loop_encodeSnappyBetterBlockAsm64K:
15556 MOVOU X0, (DX)
15557 MOVOU X0, 16(DX)
15558 MOVOU X0, 32(DX)
15559 MOVOU X0, 48(DX)
15560 MOVOU X0, 64(DX)
15561 MOVOU X0, 80(DX)
15562 MOVOU X0, 96(DX)
15563 MOVOU X0, 112(DX)
15564 ADDQ $0x80, DX
15565 DECQ CX
15566 JNZ zero_loop_encodeSnappyBetterBlockAsm64K
15567 MOVL $0x00000000, 12(SP)
15568 MOVQ src_len+32(FP), CX
15569 LEAQ -9(CX), DX
15570 LEAQ -8(CX), BX
15571 MOVL BX, 8(SP)
15572 SHRQ $0x05, CX
15573 SUBL CX, DX
15574 LEAQ (AX)(DX*1), DX
15575 MOVQ DX, (SP)
15576 MOVL $0x00000001, CX
15577 MOVL $0x00000000, 16(SP)
15578 MOVQ src_base+24(FP), DX
15579
15580search_loop_encodeSnappyBetterBlockAsm64K:
15581 MOVL CX, BX
15582 SUBL 12(SP), BX
15583 SHRL $0x07, BX
15584 LEAL 1(CX)(BX*1), BX
15585 CMPL BX, 8(SP)
15586 JAE emit_remainder_encodeSnappyBetterBlockAsm64K
15587 MOVQ (DX)(CX*1), SI
15588 MOVL BX, 20(SP)
15589 MOVQ $0x00cf1bbcdcbfa563, R8
15590 MOVQ $0x9e3779b1, BX
15591 MOVQ SI, R9
15592 MOVQ SI, R10
15593 SHLQ $0x08, R9
15594 IMULQ R8, R9
15595 SHRQ $0x30, R9
15596 SHLQ $0x20, R10
15597 IMULQ BX, R10
15598 SHRQ $0x32, R10
15599 MOVL 24(SP)(R9*4), BX
15600 MOVL 262168(SP)(R10*4), DI
15601 MOVL CX, 24(SP)(R9*4)
15602 MOVL CX, 262168(SP)(R10*4)
15603 MOVQ (DX)(BX*1), R9
15604 MOVQ (DX)(DI*1), R10
15605 CMPQ R9, SI
15606 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15607 CMPQ R10, SI
15608 JNE no_short_found_encodeSnappyBetterBlockAsm64K
15609 MOVL DI, BX
15610 JMP candidate_match_encodeSnappyBetterBlockAsm64K
15611
15612no_short_found_encodeSnappyBetterBlockAsm64K:
15613 CMPL R9, SI
15614 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15615 CMPL R10, SI
15616 JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
15617 MOVL 20(SP), CX
15618 JMP search_loop_encodeSnappyBetterBlockAsm64K
15619
15620candidateS_match_encodeSnappyBetterBlockAsm64K:
15621 SHRQ $0x08, SI
15622 MOVQ SI, R9
15623 SHLQ $0x08, R9
15624 IMULQ R8, R9
15625 SHRQ $0x30, R9
15626 MOVL 24(SP)(R9*4), BX
15627 INCL CX
15628 MOVL CX, 24(SP)(R9*4)
15629 CMPL (DX)(BX*1), SI
15630 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15631 DECL CX
15632 MOVL DI, BX
15633
15634candidate_match_encodeSnappyBetterBlockAsm64K:
15635 MOVL 12(SP), SI
15636 TESTL BX, BX
15637 JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
15638
15639match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
15640 CMPL CX, SI
15641 JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K
15642 MOVB -1(DX)(BX*1), DI
15643 MOVB -1(DX)(CX*1), R8
15644 CMPB DI, R8
15645 JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
15646 LEAL -1(CX), CX
15647 DECL BX
15648 JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
15649 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
15650
15651match_extend_back_end_encodeSnappyBetterBlockAsm64K:
15652 MOVL CX, SI
15653 SUBL 12(SP), SI
15654 LEAQ 3(AX)(SI*1), SI
15655 CMPQ SI, (SP)
15656 JB match_dst_size_check_encodeSnappyBetterBlockAsm64K
15657 MOVQ $0x00000000, ret+48(FP)
15658 RET
15659
15660match_dst_size_check_encodeSnappyBetterBlockAsm64K:
15661 MOVL CX, SI
15662 ADDL $0x04, CX
15663 ADDL $0x04, BX
15664 MOVQ src_len+32(FP), DI
15665 SUBL CX, DI
15666 LEAQ (DX)(CX*1), R8
15667 LEAQ (DX)(BX*1), R9
15668
15669 // matchLen
15670 XORL R11, R11
15671
15672matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K:
15673 CMPL DI, $0x10
15674 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
15675 MOVQ (R8)(R11*1), R10
15676 MOVQ 8(R8)(R11*1), R12
15677 XORQ (R9)(R11*1), R10
15678 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
15679 XORQ 8(R9)(R11*1), R12
15680 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
15681 LEAL -16(DI), DI
15682 LEAL 16(R11), R11
15683 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K
15684
15685matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K:
15686#ifdef GOAMD64_v3
15687 TZCNTQ R12, R12
15688
15689#else
15690 BSFQ R12, R12
15691
15692#endif
15693 SARQ $0x03, R12
15694 LEAL 8(R11)(R12*1), R11
15695 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
15696
15697matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
15698 CMPL DI, $0x08
15699 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
15700 MOVQ (R8)(R11*1), R10
15701 XORQ (R9)(R11*1), R10
15702 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
15703 LEAL -8(DI), DI
15704 LEAL 8(R11), R11
15705 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
15706
15707matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K:
15708#ifdef GOAMD64_v3
15709 TZCNTQ R10, R10
15710
15711#else
15712 BSFQ R10, R10
15713
15714#endif
15715 SARQ $0x03, R10
15716 LEAL (R11)(R10*1), R11
15717 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
15718
15719matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
15720 CMPL DI, $0x04
15721 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
15722 MOVL (R8)(R11*1), R10
15723 CMPL (R9)(R11*1), R10
15724 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
15725 LEAL -4(DI), DI
15726 LEAL 4(R11), R11
15727
15728matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
15729 CMPL DI, $0x01
15730 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
15731 JB match_nolit_end_encodeSnappyBetterBlockAsm64K
15732 MOVW (R8)(R11*1), R10
15733 CMPW (R9)(R11*1), R10
15734 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
15735 LEAL 2(R11), R11
15736 SUBL $0x02, DI
15737 JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
15738
15739matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
15740 MOVB (R8)(R11*1), R10
15741 CMPB (R9)(R11*1), R10
15742 JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
15743 LEAL 1(R11), R11
15744
15745match_nolit_end_encodeSnappyBetterBlockAsm64K:
15746 MOVL CX, DI
15747 SUBL BX, DI
15748
15749 // Check if repeat
15750 MOVL DI, 16(SP)
15751 MOVL 12(SP), BX
15752 CMPL BX, SI
15753 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
15754 MOVL SI, R8
15755 MOVL SI, 12(SP)
15756 LEAQ (DX)(BX*1), R9
15757 SUBL BX, R8
15758 LEAL -1(R8), BX
15759 CMPL BX, $0x3c
15760 JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K
15761 CMPL BX, $0x00000100
15762 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
15763 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K
15764
15765three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
15766 MOVB $0xf4, (AX)
15767 MOVW BX, 1(AX)
15768 ADDQ $0x03, AX
15769 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
15770
15771two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
15772 MOVB $0xf0, (AX)
15773 MOVB BL, 1(AX)
15774 ADDQ $0x02, AX
15775 CMPL BX, $0x40
15776 JB memmove_match_emit_encodeSnappyBetterBlockAsm64K
15777 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
15778
15779one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
15780 SHLB $0x02, BL
15781 MOVB BL, (AX)
15782 ADDQ $0x01, AX
15783
15784memmove_match_emit_encodeSnappyBetterBlockAsm64K:
15785 LEAQ (AX)(R8*1), BX
15786
15787 // genMemMoveShort
15788 CMPQ R8, $0x08
15789 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
15790 CMPQ R8, $0x10
15791 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
15792 CMPQ R8, $0x20
15793 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
15794 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
15795
15796emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
15797 MOVQ (R9), R10
15798 MOVQ R10, (AX)
15799 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15800
15801emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
15802 MOVQ (R9), R10
15803 MOVQ -8(R9)(R8*1), R9
15804 MOVQ R10, (AX)
15805 MOVQ R9, -8(AX)(R8*1)
15806 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15807
15808emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
15809 MOVOU (R9), X0
15810 MOVOU -16(R9)(R8*1), X1
15811 MOVOU X0, (AX)
15812 MOVOU X1, -16(AX)(R8*1)
15813 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15814
15815emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
15816 MOVOU (R9), X0
15817 MOVOU 16(R9), X1
15818 MOVOU -32(R9)(R8*1), X2
15819 MOVOU -16(R9)(R8*1), X3
15820 MOVOU X0, (AX)
15821 MOVOU X1, 16(AX)
15822 MOVOU X2, -32(AX)(R8*1)
15823 MOVOU X3, -16(AX)(R8*1)
15824
15825memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
15826 MOVQ BX, AX
15827 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
15828
15829memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
15830 LEAQ (AX)(R8*1), BX
15831
15832 // genMemMoveLong
15833 MOVOU (R9), X0
15834 MOVOU 16(R9), X1
15835 MOVOU -32(R9)(R8*1), X2
15836 MOVOU -16(R9)(R8*1), X3
15837 MOVQ R8, R12
15838 SHRQ $0x05, R12
15839 MOVQ AX, R10
15840 ANDL $0x0000001f, R10
15841 MOVQ $0x00000040, R13
15842 SUBQ R10, R13
15843 DECQ R12
15844 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
15845 LEAQ -32(R9)(R13*1), R10
15846 LEAQ -32(AX)(R13*1), R14
15847
15848emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
15849 MOVOU (R10), X4
15850 MOVOU 16(R10), X5
15851 MOVOA X4, (R14)
15852 MOVOA X5, 16(R14)
15853 ADDQ $0x20, R14
15854 ADDQ $0x20, R10
15855 ADDQ $0x20, R13
15856 DECQ R12
15857 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
15858
15859emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
15860 MOVOU -32(R9)(R13*1), X4
15861 MOVOU -16(R9)(R13*1), X5
15862 MOVOA X4, -32(AX)(R13*1)
15863 MOVOA X5, -16(AX)(R13*1)
15864 ADDQ $0x20, R13
15865 CMPQ R8, R13
15866 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
15867 MOVOU X0, (AX)
15868 MOVOU X1, 16(AX)
15869 MOVOU X2, -32(AX)(R8*1)
15870 MOVOU X3, -16(AX)(R8*1)
15871 MOVQ BX, AX
15872
15873emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
15874 ADDL R11, CX
15875 ADDL $0x04, R11
15876 MOVL CX, 12(SP)
15877
15878 // emitCopy
15879two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
15880 CMPL R11, $0x40
15881 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
15882 MOVB $0xee, (AX)
15883 MOVW DI, 1(AX)
15884 LEAL -60(R11), R11
15885 ADDQ $0x03, AX
15886 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
15887
15888two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
15889 MOVL R11, BX
15890 SHLL $0x02, BX
15891 CMPL R11, $0x0c
15892 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
15893 CMPL DI, $0x00000800
15894 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
15895 LEAL -15(BX), BX
15896 MOVB DI, 1(AX)
15897 SHRL $0x08, DI
15898 SHLL $0x05, DI
15899 ORL DI, BX
15900 MOVB BL, (AX)
15901 ADDQ $0x02, AX
15902 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
15903
15904emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
15905 LEAL -2(BX), BX
15906 MOVB BL, (AX)
15907 MOVW DI, 1(AX)
15908 ADDQ $0x03, AX
15909
15910match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
15911 CMPL CX, 8(SP)
15912 JAE emit_remainder_encodeSnappyBetterBlockAsm64K
15913 CMPQ AX, (SP)
15914 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
15915 MOVQ $0x00000000, ret+48(FP)
15916 RET
15917
15918match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
15919 MOVQ $0x00cf1bbcdcbfa563, BX
15920 MOVQ $0x9e3779b1, DI
15921 LEAQ 1(SI), SI
15922 LEAQ -2(CX), R8
15923 MOVQ (DX)(SI*1), R9
15924 MOVQ 1(DX)(SI*1), R10
15925 MOVQ (DX)(R8*1), R11
15926 MOVQ 1(DX)(R8*1), R12
15927 SHLQ $0x08, R9
15928 IMULQ BX, R9
15929 SHRQ $0x30, R9
15930 SHLQ $0x20, R10
15931 IMULQ DI, R10
15932 SHRQ $0x32, R10
15933 SHLQ $0x08, R11
15934 IMULQ BX, R11
15935 SHRQ $0x30, R11
15936 SHLQ $0x20, R12
15937 IMULQ DI, R12
15938 SHRQ $0x32, R12
15939 LEAQ 1(SI), DI
15940 LEAQ 1(R8), R13
15941 MOVL SI, 24(SP)(R9*4)
15942 MOVL R8, 24(SP)(R11*4)
15943 MOVL DI, 262168(SP)(R10*4)
15944 MOVL R13, 262168(SP)(R12*4)
15945 LEAQ 1(R8)(SI*1), DI
15946 SHRQ $0x01, DI
15947 ADDQ $0x01, SI
15948 SUBQ $0x01, R8
15949
15950index_loop_encodeSnappyBetterBlockAsm64K:
15951 CMPQ DI, R8
15952 JAE search_loop_encodeSnappyBetterBlockAsm64K
15953 MOVQ (DX)(SI*1), R9
15954 MOVQ (DX)(DI*1), R10
15955 SHLQ $0x08, R9
15956 IMULQ BX, R9
15957 SHRQ $0x30, R9
15958 SHLQ $0x08, R10
15959 IMULQ BX, R10
15960 SHRQ $0x30, R10
15961 MOVL SI, 24(SP)(R9*4)
15962 MOVL DI, 24(SP)(R10*4)
15963 ADDQ $0x02, SI
15964 ADDQ $0x02, DI
15965 JMP index_loop_encodeSnappyBetterBlockAsm64K
15966
15967emit_remainder_encodeSnappyBetterBlockAsm64K:
15968 MOVQ src_len+32(FP), CX
15969 SUBL 12(SP), CX
15970 LEAQ 3(AX)(CX*1), CX
15971 CMPQ CX, (SP)
15972 JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K
15973 MOVQ $0x00000000, ret+48(FP)
15974 RET
15975
15976emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
15977 MOVQ src_len+32(FP), CX
15978 MOVL 12(SP), BX
15979 CMPL BX, CX
15980 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
15981 MOVL CX, SI
15982 MOVL CX, 12(SP)
15983 LEAQ (DX)(BX*1), CX
15984 SUBL BX, SI
15985 LEAL -1(SI), DX
15986 CMPL DX, $0x3c
15987 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
15988 CMPL DX, $0x00000100
15989 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
15990 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
15991
15992three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
15993 MOVB $0xf4, (AX)
15994 MOVW DX, 1(AX)
15995 ADDQ $0x03, AX
15996 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
15997
15998two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
15999 MOVB $0xf0, (AX)
16000 MOVB DL, 1(AX)
16001 ADDQ $0x02, AX
16002 CMPL DX, $0x40
16003 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
16004 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
16005
16006one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
16007 SHLB $0x02, DL
16008 MOVB DL, (AX)
16009 ADDQ $0x01, AX
16010
16011memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
16012 LEAQ (AX)(SI*1), DX
16013 MOVL SI, BX
16014
16015 // genMemMoveShort
16016 CMPQ BX, $0x03
16017 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
16018 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
16019 CMPQ BX, $0x08
16020 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
16021 CMPQ BX, $0x10
16022 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
16023 CMPQ BX, $0x20
16024 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
16025 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
16026
16027emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
16028 MOVB (CX), SI
16029 MOVB -1(CX)(BX*1), CL
16030 MOVB SI, (AX)
16031 MOVB CL, -1(AX)(BX*1)
16032 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16033
16034emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
16035 MOVW (CX), SI
16036 MOVB 2(CX), CL
16037 MOVW SI, (AX)
16038 MOVB CL, 2(AX)
16039 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16040
16041emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
16042 MOVL (CX), SI
16043 MOVL -4(CX)(BX*1), CX
16044 MOVL SI, (AX)
16045 MOVL CX, -4(AX)(BX*1)
16046 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16047
16048emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
16049 MOVQ (CX), SI
16050 MOVQ -8(CX)(BX*1), CX
16051 MOVQ SI, (AX)
16052 MOVQ CX, -8(AX)(BX*1)
16053 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16054
16055emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
16056 MOVOU (CX), X0
16057 MOVOU -16(CX)(BX*1), X1
16058 MOVOU X0, (AX)
16059 MOVOU X1, -16(AX)(BX*1)
16060 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16061
16062emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
16063 MOVOU (CX), X0
16064 MOVOU 16(CX), X1
16065 MOVOU -32(CX)(BX*1), X2
16066 MOVOU -16(CX)(BX*1), X3
16067 MOVOU X0, (AX)
16068 MOVOU X1, 16(AX)
16069 MOVOU X2, -32(AX)(BX*1)
16070 MOVOU X3, -16(AX)(BX*1)
16071
16072memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
16073 MOVQ DX, AX
16074 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
16075
16076memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
16077 LEAQ (AX)(SI*1), DX
16078 MOVL SI, BX
16079
16080 // genMemMoveLong
16081 MOVOU (CX), X0
16082 MOVOU 16(CX), X1
16083 MOVOU -32(CX)(BX*1), X2
16084 MOVOU -16(CX)(BX*1), X3
16085 MOVQ BX, DI
16086 SHRQ $0x05, DI
16087 MOVQ AX, SI
16088 ANDL $0x0000001f, SI
16089 MOVQ $0x00000040, R8
16090 SUBQ SI, R8
16091 DECQ DI
16092 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
16093 LEAQ -32(CX)(R8*1), SI
16094 LEAQ -32(AX)(R8*1), R9
16095
16096emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
16097 MOVOU (SI), X4
16098 MOVOU 16(SI), X5
16099 MOVOA X4, (R9)
16100 MOVOA X5, 16(R9)
16101 ADDQ $0x20, R9
16102 ADDQ $0x20, SI
16103 ADDQ $0x20, R8
16104 DECQ DI
16105 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
16106
16107emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
16108 MOVOU -32(CX)(R8*1), X4
16109 MOVOU -16(CX)(R8*1), X5
16110 MOVOA X4, -32(AX)(R8*1)
16111 MOVOA X5, -16(AX)(R8*1)
16112 ADDQ $0x20, R8
16113 CMPQ BX, R8
16114 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
16115 MOVOU X0, (AX)
16116 MOVOU X1, 16(AX)
16117 MOVOU X2, -32(AX)(BX*1)
16118 MOVOU X3, -16(AX)(BX*1)
16119 MOVQ DX, AX
16120
16121emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
16122 MOVQ dst_base+0(FP), CX
16123 SUBQ CX, AX
16124 MOVQ AX, ret+48(FP)
16125 RET
16126
16127// func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int
16128// Requires: BMI, SSE2
//
// Encodes src into dst and stores the number of bytes written in ret+48(FP).
// Every early exit below stores 0 in ret+48(FP) instead; those exits are taken
// when the dst cursor (plus pending literals) would reach the limit kept at 0(SP).
//
// Frame layout as used by this routine ($81944 = 24 + 65536 + 16384):
//   0(SP)      dst pointer limit: dst_base + (src_len - 9 - src_len>>5)
//   8(SP)      src_len - 8: bound compared against the search position
//   12(SP)     nextEmit: src offset of the first byte not yet written to dst
//   16(SP)     offset of the most recent match (written here, never read here)
//   20(SP)     search position to resume from if the current one yields no match
//   24(SP)     table of 16384 uint32 src offsets, indexed by a 14-bit hash of 6 bytes
//   65560(SP)  table of 4096 uint32 src offsets, indexed by a 12-bit hash of 4 bytes
//
// Long-lived registers: AX = dst write cursor, CX = current src offset (s),
// DX = src base pointer (until emit_remainder, where DX is reused).
16129TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56
16130 MOVQ dst_base+0(FP), AX
// 0x280 (640) iterations x 128 bytes = 81920 bytes: clears both tables.
16131 MOVQ $0x00000280, CX
16132 LEAQ 24(SP), DX
16133 PXOR X0, X0
16134
16135zero_loop_encodeSnappyBetterBlockAsm12B:
16136 MOVOU X0, (DX)
16137 MOVOU X0, 16(DX)
16138 MOVOU X0, 32(DX)
16139 MOVOU X0, 48(DX)
16140 MOVOU X0, 64(DX)
16141 MOVOU X0, 80(DX)
16142 MOVOU X0, 96(DX)
16143 MOVOU X0, 112(DX)
16144 ADDQ $0x80, DX
16145 DECQ CX
16146 JNZ zero_loop_encodeSnappyBetterBlockAsm12B
// nextEmit = 0.
16147 MOVL $0x00000000, 12(SP)
// Compute the search bound (src_len - 8) and the dst limit
// dst_base + src_len - 9 - (src_len >> 5).
16148 MOVQ src_len+32(FP), CX
16149 LEAQ -9(CX), DX
16150 LEAQ -8(CX), BX
16151 MOVL BX, 8(SP)
16152 SHRQ $0x05, CX
16153 SUBL CX, DX
16154 LEAQ (AX)(DX*1), DX
16155 MOVQ DX, (SP)
// Start searching at s = 1; last-match offset starts at 0.
16156 MOVL $0x00000001, CX
16157 MOVL $0x00000000, 16(SP)
16158 MOVQ src_base+24(FP), DX
16159
// Main search loop. The step grows with the distance since the last emit:
// next = s + 1 + ((s - nextEmit) >> 6). Leaves for emit_remainder once
// next >= 8(SP).
16160search_loop_encodeSnappyBetterBlockAsm12B:
16161 MOVL CX, BX
16162 SUBL 12(SP), BX
16163 SHRL $0x06, BX
16164 LEAL 1(CX)(BX*1), BX
16165 CMPL BX, 8(SP)
16166 JAE emit_remainder_encodeSnappyBetterBlockAsm12B
// SI = 8 bytes at src[s]; remember the resume position in 20(SP).
16167 MOVQ (DX)(CX*1), SI
16168 MOVL BX, 20(SP)
// R9 = 14-bit hash of the low 6 bytes (shift out the top 16 bits, multiply, keep top 14).
// R10 = 12-bit hash of the low 4 bytes (shift out the top 32 bits, multiply, keep top 12).
16169 MOVQ $0x0000cf1bbcdcbf9b, R8
16170 MOVQ $0x9e3779b1, BX
16171 MOVQ SI, R9
16172 MOVQ SI, R10
16173 SHLQ $0x10, R9
16174 IMULQ R8, R9
16175 SHRQ $0x32, R9
16176 SHLQ $0x20, R10
16177 IMULQ BX, R10
16178 SHRQ $0x34, R10
// Fetch both candidates (BX from the 6-byte table, DI from the 4-byte table),
// then record s in both slots.
16179 MOVL 24(SP)(R9*4), BX
16180 MOVL 65560(SP)(R10*4), DI
16181 MOVL CX, 24(SP)(R9*4)
16182 MOVL CX, 65560(SP)(R10*4)
// Load 8 bytes at each candidate and prefer a full 8-byte match.
16183 MOVQ (DX)(BX*1), R9
16184 MOVQ (DX)(DI*1), R10
16185 CMPQ R9, SI
16186 JEQ candidate_match_encodeSnappyBetterBlockAsm12B
16187 CMPQ R10, SI
16188 JNE no_short_found_encodeSnappyBetterBlockAsm12B
16189 MOVL DI, BX
16190 JMP candidate_match_encodeSnappyBetterBlockAsm12B
16191
// No 8-byte match: fall back to 4-byte comparisons of the same candidates.
16192no_short_found_encodeSnappyBetterBlockAsm12B:
16193 CMPL R9, SI
16194 JEQ candidate_match_encodeSnappyBetterBlockAsm12B
16195 CMPL R10, SI
16196 JEQ candidateS_match_encodeSnappyBetterBlockAsm12B
// Nothing matched: advance s to the saved resume position.
16197 MOVL 20(SP), CX
16198 JMP search_loop_encodeSnappyBetterBlockAsm12B
16199
// Only the 4-byte-table candidate matched (4 bytes). Before accepting it,
// probe the 6-byte table at s+1: SI >> 8 is the data starting at src[s+1].
// If that candidate matches 4 bytes, use it with s+1; otherwise undo the
// increment and keep the 4-byte-table candidate (DI).
16200candidateS_match_encodeSnappyBetterBlockAsm12B:
16201 SHRQ $0x08, SI
16202 MOVQ SI, R9
16203 SHLQ $0x10, R9
16204 IMULQ R8, R9
16205 SHRQ $0x32, R9
16206 MOVL 24(SP)(R9*4), BX
16207 INCL CX
16208 MOVL CX, 24(SP)(R9*4)
16209 CMPL (DX)(BX*1), SI
16210 JEQ candidate_match_encodeSnappyBetterBlockAsm12B
16211 DECL CX
16212 MOVL DI, BX
16213
// BX = candidate offset, CX = s. Extend the match backwards byte by byte
// while s > nextEmit, the candidate offset is non-zero, and the preceding
// bytes are equal.
16214candidate_match_encodeSnappyBetterBlockAsm12B:
16215 MOVL 12(SP), SI
16216 TESTL BX, BX
16217 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
16218
16219match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
16220 CMPL CX, SI
16221 JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B
16222 MOVB -1(DX)(BX*1), DI
16223 MOVB -1(DX)(CX*1), R8
16224 CMPB DI, R8
16225 JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
16226 LEAL -1(CX), CX
16227 DECL BX
16228 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
16229 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
16230
// Bail out with ret = 0 unless dst cursor + pending literal length + 3 is
// below the dst limit.
16231match_extend_back_end_encodeSnappyBetterBlockAsm12B:
16232 MOVL CX, SI
16233 SUBL 12(SP), SI
16234 LEAQ 3(AX)(SI*1), SI
16235 CMPQ SI, (SP)
16236 JB match_dst_size_check_encodeSnappyBetterBlockAsm12B
16237 MOVQ $0x00000000, ret+48(FP)
16238 RET
16239
// SI = match start (s). Skip the 4 bytes already compared on both sides,
// DI = bytes remaining in src after s+4, R8/R9 = pointers to the two
// sequences being compared.
16240match_dst_size_check_encodeSnappyBetterBlockAsm12B:
16241 MOVL CX, SI
16242 ADDL $0x04, CX
16243 ADDL $0x04, BX
16244 MOVQ src_len+32(FP), DI
16245 SUBL CX, DI
16246 LEAQ (DX)(CX*1), R8
16247 LEAQ (DX)(BX*1), R9
16248
// R11 accumulates the number of additional matching bytes. Compared in
// 16-, 8-, 4-, 2- and 1-byte steps, never examining more than DI bytes.
16249 // matchLen
16250 XORL R11, R11
16251
16252matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B:
16253 CMPL DI, $0x10
16254 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B
16255 MOVQ (R8)(R11*1), R10
16256 MOVQ 8(R8)(R11*1), R12
16257 XORQ (R9)(R11*1), R10
16258 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
16259 XORQ 8(R9)(R11*1), R12
16260 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B
16261 LEAL -16(DI), DI
16262 LEAL 16(R11), R11
16263 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B
16264
// Mismatch in the second 8 bytes: index of lowest set bit / 8 gives the
// number of equal bytes within that word; add 8 for the first word.
16265matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B:
16266#ifdef GOAMD64_v3
16267 TZCNTQ R12, R12
16268
16269#else
16270 BSFQ R12, R12
16271
16272#endif
16273 SARQ $0x03, R12
16274 LEAL 8(R11)(R12*1), R11
16275 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
16276
16277matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B:
16278 CMPL DI, $0x08
16279 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
16280 MOVQ (R8)(R11*1), R10
16281 XORQ (R9)(R11*1), R10
16282 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
16283 LEAL -8(DI), DI
16284 LEAL 8(R11), R11
16285 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
16286
// Mismatch within an 8-byte word: lowest differing bit / 8 = equal bytes.
16287matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B:
16288#ifdef GOAMD64_v3
16289 TZCNTQ R10, R10
16290
16291#else
16292 BSFQ R10, R10
16293
16294#endif
16295 SARQ $0x03, R10
16296 LEAL (R11)(R10*1), R11
16297 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
16298
16299matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
16300 CMPL DI, $0x04
16301 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
16302 MOVL (R8)(R11*1), R10
16303 CMPL (R9)(R11*1), R10
16304 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
16305 LEAL -4(DI), DI
16306 LEAL 4(R11), R11
16307
// DI == 1 goes straight to the single-byte compare; DI == 0 is done.
16308matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
16309 CMPL DI, $0x01
16310 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
16311 JB match_nolit_end_encodeSnappyBetterBlockAsm12B
16312 MOVW (R8)(R11*1), R10
16313 CMPW (R9)(R11*1), R10
16314 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
16315 LEAL 2(R11), R11
16316 SUBL $0x02, DI
16317 JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
16318
16319matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
16320 MOVB (R8)(R11*1), R10
16321 CMPB (R9)(R11*1), R10
16322 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
16323 LEAL 1(R11), R11
16324
// DI = match offset (s+4 minus candidate+4, i.e. s - candidate).
16325match_nolit_end_encodeSnappyBetterBlockAsm12B:
16326 MOVL CX, DI
16327 SUBL BX, DI
16328
// The offset is saved to 16(SP); no repeat-specific branch follows in this routine.
16329 // Check if repeat
16330 MOVL DI, 16(SP)
// Emit pending literals src[nextEmit:matchStart], if any.
// R8 = literal length, R9 = literal source pointer, BX = length - 1.
16331 MOVL 12(SP), BX
16332 CMPL BX, SI
16333 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
16334 MOVL SI, R8
16335 MOVL SI, 12(SP)
16336 LEAQ (DX)(BX*1), R9
16337 SUBL BX, R8
16338 LEAL -1(R8), BX
// Header size depends on length-1: < 60 -> 1 byte, < 256 -> 2 bytes, else 3 bytes.
16339 CMPL BX, $0x3c
16340 JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B
16341 CMPL BX, $0x00000100
16342 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B
// NOTE: flags are unchanged since the CMPL above, so this JB cannot be taken
// here; execution falls through to the three-byte case directly below.
16343 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B
16344
// Tag 0xf4 (61<<2) followed by a 16-bit (length-1).
16345three_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
16346 MOVB $0xf4, (AX)
16347 MOVW BX, 1(AX)
16348 ADDQ $0x03, AX
16349 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
16350
// Tag 0xf0 (60<<2) followed by an 8-bit (length-1). Lengths with
// length-1 < 64 use the short copy, larger ones the long copy.
16351two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
16352 MOVB $0xf0, (AX)
16353 MOVB BL, 1(AX)
16354 ADDQ $0x02, AX
16355 CMPL BX, $0x40
16356 JB memmove_match_emit_encodeSnappyBetterBlockAsm12B
16357 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
16358
// Single tag byte: (length-1) << 2.
16359one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
16360 SHLB $0x02, BL
16361 MOVB BL, (AX)
16362 ADDQ $0x01, AX
16363
// BX = dst cursor after the literal bytes; AX is set to it once the copy is done.
16364memmove_match_emit_encodeSnappyBetterBlockAsm12B:
16365 LEAQ (AX)(R8*1), BX
16366
// Size-class dispatch on R8: <= 8, <= 16, <= 32, otherwise up to 64 bytes.
16367 // genMemMoveShort
16368 CMPQ R8, $0x08
16369 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
16370 CMPQ R8, $0x10
16371 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
16372 CMPQ R8, $0x20
16373 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
16374 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
16375
// Always moves a full 8 bytes even when R8 < 8; only R8 bytes are kept because
// AX advances to BX afterwards. NOTE(review): presumably the src/dst margins set
// up at function entry make the over-read/over-write safe — confirm.
16376emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
16377 MOVQ (R9), R10
16378 MOVQ R10, (AX)
16379 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16380
// First 8 and last 8 bytes (the two moves may overlap).
16381emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
16382 MOVQ (R9), R10
16383 MOVQ -8(R9)(R8*1), R9
16384 MOVQ R10, (AX)
16385 MOVQ R9, -8(AX)(R8*1)
16386 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16387
// First 16 and last 16 bytes (the two moves may overlap).
16388emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
16389 MOVOU (R9), X0
16390 MOVOU -16(R9)(R8*1), X1
16391 MOVOU X0, (AX)
16392 MOVOU X1, -16(AX)(R8*1)
16393 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16394
// First 32 and last 32 bytes (the two halves may overlap).
16395emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
16396 MOVOU (R9), X0
16397 MOVOU 16(R9), X1
16398 MOVOU -32(R9)(R8*1), X2
16399 MOVOU -16(R9)(R8*1), X3
16400 MOVOU X0, (AX)
16401 MOVOU X1, 16(AX)
16402 MOVOU X2, -32(AX)(R8*1)
16403 MOVOU X3, -16(AX)(R8*1)
16404
16405memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
16406 MOVQ BX, AX
16407 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
16408
16409memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
16410 LEAQ (AX)(R8*1), BX
16411
// Long copy of R8 bytes from R9 to AX. The first and last 32 bytes are
// loaded up front into X0-X3 and stored last. The middle is copied in
// 32-byte chunks starting at offset R13 - 32 = 32 - (AX & 31), which makes
// the destination addresses of the MOVOA stores 32-byte aligned.
16412 // genMemMoveLong
16413 MOVOU (R9), X0
16414 MOVOU 16(R9), X1
16415 MOVOU -32(R9)(R8*1), X2
16416 MOVOU -16(R9)(R8*1), X3
// R12 = R8 / 32; R13 = 64 - (AX & 31).
16417 MOVQ R8, R12
16418 SHRQ $0x05, R12
16419 MOVQ AX, R10
16420 ANDL $0x0000001f, R10
16421 MOVQ $0x00000040, R13
16422 SUBQ R10, R13
16423 DECQ R12
16424 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16425 LEAQ -32(R9)(R13*1), R10
16426 LEAQ -32(AX)(R13*1), R14
16427
16428emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
16429 MOVOU (R10), X4
16430 MOVOU 16(R10), X5
16431 MOVOA X4, (R14)
16432 MOVOA X5, 16(R14)
16433 ADDQ $0x20, R14
16434 ADDQ $0x20, R10
16435 ADDQ $0x20, R13
16436 DECQ R12
16437 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
16438
// Copy 32 bytes per iteration at offset R13 - 32 while R8 >= R13, then store
// the saved head and tail.
16439emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
16440 MOVOU -32(R9)(R13*1), X4
16441 MOVOU -16(R9)(R13*1), X5
16442 MOVOA X4, -32(AX)(R13*1)
16443 MOVOA X5, -16(AX)(R13*1)
16444 ADDQ $0x20, R13
16445 CMPQ R8, R13
16446 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16447 MOVOU X0, (AX)
16448 MOVOU X1, 16(AX)
16449 MOVOU X2, -32(AX)(R8*1)
16450 MOVOU X3, -16(AX)(R8*1)
16451 MOVQ BX, AX
16452
// s += extra match bytes; R11 = total match length (extra + 4);
// nextEmit = s (end of the match).
16453emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
16454 ADDL R11, CX
16455 ADDL $0x04, R11
16456 MOVL CX, 12(SP)
16457
// While length > 64: emit tag 0xee (59<<2 | 2) plus the 16-bit offset,
// and subtract 60 from the remaining length.
16458 // emitCopy
16459two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
16460 CMPL R11, $0x40
16461 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
16462 MOVB $0xee, (AX)
16463 MOVW DI, 1(AX)
16464 LEAL -60(R11), R11
16465 ADDQ $0x03, AX
16466 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
16467
// Length <= 64. Uses the 2-byte form only when length < 12 and offset < 2048:
// tag = length*4 - 15 = ((length-4)<<2 | 1), OR'd with (offset>>8)<<5,
// followed by the low offset byte.
16468two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
16469 MOVL R11, BX
16470 SHLL $0x02, BX
16471 CMPL R11, $0x0c
16472 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
16473 CMPL DI, $0x00000800
16474 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B
16475 LEAL -15(BX), BX
16476 MOVB DI, 1(AX)
16477 SHRL $0x08, DI
16478 SHLL $0x05, DI
16479 ORL DI, BX
16480 MOVB BL, (AX)
16481 ADDQ $0x02, AX
16482 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
16483
// 3-byte form: tag = length*4 - 2 = ((length-1)<<2 | 2), then the 16-bit offset.
16484emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
16485 LEAL -2(BX), BX
16486 MOVB BL, (AX)
16487 MOVW DI, 1(AX)
16488 ADDQ $0x03, AX
16489
// Finish if s reached the search bound; return 0 if dst reached its limit.
16490match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
16491 CMPL CX, 8(SP)
16492 JAE emit_remainder_encodeSnappyBetterBlockAsm12B
16493 CMPQ AX, (SP)
16494 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
16495 MOVQ $0x00000000, ret+48(FP)
16496 RET
16497
// Index positions inside the match just emitted. SI = matchStart+1 and
// R8 = s-2 are stored in the 6-byte table; SI+1 and R8+1 are stored in the
// 4-byte table (each hashed from the 8 bytes loaded at that position).
16498match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
16499 MOVQ $0x0000cf1bbcdcbf9b, BX
16500 MOVQ $0x9e3779b1, DI
16501 LEAQ 1(SI), SI
16502 LEAQ -2(CX), R8
16503 MOVQ (DX)(SI*1), R9
16504 MOVQ 1(DX)(SI*1), R10
16505 MOVQ (DX)(R8*1), R11
16506 MOVQ 1(DX)(R8*1), R12
16507 SHLQ $0x10, R9
16508 IMULQ BX, R9
16509 SHRQ $0x32, R9
16510 SHLQ $0x20, R10
16511 IMULQ DI, R10
16512 SHRQ $0x34, R10
16513 SHLQ $0x10, R11
16514 IMULQ BX, R11
16515 SHRQ $0x32, R11
16516 SHLQ $0x20, R12
16517 IMULQ DI, R12
16518 SHRQ $0x34, R12
16519 LEAQ 1(SI), DI
16520 LEAQ 1(R8), R13
16521 MOVL SI, 24(SP)(R9*4)
16522 MOVL R8, 24(SP)(R11*4)
16523 MOVL DI, 65560(SP)(R10*4)
16524 MOVL R13, 65560(SP)(R12*4)
// DI = midpoint (SI + R8 + 1) / 2; then narrow the range by one on each side.
16525 LEAQ 1(R8)(SI*1), DI
16526 SHRQ $0x01, DI
16527 ADDQ $0x01, SI
16528 SUBQ $0x01, R8
16529
// While DI < R8: add SI and DI to the 6-byte table, stepping both by 2.
// Then resume the main search loop.
16530index_loop_encodeSnappyBetterBlockAsm12B:
16531 CMPQ DI, R8
16532 JAE search_loop_encodeSnappyBetterBlockAsm12B
16533 MOVQ (DX)(SI*1), R9
16534 MOVQ (DX)(DI*1), R10
16535 SHLQ $0x10, R9
16536 IMULQ BX, R9
16537 SHRQ $0x32, R9
16538 SHLQ $0x10, R10
16539 IMULQ BX, R10
16540 SHRQ $0x32, R10
16541 MOVL SI, 24(SP)(R9*4)
16542 MOVL DI, 24(SP)(R10*4)
16543 ADDQ $0x02, SI
16544 ADDQ $0x02, DI
16545 JMP index_loop_encodeSnappyBetterBlockAsm12B
16546
// Emit src[nextEmit:] as a final literal. Return 0 unless
// dst cursor + remaining length + 3 is below the dst limit.
16547emit_remainder_encodeSnappyBetterBlockAsm12B:
16548 MOVQ src_len+32(FP), CX
16549 SUBL 12(SP), CX
16550 LEAQ 3(AX)(CX*1), CX
16551 CMPQ CX, (SP)
16552 JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B
16553 MOVQ $0x00000000, ret+48(FP)
16554 RET
16555
// Nothing to emit when nextEmit == src_len. Otherwise CX = literal source
// pointer, SI = literal length, DX = length - 1 (DX no longer holds src base).
16556emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
16557 MOVQ src_len+32(FP), CX
16558 MOVL 12(SP), BX
16559 CMPL BX, CX
16560 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
16561 MOVL CX, SI
16562 MOVL CX, 12(SP)
16563 LEAQ (DX)(BX*1), CX
16564 SUBL BX, SI
16565 LEAL -1(SI), DX
// Same header selection as for match literals: 1, 2 or 3 header bytes.
16566 CMPL DX, $0x3c
16567 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
16568 CMPL DX, $0x00000100
16569 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
// NOTE: as above, this JB cannot be taken (flags unchanged); falls through.
16570 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
16571
16572three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
16573 MOVB $0xf4, (AX)
16574 MOVW DX, 1(AX)
16575 ADDQ $0x03, AX
16576 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
16577
16578two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
16579 MOVB $0xf0, (AX)
16580 MOVB DL, 1(AX)
16581 ADDQ $0x02, AX
16582 CMPL DX, $0x40
16583 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
16584 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
16585
16586one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
16587 SHLB $0x02, DL
16588 MOVB DL, (AX)
16589 ADDQ $0x01, AX
16590
// DX = dst cursor after the literal; BX = length.
16591memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
16592 LEAQ (AX)(SI*1), DX
16593 MOVL SI, BX
16594
// Unlike the match-literal copy above, this dispatch has exact size classes
// (1-2, 3, 4-7, 8-16, 17-32, 33-64), so no access goes beyond BX bytes.
16595 // genMemMoveShort
16596 CMPQ BX, $0x03
16597 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
16598 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
16599 CMPQ BX, $0x08
16600 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
16601 CMPQ BX, $0x10
16602 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
16603 CMPQ BX, $0x20
16604 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
16605 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
16606
// First and last byte (the same byte when BX == 1).
16607emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
16608 MOVB (CX), SI
16609 MOVB -1(CX)(BX*1), CL
16610 MOVB SI, (AX)
16611 MOVB CL, -1(AX)(BX*1)
16612 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16613
16614emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
16615 MOVW (CX), SI
16616 MOVB 2(CX), CL
16617 MOVW SI, (AX)
16618 MOVB CL, 2(AX)
16619 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16620
16621emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
16622 MOVL (CX), SI
16623 MOVL -4(CX)(BX*1), CX
16624 MOVL SI, (AX)
16625 MOVL CX, -4(AX)(BX*1)
16626 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16627
16628emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
16629 MOVQ (CX), SI
16630 MOVQ -8(CX)(BX*1), CX
16631 MOVQ SI, (AX)
16632 MOVQ CX, -8(AX)(BX*1)
16633 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16634
16635emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
16636 MOVOU (CX), X0
16637 MOVOU -16(CX)(BX*1), X1
16638 MOVOU X0, (AX)
16639 MOVOU X1, -16(AX)(BX*1)
16640 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16641
16642emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
16643 MOVOU (CX), X0
16644 MOVOU 16(CX), X1
16645 MOVOU -32(CX)(BX*1), X2
16646 MOVOU -16(CX)(BX*1), X3
16647 MOVOU X0, (AX)
16648 MOVOU X1, 16(AX)
16649 MOVOU X2, -32(AX)(BX*1)
16650 MOVOU X3, -16(AX)(BX*1)
16651
16652memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
16653 MOVQ DX, AX
16654 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
16655
16656memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
16657 LEAQ (AX)(SI*1), DX
16658 MOVL SI, BX
16659
// Same long-copy scheme as for match literals, with CX as source, BX as
// length, DI as the 32-byte chunk count and R8 as the running offset
// (initially 64 - (AX & 31)).
16660 // genMemMoveLong
16661 MOVOU (CX), X0
16662 MOVOU 16(CX), X1
16663 MOVOU -32(CX)(BX*1), X2
16664 MOVOU -16(CX)(BX*1), X3
16665 MOVQ BX, DI
16666 SHRQ $0x05, DI
16667 MOVQ AX, SI
16668 ANDL $0x0000001f, SI
16669 MOVQ $0x00000040, R8
16670 SUBQ SI, R8
16671 DECQ DI
16672 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16673 LEAQ -32(CX)(R8*1), SI
16674 LEAQ -32(AX)(R8*1), R9
16675
16676emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
16677 MOVOU (SI), X4
16678 MOVOU 16(SI), X5
16679 MOVOA X4, (R9)
16680 MOVOA X5, 16(R9)
16681 ADDQ $0x20, R9
16682 ADDQ $0x20, SI
16683 ADDQ $0x20, R8
16684 DECQ DI
16685 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
16686
16687emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
16688 MOVOU -32(CX)(R8*1), X4
16689 MOVOU -16(CX)(R8*1), X5
16690 MOVOA X4, -32(AX)(R8*1)
16691 MOVOA X5, -16(AX)(R8*1)
16692 ADDQ $0x20, R8
16693 CMPQ BX, R8
16694 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16695 MOVOU X0, (AX)
16696 MOVOU X1, 16(AX)
16697 MOVOU X2, -32(AX)(BX*1)
16698 MOVOU X3, -16(AX)(BX*1)
16699 MOVQ DX, AX
16700
// Return value = dst cursor - dst_base, i.e. the number of bytes written.
16701emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
16702 MOVQ dst_base+0(FP), CX
16703 SUBQ CX, AX
16704 MOVQ AX, ret+48(FP)
16705 RET
16706
16707// func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int
16708// Requires: BMI, SSE2
16709TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56
16710 MOVQ dst_base+0(FP), AX
16711 MOVQ $0x000000a0, CX
16712 LEAQ 24(SP), DX
16713 PXOR X0, X0
16714
16715zero_loop_encodeSnappyBetterBlockAsm10B:
16716 MOVOU X0, (DX)
16717 MOVOU X0, 16(DX)
16718 MOVOU X0, 32(DX)
16719 MOVOU X0, 48(DX)
16720 MOVOU X0, 64(DX)
16721 MOVOU X0, 80(DX)
16722 MOVOU X0, 96(DX)
16723 MOVOU X0, 112(DX)
16724 ADDQ $0x80, DX
16725 DECQ CX
16726 JNZ zero_loop_encodeSnappyBetterBlockAsm10B
16727 MOVL $0x00000000, 12(SP)
16728 MOVQ src_len+32(FP), CX
16729 LEAQ -9(CX), DX
16730 LEAQ -8(CX), BX
16731 MOVL BX, 8(SP)
16732 SHRQ $0x05, CX
16733 SUBL CX, DX
16734 LEAQ (AX)(DX*1), DX
16735 MOVQ DX, (SP)
16736 MOVL $0x00000001, CX
16737 MOVL $0x00000000, 16(SP)
16738 MOVQ src_base+24(FP), DX
16739
16740search_loop_encodeSnappyBetterBlockAsm10B:
16741 MOVL CX, BX
16742 SUBL 12(SP), BX
16743 SHRL $0x05, BX
16744 LEAL 1(CX)(BX*1), BX
16745 CMPL BX, 8(SP)
16746 JAE emit_remainder_encodeSnappyBetterBlockAsm10B
16747 MOVQ (DX)(CX*1), SI
16748 MOVL BX, 20(SP)
16749 MOVQ $0x0000cf1bbcdcbf9b, R8
16750 MOVQ $0x9e3779b1, BX
16751 MOVQ SI, R9
16752 MOVQ SI, R10
16753 SHLQ $0x10, R9
16754 IMULQ R8, R9
16755 SHRQ $0x34, R9
16756 SHLQ $0x20, R10
16757 IMULQ BX, R10
16758 SHRQ $0x36, R10
16759 MOVL 24(SP)(R9*4), BX
16760 MOVL 16408(SP)(R10*4), DI
16761 MOVL CX, 24(SP)(R9*4)
16762 MOVL CX, 16408(SP)(R10*4)
16763 MOVQ (DX)(BX*1), R9
16764 MOVQ (DX)(DI*1), R10
16765 CMPQ R9, SI
16766 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16767 CMPQ R10, SI
16768 JNE no_short_found_encodeSnappyBetterBlockAsm10B
16769 MOVL DI, BX
16770 JMP candidate_match_encodeSnappyBetterBlockAsm10B
16771
16772no_short_found_encodeSnappyBetterBlockAsm10B:
16773 CMPL R9, SI
16774 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16775 CMPL R10, SI
16776 JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
16777 MOVL 20(SP), CX
16778 JMP search_loop_encodeSnappyBetterBlockAsm10B
16779
16780candidateS_match_encodeSnappyBetterBlockAsm10B:
16781 SHRQ $0x08, SI
16782 MOVQ SI, R9
16783 SHLQ $0x10, R9
16784 IMULQ R8, R9
16785 SHRQ $0x34, R9
16786 MOVL 24(SP)(R9*4), BX
16787 INCL CX
16788 MOVL CX, 24(SP)(R9*4)
16789 CMPL (DX)(BX*1), SI
16790 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16791 DECL CX
16792 MOVL DI, BX
16793
16794candidate_match_encodeSnappyBetterBlockAsm10B:
16795 MOVL 12(SP), SI
16796 TESTL BX, BX
16797 JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
16798
16799match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
16800 CMPL CX, SI
16801 JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B
16802 MOVB -1(DX)(BX*1), DI
16803 MOVB -1(DX)(CX*1), R8
16804 CMPB DI, R8
16805 JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
16806 LEAL -1(CX), CX
16807 DECL BX
16808 JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
16809 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
16810
16811match_extend_back_end_encodeSnappyBetterBlockAsm10B:
16812 MOVL CX, SI
16813 SUBL 12(SP), SI
16814 LEAQ 3(AX)(SI*1), SI
16815 CMPQ SI, (SP)
16816 JB match_dst_size_check_encodeSnappyBetterBlockAsm10B
16817 MOVQ $0x00000000, ret+48(FP)
16818 RET
16819
16820match_dst_size_check_encodeSnappyBetterBlockAsm10B:
16821 MOVL CX, SI
16822 ADDL $0x04, CX
16823 ADDL $0x04, BX
16824 MOVQ src_len+32(FP), DI
16825 SUBL CX, DI
16826 LEAQ (DX)(CX*1), R8
16827 LEAQ (DX)(BX*1), R9
16828
16829 // matchLen
16830 XORL R11, R11
16831
16832matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B:
16833 CMPL DI, $0x10
16834 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B
16835 MOVQ (R8)(R11*1), R10
16836 MOVQ 8(R8)(R11*1), R12
16837 XORQ (R9)(R11*1), R10
16838 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
16839 XORQ 8(R9)(R11*1), R12
16840 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B
16841 LEAL -16(DI), DI
16842 LEAL 16(R11), R11
16843 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B
16844
16845matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B:
16846#ifdef GOAMD64_v3
16847 TZCNTQ R12, R12
16848
16849#else
16850 BSFQ R12, R12
16851
16852#endif
16853 SARQ $0x03, R12
16854 LEAL 8(R11)(R12*1), R11
16855 JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
16856
16857matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B:
16858 CMPL DI, $0x08
16859 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
16860 MOVQ (R8)(R11*1), R10
16861 XORQ (R9)(R11*1), R10
16862 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
16863 LEAL -8(DI), DI
16864 LEAL 8(R11), R11
16865 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
16866
16867matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B:
16868#ifdef GOAMD64_v3
16869 TZCNTQ R10, R10
16870
16871#else
16872 BSFQ R10, R10
16873
16874#endif
16875 SARQ $0x03, R10
16876 LEAL (R11)(R10*1), R11
16877 JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
16878
16879matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
16880 CMPL DI, $0x04
16881 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
16882 MOVL (R8)(R11*1), R10
16883 CMPL (R9)(R11*1), R10
16884 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
16885 LEAL -4(DI), DI
16886 LEAL 4(R11), R11
16887
16888matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
16889 CMPL DI, $0x01
16890 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
16891 JB match_nolit_end_encodeSnappyBetterBlockAsm10B
16892 MOVW (R8)(R11*1), R10
16893 CMPW (R9)(R11*1), R10
16894 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
16895 LEAL 2(R11), R11
16896 SUBL $0x02, DI
16897 JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
16898
16899matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
16900 MOVB (R8)(R11*1), R10
16901 CMPB (R9)(R11*1), R10
16902 JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
16903 LEAL 1(R11), R11
16904
16905match_nolit_end_encodeSnappyBetterBlockAsm10B:
16906 MOVL CX, DI
16907 SUBL BX, DI
16908
16909 // Check if repeat
16910 MOVL DI, 16(SP)
16911 MOVL 12(SP), BX
16912 CMPL BX, SI
16913 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
16914 MOVL SI, R8
16915 MOVL SI, 12(SP)
16916 LEAQ (DX)(BX*1), R9
16917 SUBL BX, R8
16918 LEAL -1(R8), BX
16919 CMPL BX, $0x3c
16920 JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B
16921 CMPL BX, $0x00000100
16922 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
16923 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B
16924
16925three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
16926 MOVB $0xf4, (AX)
16927 MOVW BX, 1(AX)
16928 ADDQ $0x03, AX
16929 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
16930
16931two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
16932 MOVB $0xf0, (AX)
16933 MOVB BL, 1(AX)
16934 ADDQ $0x02, AX
16935 CMPL BX, $0x40
16936 JB memmove_match_emit_encodeSnappyBetterBlockAsm10B
16937 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
16938
16939one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
16940 SHLB $0x02, BL
16941 MOVB BL, (AX)
16942 ADDQ $0x01, AX
16943
16944memmove_match_emit_encodeSnappyBetterBlockAsm10B:
16945 LEAQ (AX)(R8*1), BX
16946
16947 // genMemMoveShort
16948 CMPQ R8, $0x08
16949 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
16950 CMPQ R8, $0x10
16951 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
16952 CMPQ R8, $0x20
16953 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
16954 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
16955
16956emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
16957 MOVQ (R9), R10
16958 MOVQ R10, (AX)
16959 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
16960
16961emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
16962 MOVQ (R9), R10
16963 MOVQ -8(R9)(R8*1), R9
16964 MOVQ R10, (AX)
16965 MOVQ R9, -8(AX)(R8*1)
16966 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
16967
16968emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
16969 MOVOU (R9), X0
16970 MOVOU -16(R9)(R8*1), X1
16971 MOVOU X0, (AX)
16972 MOVOU X1, -16(AX)(R8*1)
16973 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
16974
16975emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
16976 MOVOU (R9), X0
16977 MOVOU 16(R9), X1
16978 MOVOU -32(R9)(R8*1), X2
16979 MOVOU -16(R9)(R8*1), X3
16980 MOVOU X0, (AX)
16981 MOVOU X1, 16(AX)
16982 MOVOU X2, -32(AX)(R8*1)
16983 MOVOU X3, -16(AX)(R8*1)
16984
16985memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
16986 MOVQ BX, AX
16987 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
16988
16989memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
16990 LEAQ (AX)(R8*1), BX
16991
16992 // genMemMoveLong
16993 MOVOU (R9), X0
16994 MOVOU 16(R9), X1
16995 MOVOU -32(R9)(R8*1), X2
16996 MOVOU -16(R9)(R8*1), X3
16997 MOVQ R8, R12
16998 SHRQ $0x05, R12
16999 MOVQ AX, R10
17000 ANDL $0x0000001f, R10
17001 MOVQ $0x00000040, R13
17002 SUBQ R10, R13
17003 DECQ R12
17004 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17005 LEAQ -32(R9)(R13*1), R10
17006 LEAQ -32(AX)(R13*1), R14
17007
17008emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
17009 MOVOU (R10), X4
17010 MOVOU 16(R10), X5
17011 MOVOA X4, (R14)
17012 MOVOA X5, 16(R14)
17013 ADDQ $0x20, R14
17014 ADDQ $0x20, R10
17015 ADDQ $0x20, R13
17016 DECQ R12
17017 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
17018
17019emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
17020 MOVOU -32(R9)(R13*1), X4
17021 MOVOU -16(R9)(R13*1), X5
17022 MOVOA X4, -32(AX)(R13*1)
17023 MOVOA X5, -16(AX)(R13*1)
17024 ADDQ $0x20, R13
17025 CMPQ R8, R13
17026 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17027 MOVOU X0, (AX)
17028 MOVOU X1, 16(AX)
17029 MOVOU X2, -32(AX)(R8*1)
17030 MOVOU X3, -16(AX)(R8*1)
17031 MOVQ BX, AX
17032
17033emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
17034 ADDL R11, CX
17035 ADDL $0x04, R11
17036 MOVL CX, 12(SP)
17037
17038 // emitCopy
17039two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
17040 CMPL R11, $0x40
17041 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
17042 MOVB $0xee, (AX)
17043 MOVW DI, 1(AX)
17044 LEAL -60(R11), R11
17045 ADDQ $0x03, AX
17046 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
17047
17048two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
17049 MOVL R11, BX
17050 SHLL $0x02, BX
17051 CMPL R11, $0x0c
17052 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
17053 CMPL DI, $0x00000800
17054 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
17055 LEAL -15(BX), BX
17056 MOVB DI, 1(AX)
17057 SHRL $0x08, DI
17058 SHLL $0x05, DI
17059 ORL DI, BX
17060 MOVB BL, (AX)
17061 ADDQ $0x02, AX
17062 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
17063
17064emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
17065 LEAL -2(BX), BX
17066 MOVB BL, (AX)
17067 MOVW DI, 1(AX)
17068 ADDQ $0x03, AX
17069
17070match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
17071 CMPL CX, 8(SP)
17072 JAE emit_remainder_encodeSnappyBetterBlockAsm10B
17073 CMPQ AX, (SP)
17074 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
17075 MOVQ $0x00000000, ret+48(FP)
17076 RET
17077
17078match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
17079 MOVQ $0x0000cf1bbcdcbf9b, BX
17080 MOVQ $0x9e3779b1, DI
17081 LEAQ 1(SI), SI
17082 LEAQ -2(CX), R8
17083 MOVQ (DX)(SI*1), R9
17084 MOVQ 1(DX)(SI*1), R10
17085 MOVQ (DX)(R8*1), R11
17086 MOVQ 1(DX)(R8*1), R12
17087 SHLQ $0x10, R9
17088 IMULQ BX, R9
17089 SHRQ $0x34, R9
17090 SHLQ $0x20, R10
17091 IMULQ DI, R10
17092 SHRQ $0x36, R10
17093 SHLQ $0x10, R11
17094 IMULQ BX, R11
17095 SHRQ $0x34, R11
17096 SHLQ $0x20, R12
17097 IMULQ DI, R12
17098 SHRQ $0x36, R12
17099 LEAQ 1(SI), DI
17100 LEAQ 1(R8), R13
17101 MOVL SI, 24(SP)(R9*4)
17102 MOVL R8, 24(SP)(R11*4)
17103 MOVL DI, 16408(SP)(R10*4)
17104 MOVL R13, 16408(SP)(R12*4)
17105 LEAQ 1(R8)(SI*1), DI
17106 SHRQ $0x01, DI
17107 ADDQ $0x01, SI
17108 SUBQ $0x01, R8
17109
17110index_loop_encodeSnappyBetterBlockAsm10B:
17111 CMPQ DI, R8
17112 JAE search_loop_encodeSnappyBetterBlockAsm10B
17113 MOVQ (DX)(SI*1), R9
17114 MOVQ (DX)(DI*1), R10
17115 SHLQ $0x10, R9
17116 IMULQ BX, R9
17117 SHRQ $0x34, R9
17118 SHLQ $0x10, R10
17119 IMULQ BX, R10
17120 SHRQ $0x34, R10
17121 MOVL SI, 24(SP)(R9*4)
17122 MOVL DI, 24(SP)(R10*4)
17123 ADDQ $0x02, SI
17124 ADDQ $0x02, DI
17125 JMP index_loop_encodeSnappyBetterBlockAsm10B
17126
17127emit_remainder_encodeSnappyBetterBlockAsm10B:
17128 MOVQ src_len+32(FP), CX
17129 SUBL 12(SP), CX
17130 LEAQ 3(AX)(CX*1), CX
17131 CMPQ CX, (SP)
17132 JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B
17133 MOVQ $0x00000000, ret+48(FP)
17134 RET
17135
17136emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
17137 MOVQ src_len+32(FP), CX
17138 MOVL 12(SP), BX
17139 CMPL BX, CX
17140 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
17141 MOVL CX, SI
17142 MOVL CX, 12(SP)
17143 LEAQ (DX)(BX*1), CX
17144 SUBL BX, SI
17145 LEAL -1(SI), DX
17146 CMPL DX, $0x3c
17147 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
17148 CMPL DX, $0x00000100
17149 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
17150 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
17151
17152three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
17153 MOVB $0xf4, (AX)
17154 MOVW DX, 1(AX)
17155 ADDQ $0x03, AX
17156 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
17157
17158two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
17159 MOVB $0xf0, (AX)
17160 MOVB DL, 1(AX)
17161 ADDQ $0x02, AX
17162 CMPL DX, $0x40
17163 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
17164 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
17165
17166one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
17167 SHLB $0x02, DL
17168 MOVB DL, (AX)
17169 ADDQ $0x01, AX
17170
17171memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
17172 LEAQ (AX)(SI*1), DX
17173 MOVL SI, BX
17174
17175 // genMemMoveShort
17176 CMPQ BX, $0x03
17177 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
17178 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
17179 CMPQ BX, $0x08
17180 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
17181 CMPQ BX, $0x10
17182 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
17183 CMPQ BX, $0x20
17184 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
17185 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
17186
17187emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
17188 MOVB (CX), SI
17189 MOVB -1(CX)(BX*1), CL
17190 MOVB SI, (AX)
17191 MOVB CL, -1(AX)(BX*1)
17192 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17193
17194emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
17195 MOVW (CX), SI
17196 MOVB 2(CX), CL
17197 MOVW SI, (AX)
17198 MOVB CL, 2(AX)
17199 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17200
17201emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
17202 MOVL (CX), SI
17203 MOVL -4(CX)(BX*1), CX
17204 MOVL SI, (AX)
17205 MOVL CX, -4(AX)(BX*1)
17206 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17207
17208emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
17209 MOVQ (CX), SI
17210 MOVQ -8(CX)(BX*1), CX
17211 MOVQ SI, (AX)
17212 MOVQ CX, -8(AX)(BX*1)
17213 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17214
17215emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
17216 MOVOU (CX), X0
17217 MOVOU -16(CX)(BX*1), X1
17218 MOVOU X0, (AX)
17219 MOVOU X1, -16(AX)(BX*1)
17220 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17221
17222emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
17223 MOVOU (CX), X0
17224 MOVOU 16(CX), X1
17225 MOVOU -32(CX)(BX*1), X2
17226 MOVOU -16(CX)(BX*1), X3
17227 MOVOU X0, (AX)
17228 MOVOU X1, 16(AX)
17229 MOVOU X2, -32(AX)(BX*1)
17230 MOVOU X3, -16(AX)(BX*1)
17231
17232memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
17233 MOVQ DX, AX
17234 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
17235
17236memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
17237 LEAQ (AX)(SI*1), DX
17238 MOVL SI, BX
17239
17240 // genMemMoveLong
17241 MOVOU (CX), X0
17242 MOVOU 16(CX), X1
17243 MOVOU -32(CX)(BX*1), X2
17244 MOVOU -16(CX)(BX*1), X3
17245 MOVQ BX, DI
17246 SHRQ $0x05, DI
17247 MOVQ AX, SI
17248 ANDL $0x0000001f, SI
17249 MOVQ $0x00000040, R8
17250 SUBQ SI, R8
17251 DECQ DI
17252 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17253 LEAQ -32(CX)(R8*1), SI
17254 LEAQ -32(AX)(R8*1), R9
17255
17256emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
17257 MOVOU (SI), X4
17258 MOVOU 16(SI), X5
17259 MOVOA X4, (R9)
17260 MOVOA X5, 16(R9)
17261 ADDQ $0x20, R9
17262 ADDQ $0x20, SI
17263 ADDQ $0x20, R8
17264 DECQ DI
17265 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
17266
17267emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
17268 MOVOU -32(CX)(R8*1), X4
17269 MOVOU -16(CX)(R8*1), X5
17270 MOVOA X4, -32(AX)(R8*1)
17271 MOVOA X5, -16(AX)(R8*1)
17272 ADDQ $0x20, R8
17273 CMPQ BX, R8
17274 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17275 MOVOU X0, (AX)
17276 MOVOU X1, 16(AX)
17277 MOVOU X2, -32(AX)(BX*1)
17278 MOVOU X3, -16(AX)(BX*1)
17279 MOVQ DX, AX
17280
17281emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
17282 MOVQ dst_base+0(FP), CX
17283 SUBQ CX, AX
17284 MOVQ AX, ret+48(FP)
17285 RET
17286
17287// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int
17288// Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and returns
// the number of bytes written (final write pointer minus dst_base). It returns
// 0 as soon as the output would reach the write limit
// dst_base + len(src) - 9 - (len(src) >> 5) computed during setup.
//
// Register / frame usage, as established by the code below:
//   AX        current write pointer into dst
//   DX        src_base (reused as scratch once the remainder is being emitted)
//   CX        current source index while searching
//   (SP)      dst write limit (pointer)
//   8(SP)     len(src) - 8, upper bound for search positions
//   12(SP)    source index of the first byte that has not been emitted yet
//   16(SP)    offset of the most recent match (written, never read in this function)
//   20(SP)    source index to continue from when no candidate matches
//   24(SP)    1024 x uint32 table indexed by a 10-bit hash of 6 source bytes
//   4120(SP)  256 x uint32 table indexed by an 8-bit hash of 4 source bytes
//
// NOTE(review): the block moves below read and write in fixed 8/16/32-byte
// units, so the caller presumably guarantees slack in dst and a minimum
// len(src) -- confirm at the Go call site.
17289TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56
17290 MOVQ dst_base+0(FP), AX  // AX = dst write pointer
17291 MOVQ $0x00000028, CX  // 40 iterations x 128 bytes = 5120 bytes, i.e. both tables
17292 LEAQ 24(SP), DX  // DX = start of the table area
17293 PXOR X0, X0  // X0 = 0, used as the fill value
17294
17295zero_loop_encodeSnappyBetterBlockAsm8B:  // clear 128 bytes of table per iteration
17296 MOVOU X0, (DX)
17297 MOVOU X0, 16(DX)
17298 MOVOU X0, 32(DX)
17299 MOVOU X0, 48(DX)
17300 MOVOU X0, 64(DX)
17301 MOVOU X0, 80(DX)
17302 MOVOU X0, 96(DX)
17303 MOVOU X0, 112(DX)
17304 ADDQ $0x80, DX
17305 DECQ CX
17306 JNZ zero_loop_encodeSnappyBetterBlockAsm8B
17307 MOVL $0x00000000, 12(SP)  // nothing emitted yet: pending literals start at index 0
17308 MOVQ src_len+32(FP), CX
17309 LEAQ -9(CX), DX
17310 LEAQ -8(CX), BX
17311 MOVL BX, 8(SP)  // 8(SP) = len(src) - 8
17312 SHRQ $0x05, CX
17313 SUBL CX, DX  // DX = len(src) - 9 - (len(src) >> 5)
17314 LEAQ (AX)(DX*1), DX
17315 MOVQ DX, (SP)  // (SP) = dst write limit
17316 MOVL $0x00000001, CX  // searching starts at source index 1
17317 MOVL $0x00000000, 16(SP)
17318 MOVQ src_base+24(FP), DX  // DX = src_base from here on
17319
17320search_loop_encodeSnappyBetterBlockAsm8B:  // look up candidates for the bytes at CX
17321 MOVL CX, BX
17322 SUBL 12(SP), BX
17323 SHRL $0x04, BX
17324 LEAL 1(CX)(BX*1), BX  // BX = CX + 1 + (CX - 12(SP)) / 16: step grows with the pending literal run
17325 CMPL BX, 8(SP)
17326 JAE emit_remainder_encodeSnappyBetterBlockAsm8B
17327 MOVQ (DX)(CX*1), SI  // SI = 8 source bytes at CX
17328 MOVL BX, 20(SP)  // remember the next position to try
17329 MOVQ $0x0000cf1bbcdcbf9b, R8  // multiplier for the 6-byte hash
17330 MOVQ $0x9e3779b1, BX  // multiplier for the 4-byte hash
17331 MOVQ SI, R9
17332 MOVQ SI, R10
17333 SHLQ $0x10, R9  // drop the top 2 bytes: hash covers the low 6 bytes
17334 IMULQ R8, R9
17335 SHRQ $0x36, R9  // keep the top 10 bits
17336 SHLQ $0x20, R10  // drop the top 4 bytes: hash covers the low 4 bytes
17337 IMULQ BX, R10
17338 SHRQ $0x38, R10  // keep the top 8 bits
17339 MOVL 24(SP)(R9*4), BX  // BX = candidate from the 6-byte table
17340 MOVL 4120(SP)(R10*4), DI  // DI = candidate from the 4-byte table
17341 MOVL CX, 24(SP)(R9*4)  // record the current position in both tables
17342 MOVL CX, 4120(SP)(R10*4)
17343 MOVQ (DX)(BX*1), R9  // 8 bytes at each candidate
17344 MOVQ (DX)(DI*1), R10
17345 CMPQ R9, SI  // full 8-byte match on the 6-byte-table candidate
17346 JEQ candidate_match_encodeSnappyBetterBlockAsm8B
17347 CMPQ R10, SI  // full 8-byte match on the 4-byte-table candidate
17348 JNE no_short_found_encodeSnappyBetterBlockAsm8B
17349 MOVL DI, BX
17350 JMP candidate_match_encodeSnappyBetterBlockAsm8B
17351
17352no_short_found_encodeSnappyBetterBlockAsm8B:  // no 8-byte match: fall back to 4-byte comparisons
17353 CMPL R9, SI
17354 JEQ candidate_match_encodeSnappyBetterBlockAsm8B
17355 CMPL R10, SI
17356 JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
17357 MOVL 20(SP), CX  // no match at all: advance to the saved next position
17358 JMP search_loop_encodeSnappyBetterBlockAsm8B
17359
17360candidateS_match_encodeSnappyBetterBlockAsm8B:  // 4-byte-table hit: first try the 6-byte table at CX+1
17361 SHRQ $0x08, SI  // SI now holds the bytes starting at CX+1
17362 MOVQ SI, R9
17363 SHLQ $0x10, R9
17364 IMULQ R8, R9
17365 SHRQ $0x36, R9
17366 MOVL 24(SP)(R9*4), BX
17367 INCL CX
17368 MOVL CX, 24(SP)(R9*4)
17369 CMPL (DX)(BX*1), SI
17370 JEQ candidate_match_encodeSnappyBetterBlockAsm8B
17371 DECL CX  // CX+1 did not match: go back to CX and use the 4-byte-table candidate
17372 MOVL DI, BX
17373
17374candidate_match_encodeSnappyBetterBlockAsm8B:  // BX = candidate index, CX = current index
17375 MOVL 12(SP), SI
17376 TESTL BX, BX
17377 JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B  // candidate at index 0 cannot be extended backwards
17378
17379match_extend_back_loop_encodeSnappyBetterBlockAsm8B:  // grow the match backwards while the preceding bytes agree
17380 CMPL CX, SI
17381 JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B  // do not move before the pending-literal start
17382 MOVB -1(DX)(BX*1), DI
17383 MOVB -1(DX)(CX*1), R8
17384 CMPB DI, R8
17385 JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
17386 LEAL -1(CX), CX
17387 DECL BX
17388 JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
17389 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
17390
17391match_extend_back_end_encodeSnappyBetterBlockAsm8B:  // bail out if the pending literals would not fit
17392 MOVL CX, SI
17393 SUBL 12(SP), SI  // SI = number of pending literal bytes
17394 LEAQ 3(AX)(SI*1), SI  // write pointer after literals plus 3 bytes
17395 CMPQ SI, (SP)
17396 JB match_dst_size_check_encodeSnappyBetterBlockAsm8B
17397 MOVQ $0x00000000, ret+48(FP)  // would reach the dst limit: return 0
17398 RET
17399
17400match_dst_size_check_encodeSnappyBetterBlockAsm8B:
17401 MOVL CX, SI  // SI = start of the match in src
17402 ADDL $0x04, CX  // skip 4 bytes on both sides before measuring further
17403 ADDL $0x04, BX
17404 MOVQ src_len+32(FP), DI
17405 SUBL CX, DI  // DI = source bytes remaining after CX
17406 LEAQ (DX)(CX*1), R8  // R8 = &src[CX]
17407 LEAQ (DX)(BX*1), R9  // R9 = &src[BX]
17408
17409 // matchLen
17410 XORL R11, R11  // R11 = number of additional matching bytes
17411
17412matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B:  // compare 16 bytes per iteration
17413 CMPL DI, $0x10
17414 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B
17415 MOVQ (R8)(R11*1), R10
17416 MOVQ 8(R8)(R11*1), R12
17417 XORQ (R9)(R11*1), R10
17418 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
17419 XORQ 8(R9)(R11*1), R12
17420 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B
17421 LEAL -16(DI), DI
17422 LEAL 16(R11), R11
17423 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B
17424
17425matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B:  // mismatch in the second 8 bytes
17426#ifdef GOAMD64_v3
17427 TZCNTQ R12, R12
17428
17429#else
17430 BSFQ R12, R12
17431
17432#endif
17433 SARQ $0x03, R12  // index of lowest differing bit / 8 = equal leading bytes
17434 LEAL 8(R11)(R12*1), R11
17435 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
17436
17437matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B:  // fewer than 16 bytes left: try 8
17438 CMPL DI, $0x08
17439 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
17440 MOVQ (R8)(R11*1), R10
17441 XORQ (R9)(R11*1), R10
17442 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
17443 LEAL -8(DI), DI
17444 LEAL 8(R11), R11
17445 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
17446
17447matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B:  // mismatch within the 8 bytes held in R10
17448#ifdef GOAMD64_v3
17449 TZCNTQ R10, R10
17450
17451#else
17452 BSFQ R10, R10
17453
17454#endif
17455 SARQ $0x03, R10  // bit index / 8 = equal leading bytes
17456 LEAL (R11)(R10*1), R11
17457 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
17458
17459matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:  // try 4 bytes
17460 CMPL DI, $0x04
17461 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
17462 MOVL (R8)(R11*1), R10
17463 CMPL (R9)(R11*1), R10
17464 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
17465 LEAL -4(DI), DI
17466 LEAL 4(R11), R11
17467
17468matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:  // try 2 bytes, then 1
17469 CMPL DI, $0x01
17470 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B  // exactly 1 byte left
17471 JB match_nolit_end_encodeSnappyBetterBlockAsm8B  // 0 bytes left
17472 MOVW (R8)(R11*1), R10
17473 CMPW (R9)(R11*1), R10
17474 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
17475 LEAL 2(R11), R11
17476 SUBL $0x02, DI
17477 JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
17478
17479matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
17480 MOVB (R8)(R11*1), R10
17481 CMPB (R9)(R11*1), R10
17482 JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
17483 LEAL 1(R11), R11
17484
17485match_nolit_end_encodeSnappyBetterBlockAsm8B:
17486 MOVL CX, DI
17487 SUBL BX, DI  // DI = match offset (current index - candidate index)
17488
17489 // Check if repeat
17490 MOVL DI, 16(SP)  // offset is only stored here; this variant has no repeat branch
17491 MOVL 12(SP), BX
17492 CMPL BX, SI
17493 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B  // no pending literals before the match
17494 MOVL SI, R8
17495 MOVL SI, 12(SP)
17496 LEAQ (DX)(BX*1), R9  // R9 = source pointer of the pending literals
17497 SUBL BX, R8  // R8 = literal length
17498 LEAL -1(R8), BX  // BX = length - 1, the value stored in the tag
17499 CMPL BX, $0x3c
17500 JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B
17501 CMPL BX, $0x00000100
17502 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
17503 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B  // never taken after the JB above; execution falls through to the same label
17504
17505three_bytes_match_emit_encodeSnappyBetterBlockAsm8B:  // tag 0xf4 followed by 16-bit (length - 1)
17506 MOVB $0xf4, (AX)
17507 MOVW BX, 1(AX)
17508 ADDQ $0x03, AX
17509 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
17510
17511two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:  // tag 0xf0 followed by 8-bit (length - 1)
17512 MOVB $0xf0, (AX)
17513 MOVB BL, 1(AX)
17514 ADDQ $0x02, AX
17515 CMPL BX, $0x40
17516 JB memmove_match_emit_encodeSnappyBetterBlockAsm8B
17517 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
17518
17519one_byte_match_emit_encodeSnappyBetterBlockAsm8B:  // single tag byte: (length - 1) << 2
17520 SHLB $0x02, BL
17521 MOVB BL, (AX)
17522 ADDQ $0x01, AX
17523
17524memmove_match_emit_encodeSnappyBetterBlockAsm8B:
17525 LEAQ (AX)(R8*1), BX  // BX = write pointer after the literals
17526
17527 // genMemMoveShort
17528 CMPQ R8, $0x08
17529 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
17530 CMPQ R8, $0x10
17531 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
17532 CMPQ R8, $0x20
17533 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
17534 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
17535
17536emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:  // length <= 8: always moves a full 8 bytes
17537 MOVQ (R9), R10
17538 MOVQ R10, (AX)
17539 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17540
17541emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:  // two overlapping 8-byte moves (head and tail)
17542 MOVQ (R9), R10
17543 MOVQ -8(R9)(R8*1), R9
17544 MOVQ R10, (AX)
17545 MOVQ R9, -8(AX)(R8*1)
17546 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17547
17548emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:  // two overlapping 16-byte moves
17549 MOVOU (R9), X0
17550 MOVOU -16(R9)(R8*1), X1
17551 MOVOU X0, (AX)
17552 MOVOU X1, -16(AX)(R8*1)
17553 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17554
17555emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:  // first 32 and last 32 bytes, overlapping
17556 MOVOU (R9), X0
17557 MOVOU 16(R9), X1
17558 MOVOU -32(R9)(R8*1), X2
17559 MOVOU -16(R9)(R8*1), X3
17560 MOVOU X0, (AX)
17561 MOVOU X1, 16(AX)
17562 MOVOU X2, -32(AX)(R8*1)
17563 MOVOU X3, -16(AX)(R8*1)
17564
17565memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
17566 MOVQ BX, AX  // advance the write pointer past the literals
17567 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
17568
17569memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
17570 LEAQ (AX)(R8*1), BX  // BX = write pointer after the literals
17571
17572 // genMemMoveLong
17573 MOVOU (R9), X0  // first and last 32 source bytes are loaded now and stored last
17574 MOVOU 16(R9), X1
17575 MOVOU -32(R9)(R8*1), X2
17576 MOVOU -16(R9)(R8*1), X3
17577 MOVQ R8, R12
17578 SHRQ $0x05, R12  // R12 = length / 32
17579 MOVQ AX, R10
17580 ANDL $0x0000001f, R10  // R10 = AX mod 32
17581 MOVQ $0x00000040, R13
17582 SUBQ R10, R13  // R13 = 64 - (AX mod 32), so AX + R13 - 32 is 32-byte aligned
17583 DECQ R12
17584 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32  // ZF comes from DECQ; CF is left from the SUBQ (DECQ does not modify CF)
17585 LEAQ -32(R9)(R13*1), R10
17586 LEAQ -32(AX)(R13*1), R14
17587
17588emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:  // 32 bytes per pass, aligned stores via R14
17589 MOVOU (R10), X4
17590 MOVOU 16(R10), X5
17591 MOVOA X4, (R14)
17592 MOVOA X5, 16(R14)
17593 ADDQ $0x20, R14
17594 ADDQ $0x20, R10
17595 ADDQ $0x20, R13
17596 DECQ R12
17597 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
17598
17599emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:  // 32 bytes per pass while R8 >= R13
17600 MOVOU -32(R9)(R13*1), X4
17601 MOVOU -16(R9)(R13*1), X5
17602 MOVOA X4, -32(AX)(R13*1)
17603 MOVOA X5, -16(AX)(R13*1)
17604 ADDQ $0x20, R13
17605 CMPQ R8, R13
17606 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
17607 MOVOU X0, (AX)  // finally store the saved head and tail with unaligned moves
17608 MOVOU X1, 16(AX)
17609 MOVOU X2, -32(AX)(R8*1)
17610 MOVOU X3, -16(AX)(R8*1)
17611 MOVQ BX, AX
17612
17613emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
17614 ADDL R11, CX  // CX = source index just past the match
17615 ADDL $0x04, R11  // R11 = total match length (4 skipped bytes + extension)
17616 MOVL CX, 12(SP)  // everything up to the end of the match is now accounted for
17617
17618 // emitCopy
17619two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:  // while length > 64, emit 60-byte copies
17620 CMPL R11, $0x40
17621 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
17622 MOVB $0xee, (AX)  // tag 0xee followed by the 16-bit offset
17623 MOVW DI, 1(AX)
17624 LEAL -60(R11), R11
17625 ADDQ $0x03, AX
17626 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
17627
17628two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
17629 MOVL R11, BX
17630 SHLL $0x02, BX  // BX = length << 2
17631 CMPL R11, $0x0c
17632 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B  // length >= 12 uses the 3-byte form
17633 LEAL -15(BX), BX  // (length << 2) - 15 == ((length - 4) << 2) | 1
17634 MOVB DI, 1(AX)  // low 8 bits of the offset
17635 SHRL $0x08, DI
17636 SHLL $0x05, DI  // offset bits above 8 go into the tag from bit 5 upwards
17637 ORL DI, BX
17638 MOVB BL, (AX)
17639 ADDQ $0x02, AX
17640 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
17641
17642emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
17643 LEAL -2(BX), BX  // (length << 2) - 2 == ((length - 1) << 2) | 2
17644 MOVB BL, (AX)
17645 MOVW DI, 1(AX)  // 16-bit offset
17646 ADDQ $0x03, AX
17647
17648match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
17649 CMPL CX, 8(SP)
17650 JAE emit_remainder_encodeSnappyBetterBlockAsm8B  // past the search limit: flush the tail
17651 CMPQ AX, (SP)
17652 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
17653 MOVQ $0x00000000, ret+48(FP)  // write pointer reached the dst limit: return 0
17654 RET
17655
17656match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:  // index positions at both ends of the match into the tables
17657 MOVQ $0x0000cf1bbcdcbf9b, BX  // 6-byte hash multiplier
17658 MOVQ $0x9e3779b1, DI  // 4-byte hash multiplier
17659 LEAQ 1(SI), SI  // SI = match start + 1
17660 LEAQ -2(CX), R8  // R8 = match end - 2
17661 MOVQ (DX)(SI*1), R9
17662 MOVQ 1(DX)(SI*1), R10
17663 MOVQ (DX)(R8*1), R11
17664 MOVQ 1(DX)(R8*1), R12
17665 SHLQ $0x10, R9
17666 IMULQ BX, R9
17667 SHRQ $0x36, R9  // 10-bit hash of src[SI:]
17668 SHLQ $0x20, R10
17669 IMULQ DI, R10
17670 SHRQ $0x38, R10  // 8-bit hash of src[SI+1:]
17671 SHLQ $0x10, R11
17672 IMULQ BX, R11
17673 SHRQ $0x36, R11  // 10-bit hash of src[R8:]
17674 SHLQ $0x20, R12
17675 IMULQ DI, R12
17676 SHRQ $0x38, R12  // 8-bit hash of src[R8+1:]
17677 LEAQ 1(SI), DI
17678 LEAQ 1(R8), R13
17679 MOVL SI, 24(SP)(R9*4)
17680 MOVL R8, 24(SP)(R11*4)
17681 MOVL DI, 4120(SP)(R10*4)
17682 MOVL R13, 4120(SP)(R12*4)
17683 LEAQ 1(R8)(SI*1), DI
17684 SHRQ $0x01, DI  // DI = (SI + R8 + 1) / 2, the midpoint of the range
17685 ADDQ $0x01, SI
17686 SUBQ $0x01, R8
17687
17688index_loop_encodeSnappyBetterBlockAsm8B:  // index every second position from SI and from the midpoint DI
17689 CMPQ DI, R8
17690 JAE search_loop_encodeSnappyBetterBlockAsm8B
17691 MOVQ (DX)(SI*1), R9
17692 MOVQ (DX)(DI*1), R10
17693 SHLQ $0x10, R9
17694 IMULQ BX, R9
17695 SHRQ $0x36, R9
17696 SHLQ $0x10, R10
17697 IMULQ BX, R10
17698 SHRQ $0x36, R10
17699 MOVL SI, 24(SP)(R9*4)
17700 MOVL DI, 24(SP)(R10*4)
17701 ADDQ $0x02, SI
17702 ADDQ $0x02, DI
17703 JMP index_loop_encodeSnappyBetterBlockAsm8B
17704
17705emit_remainder_encodeSnappyBetterBlockAsm8B:  // emit all remaining source bytes as one literal
17706 MOVQ src_len+32(FP), CX
17707 SUBL 12(SP), CX  // CX = number of bytes still to emit
17708 LEAQ 3(AX)(CX*1), CX
17709 CMPQ CX, (SP)
17710 JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B
17711 MOVQ $0x00000000, ret+48(FP)  // remainder would reach the dst limit: return 0
17712 RET
17713
17714emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
17715 MOVQ src_len+32(FP), CX
17716 MOVL 12(SP), BX
17717 CMPL BX, CX
17718 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B  // nothing left to emit
17719 MOVL CX, SI
17720 MOVL CX, 12(SP)
17721 LEAQ (DX)(BX*1), CX  // CX = source pointer of the remaining bytes
17722 SUBL BX, SI  // SI = literal length
17723 LEAL -1(SI), DX  // DX = length - 1 (src_base is no longer needed)
17724 CMPL DX, $0x3c
17725 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
17726 CMPL DX, $0x00000100
17727 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
17728 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B  // never taken after the JB above; execution falls through to the same label
17729
17730three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:  // tag 0xf4 followed by 16-bit (length - 1)
17731 MOVB $0xf4, (AX)
17732 MOVW DX, 1(AX)
17733 ADDQ $0x03, AX
17734 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
17735
17736two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:  // tag 0xf0 followed by 8-bit (length - 1)
17737 MOVB $0xf0, (AX)
17738 MOVB DL, 1(AX)
17739 ADDQ $0x02, AX
17740 CMPL DX, $0x40
17741 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
17742 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
17743
17744one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:  // single tag byte: (length - 1) << 2
17745 SHLB $0x02, DL
17746 MOVB DL, (AX)
17747 ADDQ $0x01, AX
17748
17749memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
17750 LEAQ (AX)(SI*1), DX  // DX = write pointer after the literals
17751 MOVL SI, BX  // BX = literal length
17752
17753 // genMemMoveShort
17754 CMPQ BX, $0x03
17755 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
17756 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
17757 CMPQ BX, $0x08
17758 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
17759 CMPQ BX, $0x10
17760 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
17761 CMPQ BX, $0x20
17762 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
17763 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
17764
17765emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:  // first and last byte (same byte when length is 1)
17766 MOVB (CX), SI
17767 MOVB -1(CX)(BX*1), CL
17768 MOVB SI, (AX)
17769 MOVB CL, -1(AX)(BX*1)
17770 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17771
17772emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:  // 2 bytes + 1 byte
17773 MOVW (CX), SI
17774 MOVB 2(CX), CL
17775 MOVW SI, (AX)
17776 MOVB CL, 2(AX)
17777 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17778
17779emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:  // two overlapping 4-byte moves
17780 MOVL (CX), SI
17781 MOVL -4(CX)(BX*1), CX
17782 MOVL SI, (AX)
17783 MOVL CX, -4(AX)(BX*1)
17784 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17785
17786emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:  // two overlapping 8-byte moves
17787 MOVQ (CX), SI
17788 MOVQ -8(CX)(BX*1), CX
17789 MOVQ SI, (AX)
17790 MOVQ CX, -8(AX)(BX*1)
17791 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17792
17793emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:  // two overlapping 16-byte moves
17794 MOVOU (CX), X0
17795 MOVOU -16(CX)(BX*1), X1
17796 MOVOU X0, (AX)
17797 MOVOU X1, -16(AX)(BX*1)
17798 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17799
17800emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:  // first 32 and last 32 bytes, overlapping
17801 MOVOU (CX), X0
17802 MOVOU 16(CX), X1
17803 MOVOU -32(CX)(BX*1), X2
17804 MOVOU -16(CX)(BX*1), X3
17805 MOVOU X0, (AX)
17806 MOVOU X1, 16(AX)
17807 MOVOU X2, -32(AX)(BX*1)
17808 MOVOU X3, -16(AX)(BX*1)
17809
17810memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
17811 MOVQ DX, AX  // advance the write pointer past the literals
17812 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
17813
17814memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
17815 LEAQ (AX)(SI*1), DX  // DX = write pointer after the literals
17816 MOVL SI, BX  // BX = literal length
17817
17818 // genMemMoveLong
17819 MOVOU (CX), X0  // first and last 32 source bytes are loaded now and stored last
17820 MOVOU 16(CX), X1
17821 MOVOU -32(CX)(BX*1), X2
17822 MOVOU -16(CX)(BX*1), X3
17823 MOVQ BX, DI
17824 SHRQ $0x05, DI  // DI = length / 32
17825 MOVQ AX, SI
17826 ANDL $0x0000001f, SI  // SI = AX mod 32
17827 MOVQ $0x00000040, R8
17828 SUBQ SI, R8  // R8 = 64 - (AX mod 32), so AX + R8 - 32 is 32-byte aligned
17829 DECQ DI
17830 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32  // ZF comes from DECQ; CF is left from the SUBQ (DECQ does not modify CF)
17831 LEAQ -32(CX)(R8*1), SI
17832 LEAQ -32(AX)(R8*1), R9
17833
17834emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:  // 32 bytes per pass, aligned stores via R9
17835 MOVOU (SI), X4
17836 MOVOU 16(SI), X5
17837 MOVOA X4, (R9)
17838 MOVOA X5, 16(R9)
17839 ADDQ $0x20, R9
17840 ADDQ $0x20, SI
17841 ADDQ $0x20, R8
17842 DECQ DI
17843 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
17844
17845emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:  // 32 bytes per pass while BX >= R8
17846 MOVOU -32(CX)(R8*1), X4
17847 MOVOU -16(CX)(R8*1), X5
17848 MOVOA X4, -32(AX)(R8*1)
17849 MOVOA X5, -16(AX)(R8*1)
17850 ADDQ $0x20, R8
17851 CMPQ BX, R8
17852 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
17853 MOVOU X0, (AX)  // finally store the saved head and tail with unaligned moves
17854 MOVOU X1, 16(AX)
17855 MOVOU X2, -32(AX)(BX*1)
17856 MOVOU X3, -16(AX)(BX*1)
17857 MOVQ DX, AX
17858
17859emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:  // return the number of bytes written
17860 MOVQ dst_base+0(FP), CX
17861 SUBQ CX, AX  // AX = write pointer - dst_base
17862 MOVQ AX, ret+48(FP)
17863 RET
17864
17865// func calcBlockSize(src []byte) int
17866// Requires: BMI, SSE2
//
// calcBlockSize scans src with a hash-table match search and adds up, in AX,
// the number of bytes that each literal run and each copy would occupy.
// Only the counter is updated; no destination buffer is read or written.
// The total is stored to ret+24(FP). Zero is stored instead as soon as the
// projected total reaches the limit kept at 0(SP).
//
// Stack layout, as used by the code below:
//    0(SP)  8 bytes  size limit = len(src) - 9 - (len(src) >> 5)
//    8(SP)  4 bytes  len(src) - 8, upper bound for positions the search probes
//   12(SP)  4 bytes  start of the literal run not yet counted (initially 0)
//   16(SP)  4 bytes  offset of the most recent match (initially 1)
//   20(SP)  4 bytes  position to continue from when no candidate matches
//   24(SP)  32768 bytes  table of 8192 32-bit source positions
//
// Register roles in the main loop: AX = size so far, CX = current position
// in src, DX = src base pointer.
//
// NOTE(review): the limit arithmetic has no guard for very short inputs in
// this function; presumably callers only pass blocks above a minimum
// length. Confirm against the Go call site.
17867TEXT ·calcBlockSize(SB), $32792-32
17868 XORQ AX, AX
17869 MOVQ $0x00000100, CX
17870 LEAQ 24(SP), DX
17871 PXOR X0, X0
17872
// Zero the table at 24(SP): 0x100 iterations, 128 bytes per iteration.
17873zero_loop_calcBlockSize:
17874 MOVOU X0, (DX)
17875 MOVOU X0, 16(DX)
17876 MOVOU X0, 32(DX)
17877 MOVOU X0, 48(DX)
17878 MOVOU X0, 64(DX)
17879 MOVOU X0, 80(DX)
17880 MOVOU X0, 96(DX)
17881 MOVOU X0, 112(DX)
17882 ADDQ $0x80, DX
17883 DECQ CX
17884 JNZ zero_loop_calcBlockSize
// Initialise the scalar locals described in the header. AX is still zero
// here, so the LEAQ (AX)(DX*1) below leaves the limit value unchanged.
17885 MOVL $0x00000000, 12(SP)
17886 MOVQ src_len+8(FP), CX
17887 LEAQ -9(CX), DX
17888 LEAQ -8(CX), BX
17889 MOVL BX, 8(SP)
17890 SHRQ $0x05, CX
17891 SUBL CX, DX
17892 LEAQ (AX)(DX*1), DX
17893 MOVQ DX, (SP)
17894 MOVL $0x00000001, CX
17895 MOVL CX, 16(SP)
17896 MOVQ src_base+0(FP), DX
17897
// Main search loop. The fallback position CX + 4 + ((CX - 12(SP)) >> 5) is
// computed first and saved in 20(SP); once it reaches 8(SP) the search ends
// and the remaining bytes are counted as one literal run.
17898search_loop_calcBlockSize:
17899 MOVL CX, BX
17900 SUBL 12(SP), BX
17901 SHRL $0x05, BX
17902 LEAL 4(CX)(BX*1), BX
17903 CMPL BX, 8(SP)
17904 JAE emit_remainder_calcBlockSize
17905 MOVQ (DX)(CX*1), SI
17906 MOVL BX, 20(SP)
// Hash of a value v is ((v << 16) * 0x0000cf1bbcdcbf9b) >> 51, a 13-bit
// table index. R9 and R10 hash the 8 bytes at CX and the same value shifted
// right by one byte (position CX+1).
17907 MOVQ $0x0000cf1bbcdcbf9b, R8
17908 MOVQ SI, R9
17909 MOVQ SI, R10
17910 SHRQ $0x08, R10
17911 SHLQ $0x10, R9
17912 IMULQ R8, R9
17913 SHRQ $0x33, R9
17914 SHLQ $0x10, R10
17915 IMULQ R8, R10
17916 SHRQ $0x33, R10
// Fetch the previous table entries (BX for CX, DI for CX+1), then record
// CX and CX+1 in their slots.
17917 MOVL 24(SP)(R9*4), BX
17918 MOVL 24(SP)(R10*4), DI
17919 MOVL CX, 24(SP)(R9*4)
17920 LEAL 1(CX), R9
17921 MOVL R9, 24(SP)(R10*4)
// R9 becomes the hash for position CX+2 (value shifted right by two bytes).
17922 MOVQ SI, R9
17923 SHRQ $0x10, R9
17924 SHLQ $0x10, R9
17925 IMULQ R8, R9
17926 SHRQ $0x33, R9
// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located
// 16(SP) bytes earlier.
17927 MOVL CX, R8
17928 SUBL 16(SP), R8
17929 MOVL 1(DX)(R8*1), R10
17930 MOVQ SI, R8
17931 SHRQ $0x08, R8
17932 CMPL R8, R10
17933 JNE no_repeat_found_calcBlockSize
17934 LEAL 1(CX), SI
17935 MOVL 12(SP), BX
17936 MOVL SI, DI
17937 SUBL 16(SP), DI
17938 JZ repeat_extend_back_end_calcBlockSize
17939
// Move the match start (SI) and its source (DI) backwards while the
// preceding bytes are equal, SI is still above 12(SP) and DI is non-zero.
17940repeat_extend_back_loop_calcBlockSize:
17941 CMPL SI, BX
17942 JBE repeat_extend_back_end_calcBlockSize
17943 MOVB -1(DX)(DI*1), R8
17944 MOVB -1(DX)(SI*1), R9
17945 CMPB R8, R9
17946 JNE repeat_extend_back_end_calcBlockSize
17947 LEAL -1(SI), SI
17948 DECL DI
17949 JNZ repeat_extend_back_loop_calcBlockSize
17950
// Count the literal run [12(SP), SI) unless it is empty. With n the run
// length, the header costs 1 byte when n-1 < 60, 2 when n-1 < 256, 3 when
// n-1 < 65536, 4 when n-1 < 16777216 and 5 otherwise; n itself is then added.
17951repeat_extend_back_end_calcBlockSize:
17952 MOVL 12(SP), BX
17953 CMPL BX, SI
17954 JEQ emit_literal_done_repeat_emit_calcBlockSize
17955 MOVL SI, DI
17956 MOVL SI, 12(SP)
17957 LEAQ (DX)(BX*1), R8
17958 SUBL BX, DI
17959 LEAL -1(DI), BX
17960 CMPL BX, $0x3c
17961 JB one_byte_repeat_emit_calcBlockSize
17962 CMPL BX, $0x00000100
17963 JB two_bytes_repeat_emit_calcBlockSize
17964 CMPL BX, $0x00010000
17965 JB three_bytes_repeat_emit_calcBlockSize
17966 CMPL BX, $0x01000000
17967 JB four_bytes_repeat_emit_calcBlockSize
17968 ADDQ $0x05, AX
17969 JMP memmove_long_repeat_emit_calcBlockSize
17970
17971four_bytes_repeat_emit_calcBlockSize:
17972 ADDQ $0x04, AX
17973 JMP memmove_long_repeat_emit_calcBlockSize
17974
17975three_bytes_repeat_emit_calcBlockSize:
17976 ADDQ $0x03, AX
17977 JMP memmove_long_repeat_emit_calcBlockSize
17978
17979two_bytes_repeat_emit_calcBlockSize:
17980 ADDQ $0x02, AX
17981 CMPL BX, $0x40
17982 JB memmove_repeat_emit_calcBlockSize
17983 JMP memmove_long_repeat_emit_calcBlockSize
17984
17985one_byte_repeat_emit_calcBlockSize:
17986 ADDQ $0x01, AX
17987
// Both "memmove" labels only add the literal length (DI) to AX here.
17988memmove_repeat_emit_calcBlockSize:
17989 LEAQ (AX)(DI*1), AX
17990 JMP emit_literal_done_repeat_emit_calcBlockSize
17991
17992memmove_long_repeat_emit_calcBlockSize:
17993 LEAQ (AX)(DI*1), AX
17994
// Advance CX by 5 (one byte to CX+1 plus the 4 bytes already compared) and
// extend the match forward. DI = bytes left in src, R8 = pointer at CX,
// BX = pointer 16(SP) bytes before it, R10 = count of additional equal bytes.
17995emit_literal_done_repeat_emit_calcBlockSize:
17996 ADDL $0x05, CX
17997 MOVL CX, BX
17998 SUBL 16(SP), BX
17999 MOVQ src_len+8(FP), DI
18000 SUBL CX, DI
18001 LEAQ (DX)(CX*1), R8
18002 LEAQ (DX)(BX*1), BX
18003
18004 // matchLen
18005 XORL R10, R10
18006
// Compare 16 bytes per iteration while at least 16 bytes remain.
18007matchlen_loopback_16_repeat_extend_calcBlockSize:
18008 CMPL DI, $0x10
18009 JB matchlen_match8_repeat_extend_calcBlockSize
18010 MOVQ (R8)(R10*1), R9
18011 MOVQ 8(R8)(R10*1), R11
18012 XORQ (BX)(R10*1), R9
18013 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
18014 XORQ 8(BX)(R10*1), R11
18015 JNZ matchlen_bsf_16repeat_extend_calcBlockSize
18016 LEAL -16(DI), DI
18017 LEAL 16(R10), R10
18018 JMP matchlen_loopback_16_repeat_extend_calcBlockSize
18019
// Mismatch in the second 8 bytes: the index of the lowest set bit of the
// XOR, shifted right by 3, is the number of equal bytes in that word.
18020matchlen_bsf_16repeat_extend_calcBlockSize:
18021#ifdef GOAMD64_v3
18022 TZCNTQ R11, R11
18023
18024#else
18025 BSFQ R11, R11
18026
18027#endif
18028 SARQ $0x03, R11
18029 LEAL 8(R10)(R11*1), R10
18030 JMP repeat_extend_forward_end_calcBlockSize
18031
18032matchlen_match8_repeat_extend_calcBlockSize:
18033 CMPL DI, $0x08
18034 JB matchlen_match4_repeat_extend_calcBlockSize
18035 MOVQ (R8)(R10*1), R9
18036 XORQ (BX)(R10*1), R9
18037 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
18038 LEAL -8(DI), DI
18039 LEAL 8(R10), R10
18040 JMP matchlen_match4_repeat_extend_calcBlockSize
18041
// Mismatch in the first 8 bytes: same bit-index-to-byte-count conversion.
18042matchlen_bsf_8_repeat_extend_calcBlockSize:
18043#ifdef GOAMD64_v3
18044 TZCNTQ R9, R9
18045
18046#else
18047 BSFQ R9, R9
18048
18049#endif
18050 SARQ $0x03, R9
18051 LEAL (R10)(R9*1), R10
18052 JMP repeat_extend_forward_end_calcBlockSize
18053
// Tail of matchLen: 4-, 2- and 1-byte comparisons for the last few bytes.
18054matchlen_match4_repeat_extend_calcBlockSize:
18055 CMPL DI, $0x04
18056 JB matchlen_match2_repeat_extend_calcBlockSize
18057 MOVL (R8)(R10*1), R9
18058 CMPL (BX)(R10*1), R9
18059 JNE matchlen_match2_repeat_extend_calcBlockSize
18060 LEAL -4(DI), DI
18061 LEAL 4(R10), R10
18062
18063matchlen_match2_repeat_extend_calcBlockSize:
18064 CMPL DI, $0x01
18065 JE matchlen_match1_repeat_extend_calcBlockSize
18066 JB repeat_extend_forward_end_calcBlockSize
18067 MOVW (R8)(R10*1), R9
18068 CMPW (BX)(R10*1), R9
18069 JNE matchlen_match1_repeat_extend_calcBlockSize
18070 LEAL 2(R10), R10
18071 SUBL $0x02, DI
18072 JZ repeat_extend_forward_end_calcBlockSize
18073
18074matchlen_match1_repeat_extend_calcBlockSize:
18075 MOVB (R8)(R10*1), R9
18076 CMPB (BX)(R10*1), R9
18077 JNE repeat_extend_forward_end_calcBlockSize
18078 LEAL 1(R10), R10
18079
// CX moves past the match; BX = total match length (CX minus the
// back-extended start in SI); SI = offset reloaded from 16(SP).
18080repeat_extend_forward_end_calcBlockSize:
18081 ADDL R10, CX
18082 MOVL CX, BX
18083 SUBL SI, BX
18084 MOVL 16(SP), SI
18085
// Copy cost. Offsets of 65536 or more are counted at 5 bytes per chunk;
// smaller offsets at 3 bytes per chunk, with a final 2-byte form when the
// remaining length is below 12 and the offset is below 2048.
18086 // emitCopy
18087 CMPL SI, $0x00010000
18088 JB two_byte_offset_repeat_as_copy_calcBlockSize
18089
// While more than 64 bytes remain, take off 64 and add 5.
18090four_bytes_loop_back_repeat_as_copy_calcBlockSize:
18091 CMPL BX, $0x40
18092 JBE four_bytes_remain_repeat_as_copy_calcBlockSize
18093 LEAL -64(BX), BX
18094 ADDQ $0x05, AX
18095 CMPL BX, $0x04
18096 JB four_bytes_remain_repeat_as_copy_calcBlockSize
18097 JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize
18098
// Any non-zero remainder costs one more 5-byte element.
18099four_bytes_remain_repeat_as_copy_calcBlockSize:
18100 TESTL BX, BX
18101 JZ repeat_end_emit_calcBlockSize
18102 XORL BX, BX
18103 ADDQ $0x05, AX
18104 JMP repeat_end_emit_calcBlockSize
18105
// While more than 64 bytes remain, take off 60 and add 3.
18106two_byte_offset_repeat_as_copy_calcBlockSize:
18107 CMPL BX, $0x40
18108 JBE two_byte_offset_short_repeat_as_copy_calcBlockSize
18109 LEAL -60(BX), BX
18110 ADDQ $0x03, AX
18111 JMP two_byte_offset_repeat_as_copy_calcBlockSize
18112
18113two_byte_offset_short_repeat_as_copy_calcBlockSize:
18114 MOVL BX, DI
18115 SHLL $0x02, DI
18116 CMPL BX, $0x0c
18117 JAE emit_copy_three_repeat_as_copy_calcBlockSize
18118 CMPL SI, $0x00000800
18119 JAE emit_copy_three_repeat_as_copy_calcBlockSize
18120 ADDQ $0x02, AX
18121 JMP repeat_end_emit_calcBlockSize
18122
18123emit_copy_three_repeat_as_copy_calcBlockSize:
18124 ADDQ $0x03, AX
18125
// Everything up to CX is now counted; resume searching.
18126repeat_end_emit_calcBlockSize:
18127 MOVL CX, 12(SP)
18128 JMP search_loop_calcBlockSize
18129
// No repeat. Test the three table candidates in turn, each against the 4
// bytes at its own position: BX for CX, DI for CX+1, then the entry for
// CX+2. The CX+2 slot is updated to CX+2 on both the hit and miss paths.
18130no_repeat_found_calcBlockSize:
18131 CMPL (DX)(BX*1), SI
18132 JEQ candidate_match_calcBlockSize
18133 SHRQ $0x08, SI
18134 MOVL 24(SP)(R9*4), BX
18135 LEAL 2(CX), R8
18136 CMPL (DX)(DI*1), SI
18137 JEQ candidate2_match_calcBlockSize
18138 MOVL R8, 24(SP)(R9*4)
18139 SHRQ $0x08, SI
18140 CMPL (DX)(BX*1), SI
18141 JEQ candidate3_match_calcBlockSize
// Nothing matched: jump ahead to the position saved in 20(SP).
18142 MOVL 20(SP), CX
18143 JMP search_loop_calcBlockSize
18144
18145candidate3_match_calcBlockSize:
18146 ADDL $0x02, CX
18147 JMP candidate_match_calcBlockSize
18148
18149candidate2_match_calcBlockSize:
18150 MOVL R8, 24(SP)(R9*4)
18151 INCL CX
18152 MOVL DI, BX
18153
// Here CX is the match position and BX the candidate position. Extend both
// backwards while the preceding bytes agree, CX stays above 12(SP) and BX
// is non-zero.
18154candidate_match_calcBlockSize:
18155 MOVL 12(SP), SI
18156 TESTL BX, BX
18157 JZ match_extend_back_end_calcBlockSize
18158
18159match_extend_back_loop_calcBlockSize:
18160 CMPL CX, SI
18161 JBE match_extend_back_end_calcBlockSize
18162 MOVB -1(DX)(BX*1), DI
18163 MOVB -1(DX)(CX*1), R8
18164 CMPB DI, R8
18165 JNE match_extend_back_end_calcBlockSize
18166 LEAL -1(CX), CX
18167 DECL BX
18168 JZ match_extend_back_end_calcBlockSize
18169 JMP match_extend_back_loop_calcBlockSize
18170
// Size check: if AX + pending literal length + 5 reaches the limit at
// 0(SP), store 0 as the result and return.
18171match_extend_back_end_calcBlockSize:
18172 MOVL CX, SI
18173 SUBL 12(SP), SI
18174 LEAQ 5(AX)(SI*1), SI
18175 CMPQ SI, (SP)
18176 JB match_dst_size_check_calcBlockSize
18177 MOVQ $0x00000000, ret+24(FP)
18178 RET
18179
// Count the literal run [12(SP), CX) using the same 1 to 5 byte header
// tiers as the repeat path; R8 holds the run length.
18180match_dst_size_check_calcBlockSize:
18181 MOVL CX, SI
18182 MOVL 12(SP), DI
18183 CMPL DI, SI
18184 JEQ emit_literal_done_match_emit_calcBlockSize
18185 MOVL SI, R8
18186 MOVL SI, 12(SP)
18187 LEAQ (DX)(DI*1), SI
18188 SUBL DI, R8
18189 LEAL -1(R8), SI
18190 CMPL SI, $0x3c
18191 JB one_byte_match_emit_calcBlockSize
18192 CMPL SI, $0x00000100
18193 JB two_bytes_match_emit_calcBlockSize
18194 CMPL SI, $0x00010000
18195 JB three_bytes_match_emit_calcBlockSize
18196 CMPL SI, $0x01000000
18197 JB four_bytes_match_emit_calcBlockSize
18198 ADDQ $0x05, AX
18199 JMP memmove_long_match_emit_calcBlockSize
18200
18201four_bytes_match_emit_calcBlockSize:
18202 ADDQ $0x04, AX
18203 JMP memmove_long_match_emit_calcBlockSize
18204
18205three_bytes_match_emit_calcBlockSize:
18206 ADDQ $0x03, AX
18207 JMP memmove_long_match_emit_calcBlockSize
18208
18209two_bytes_match_emit_calcBlockSize:
18210 ADDQ $0x02, AX
18211 CMPL SI, $0x40
18212 JB memmove_match_emit_calcBlockSize
18213 JMP memmove_long_match_emit_calcBlockSize
18214
18215one_byte_match_emit_calcBlockSize:
18216 ADDQ $0x01, AX
18217
18218memmove_match_emit_calcBlockSize:
18219 LEAQ (AX)(R8*1), AX
18220 JMP emit_literal_done_match_emit_calcBlockSize
18221
18222memmove_long_match_emit_calcBlockSize:
18223 LEAQ (AX)(R8*1), AX
18224
// Match with no pending literal. Save the offset CX - BX in 16(SP), skip
// the 4 bytes already known to be equal, then measure the rest of the match.
18225emit_literal_done_match_emit_calcBlockSize:
18226match_nolit_loop_calcBlockSize:
18227 MOVL CX, SI
18228 SUBL BX, SI
18229 MOVL SI, 16(SP)
18230 ADDL $0x04, CX
18231 ADDL $0x04, BX
18232 MOVQ src_len+8(FP), SI
18233 SUBL CX, SI
18234 LEAQ (DX)(CX*1), DI
18235 LEAQ (DX)(BX*1), BX
18236
18237 // matchLen
18238 XORL R9, R9
18239
// Same matchLen structure as above: SI = bytes left, DI = pointer at CX,
// BX = pointer at the candidate, R9 = count of equal bytes.
18240matchlen_loopback_16_match_nolit_calcBlockSize:
18241 CMPL SI, $0x10
18242 JB matchlen_match8_match_nolit_calcBlockSize
18243 MOVQ (DI)(R9*1), R8
18244 MOVQ 8(DI)(R9*1), R10
18245 XORQ (BX)(R9*1), R8
18246 JNZ matchlen_bsf_8_match_nolit_calcBlockSize
18247 XORQ 8(BX)(R9*1), R10
18248 JNZ matchlen_bsf_16match_nolit_calcBlockSize
18249 LEAL -16(SI), SI
18250 LEAL 16(R9), R9
18251 JMP matchlen_loopback_16_match_nolit_calcBlockSize
18252
18253matchlen_bsf_16match_nolit_calcBlockSize:
18254#ifdef GOAMD64_v3
18255 TZCNTQ R10, R10
18256
18257#else
18258 BSFQ R10, R10
18259
18260#endif
18261 SARQ $0x03, R10
18262 LEAL 8(R9)(R10*1), R9
18263 JMP match_nolit_end_calcBlockSize
18264
18265matchlen_match8_match_nolit_calcBlockSize:
18266 CMPL SI, $0x08
18267 JB matchlen_match4_match_nolit_calcBlockSize
18268 MOVQ (DI)(R9*1), R8
18269 XORQ (BX)(R9*1), R8
18270 JNZ matchlen_bsf_8_match_nolit_calcBlockSize
18271 LEAL -8(SI), SI
18272 LEAL 8(R9), R9
18273 JMP matchlen_match4_match_nolit_calcBlockSize
18274
18275matchlen_bsf_8_match_nolit_calcBlockSize:
18276#ifdef GOAMD64_v3
18277 TZCNTQ R8, R8
18278
18279#else
18280 BSFQ R8, R8
18281
18282#endif
18283 SARQ $0x03, R8
18284 LEAL (R9)(R8*1), R9
18285 JMP match_nolit_end_calcBlockSize
18286
18287matchlen_match4_match_nolit_calcBlockSize:
18288 CMPL SI, $0x04
18289 JB matchlen_match2_match_nolit_calcBlockSize
18290 MOVL (DI)(R9*1), R8
18291 CMPL (BX)(R9*1), R8
18292 JNE matchlen_match2_match_nolit_calcBlockSize
18293 LEAL -4(SI), SI
18294 LEAL 4(R9), R9
18295
18296matchlen_match2_match_nolit_calcBlockSize:
18297 CMPL SI, $0x01
18298 JE matchlen_match1_match_nolit_calcBlockSize
18299 JB match_nolit_end_calcBlockSize
18300 MOVW (DI)(R9*1), R8
18301 CMPW (BX)(R9*1), R8
18302 JNE matchlen_match1_match_nolit_calcBlockSize
18303 LEAL 2(R9), R9
18304 SUBL $0x02, SI
18305 JZ match_nolit_end_calcBlockSize
18306
18307matchlen_match1_match_nolit_calcBlockSize:
18308 MOVB (DI)(R9*1), R8
18309 CMPB (BX)(R9*1), R8
18310 JNE match_nolit_end_calcBlockSize
18311 LEAL 1(R9), R9
18312
// CX moves past the match; BX = offset; R9 = full match length (the
// measured extension plus the 4 skipped bytes); 12(SP) is set to CX.
18313match_nolit_end_calcBlockSize:
18314 ADDL R9, CX
18315 MOVL 16(SP), BX
18316 ADDL $0x04, R9
18317 MOVL CX, 12(SP)
18318
// Copy cost, using the same tiers as the repeat path above.
18319 // emitCopy
18320 CMPL BX, $0x00010000
18321 JB two_byte_offset_match_nolit_calcBlockSize
18322
18323four_bytes_loop_back_match_nolit_calcBlockSize:
18324 CMPL R9, $0x40
18325 JBE four_bytes_remain_match_nolit_calcBlockSize
18326 LEAL -64(R9), R9
18327 ADDQ $0x05, AX
18328 CMPL R9, $0x04
18329 JB four_bytes_remain_match_nolit_calcBlockSize
18330 JMP four_bytes_loop_back_match_nolit_calcBlockSize
18331
// BX (the offset register in this path) is cleared below; every path from
// here overwrites BX before reading it again.
18332four_bytes_remain_match_nolit_calcBlockSize:
18333 TESTL R9, R9
18334 JZ match_nolit_emitcopy_end_calcBlockSize
18335 XORL BX, BX
18336 ADDQ $0x05, AX
18337 JMP match_nolit_emitcopy_end_calcBlockSize
18338
18339two_byte_offset_match_nolit_calcBlockSize:
18340 CMPL R9, $0x40
18341 JBE two_byte_offset_short_match_nolit_calcBlockSize
18342 LEAL -60(R9), R9
18343 ADDQ $0x03, AX
18344 JMP two_byte_offset_match_nolit_calcBlockSize
18345
18346two_byte_offset_short_match_nolit_calcBlockSize:
18347 MOVL R9, SI
18348 SHLL $0x02, SI
18349 CMPL R9, $0x0c
18350 JAE emit_copy_three_match_nolit_calcBlockSize
18351 CMPL BX, $0x00000800
18352 JAE emit_copy_three_match_nolit_calcBlockSize
18353 ADDQ $0x02, AX
18354 JMP match_nolit_emitcopy_end_calcBlockSize
18355
18356emit_copy_three_match_nolit_calcBlockSize:
18357 ADDQ $0x03, AX
18358
// Stop searching once CX reaches 8(SP). Otherwise load the 8 bytes starting
// at CX-2 and return 0 if AX has reached the limit at 0(SP).
18359match_nolit_emitcopy_end_calcBlockSize:
18360 CMPL CX, 8(SP)
18361 JAE emit_remainder_calcBlockSize
18362 MOVQ -2(DX)(CX*1), SI
18363 CMPQ AX, (SP)
18364 JB match_nolit_dst_ok_calcBlockSize
18365 MOVQ $0x00000000, ret+24(FP)
18366 RET
18367
// Record positions CX-2 and CX in the table (DI and BX are their hashes).
// If the previous entry for CX's hash holds the same 4 bytes as CX, go
// straight back to the match path; otherwise advance one byte and search.
18368match_nolit_dst_ok_calcBlockSize:
18369 MOVQ $0x0000cf1bbcdcbf9b, R8
18370 MOVQ SI, DI
18371 SHRQ $0x10, SI
18372 MOVQ SI, BX
18373 SHLQ $0x10, DI
18374 IMULQ R8, DI
18375 SHRQ $0x33, DI
18376 SHLQ $0x10, BX
18377 IMULQ R8, BX
18378 SHRQ $0x33, BX
18379 LEAL -2(CX), R8
18380 LEAQ 24(SP)(BX*4), R9
18381 MOVL (R9), BX
18382 MOVL R8, 24(SP)(DI*4)
18383 MOVL CX, (R9)
18384 CMPL (DX)(BX*1), SI
18385 JEQ match_nolit_loop_calcBlockSize
18386 INCL CX
18387 JMP search_loop_calcBlockSize
18388
// Final literal run. If AX + (len(src) - 12(SP)) + 5 reaches the limit at
// 0(SP), store 0 as the result and return.
18389emit_remainder_calcBlockSize:
18390 MOVQ src_len+8(FP), CX
18391 SUBL 12(SP), CX
18392 LEAQ 5(AX)(CX*1), CX
18393 CMPQ CX, (SP)
18394 JB emit_remainder_ok_calcBlockSize
18395 MOVQ $0x00000000, ret+24(FP)
18396 RET
18397
// Count the trailing literal [12(SP), len(src)) with the same header tiers;
// SI holds its length.
18398emit_remainder_ok_calcBlockSize:
18399 MOVQ src_len+8(FP), CX
18400 MOVL 12(SP), BX
18401 CMPL BX, CX
18402 JEQ emit_literal_done_emit_remainder_calcBlockSize
18403 MOVL CX, SI
18404 MOVL CX, 12(SP)
18405 LEAQ (DX)(BX*1), CX
18406 SUBL BX, SI
18407 LEAL -1(SI), CX
18408 CMPL CX, $0x3c
18409 JB one_byte_emit_remainder_calcBlockSize
18410 CMPL CX, $0x00000100
18411 JB two_bytes_emit_remainder_calcBlockSize
18412 CMPL CX, $0x00010000
18413 JB three_bytes_emit_remainder_calcBlockSize
18414 CMPL CX, $0x01000000
18415 JB four_bytes_emit_remainder_calcBlockSize
18416 ADDQ $0x05, AX
18417 JMP memmove_long_emit_remainder_calcBlockSize
18418
18419four_bytes_emit_remainder_calcBlockSize:
18420 ADDQ $0x04, AX
18421 JMP memmove_long_emit_remainder_calcBlockSize
18422
18423three_bytes_emit_remainder_calcBlockSize:
18424 ADDQ $0x03, AX
18425 JMP memmove_long_emit_remainder_calcBlockSize
18426
18427two_bytes_emit_remainder_calcBlockSize:
18428 ADDQ $0x02, AX
18429 CMPL CX, $0x40
18430 JB memmove_emit_remainder_calcBlockSize
18431 JMP memmove_long_emit_remainder_calcBlockSize
18432
18433one_byte_emit_remainder_calcBlockSize:
18434 ADDQ $0x01, AX
18435
18436memmove_emit_remainder_calcBlockSize:
18437 LEAQ (AX)(SI*1), AX
18438 JMP emit_literal_done_emit_remainder_calcBlockSize
18439
18440memmove_long_emit_remainder_calcBlockSize:
18441 LEAQ (AX)(SI*1), AX
18442
// Store the accumulated size as the result.
18443emit_literal_done_emit_remainder_calcBlockSize:
18444 MOVQ AX, ret+24(FP)
18445 RET
18446
18447// func calcBlockSizeSmall(src []byte) int
18448// Requires: BMI, SSE2
//
// calcBlockSizeSmall scans src with a hash-table match search and adds up,
// in AX, the number of bytes that each literal run and each copy would
// occupy. Only the counter is updated; no destination buffer is read or
// written. The total is stored to ret+24(FP). Zero is stored instead as soon
// as the projected total reaches the limit kept at 0(SP).
//
// Stack layout, as used by the code below:
//    0(SP)  8 bytes  size limit = len(src) - 9 - (len(src) >> 5)
//    8(SP)  4 bytes  len(src) - 8, upper bound for positions the search probes
//   12(SP)  4 bytes  start of the literal run not yet counted (initially 0)
//   16(SP)  4 bytes  offset of the most recent match (initially 1)
//   20(SP)  4 bytes  position to continue from when no candidate matches
//   24(SP)  2048 bytes  table of 512 32-bit source positions
//
// Register roles in the main loop: AX = size so far, CX = current position
// in src, DX = src base pointer.
//
// This routine has no branch for offsets of 65536 or more, and literal
// headers are counted as at most 3 bytes.
// NOTE(review): that presumably relies on callers passing only small
// blocks, and on a minimum input length for the limit arithmetic. Confirm
// against the Go call site.
18449TEXT ·calcBlockSizeSmall(SB), $2072-32
18450 XORQ AX, AX
18451 MOVQ $0x00000010, CX
18452 LEAQ 24(SP), DX
18453 PXOR X0, X0
18454
// Zero the table at 24(SP): 0x10 iterations, 128 bytes per iteration.
18455zero_loop_calcBlockSizeSmall:
18456 MOVOU X0, (DX)
18457 MOVOU X0, 16(DX)
18458 MOVOU X0, 32(DX)
18459 MOVOU X0, 48(DX)
18460 MOVOU X0, 64(DX)
18461 MOVOU X0, 80(DX)
18462 MOVOU X0, 96(DX)
18463 MOVOU X0, 112(DX)
18464 ADDQ $0x80, DX
18465 DECQ CX
18466 JNZ zero_loop_calcBlockSizeSmall
// Initialise the scalar locals described in the header. AX is still zero
// here, so the LEAQ (AX)(DX*1) below leaves the limit value unchanged.
18467 MOVL $0x00000000, 12(SP)
18468 MOVQ src_len+8(FP), CX
18469 LEAQ -9(CX), DX
18470 LEAQ -8(CX), BX
18471 MOVL BX, 8(SP)
18472 SHRQ $0x05, CX
18473 SUBL CX, DX
18474 LEAQ (AX)(DX*1), DX
18475 MOVQ DX, (SP)
18476 MOVL $0x00000001, CX
18477 MOVL CX, 16(SP)
18478 MOVQ src_base+0(FP), DX
18479
// Main search loop. The fallback position CX + 4 + ((CX - 12(SP)) >> 4) is
// computed first and saved in 20(SP); once it reaches 8(SP) the search ends
// and the remaining bytes are counted as one literal run.
18480search_loop_calcBlockSizeSmall:
18481 MOVL CX, BX
18482 SUBL 12(SP), BX
18483 SHRL $0x04, BX
18484 LEAL 4(CX)(BX*1), BX
18485 CMPL BX, 8(SP)
18486 JAE emit_remainder_calcBlockSizeSmall
18487 MOVQ (DX)(CX*1), SI
18488 MOVL BX, 20(SP)
// Hash of a value v is ((v << 32) * 0x9e3779b1) >> 55, a 9-bit table index.
// R9 and R10 hash the bytes at CX and the same value shifted right by one
// byte (position CX+1).
18489 MOVQ $0x9e3779b1, R8
18490 MOVQ SI, R9
18491 MOVQ SI, R10
18492 SHRQ $0x08, R10
18493 SHLQ $0x20, R9
18494 IMULQ R8, R9
18495 SHRQ $0x37, R9
18496 SHLQ $0x20, R10
18497 IMULQ R8, R10
18498 SHRQ $0x37, R10
// Fetch the previous table entries (BX for CX, DI for CX+1), then record
// CX and CX+1 in their slots.
18499 MOVL 24(SP)(R9*4), BX
18500 MOVL 24(SP)(R10*4), DI
18501 MOVL CX, 24(SP)(R9*4)
18502 LEAL 1(CX), R9
18503 MOVL R9, 24(SP)(R10*4)
// R9 becomes the hash for position CX+2 (value shifted right by two bytes).
18504 MOVQ SI, R9
18505 SHRQ $0x10, R9
18506 SHLQ $0x20, R9
18507 IMULQ R8, R9
18508 SHRQ $0x37, R9
// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes located
// 16(SP) bytes earlier.
18509 MOVL CX, R8
18510 SUBL 16(SP), R8
18511 MOVL 1(DX)(R8*1), R10
18512 MOVQ SI, R8
18513 SHRQ $0x08, R8
18514 CMPL R8, R10
18515 JNE no_repeat_found_calcBlockSizeSmall
18516 LEAL 1(CX), SI
18517 MOVL 12(SP), BX
18518 MOVL SI, DI
18519 SUBL 16(SP), DI
18520 JZ repeat_extend_back_end_calcBlockSizeSmall
18521
// Move the match start (SI) and its source (DI) backwards while the
// preceding bytes are equal, SI is still above 12(SP) and DI is non-zero.
18522repeat_extend_back_loop_calcBlockSizeSmall:
18523 CMPL SI, BX
18524 JBE repeat_extend_back_end_calcBlockSizeSmall
18525 MOVB -1(DX)(DI*1), R8
18526 MOVB -1(DX)(SI*1), R9
18527 CMPB R8, R9
18528 JNE repeat_extend_back_end_calcBlockSizeSmall
18529 LEAL -1(SI), SI
18530 DECL DI
18531 JNZ repeat_extend_back_loop_calcBlockSizeSmall
18532
// Count the literal run [12(SP), SI) unless it is empty. With n the run
// length, the header costs 1 byte when n-1 < 60, 2 when n-1 < 256 and 3
// otherwise; n itself is then added. The second JB below tests the same
// flags as the first and is never taken; execution falls through to the
// 3-byte label.
18533repeat_extend_back_end_calcBlockSizeSmall:
18534 MOVL 12(SP), BX
18535 CMPL BX, SI
18536 JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall
18537 MOVL SI, DI
18538 MOVL SI, 12(SP)
18539 LEAQ (DX)(BX*1), R8
18540 SUBL BX, DI
18541 LEAL -1(DI), BX
18542 CMPL BX, $0x3c
18543 JB one_byte_repeat_emit_calcBlockSizeSmall
18544 CMPL BX, $0x00000100
18545 JB two_bytes_repeat_emit_calcBlockSizeSmall
18546 JB three_bytes_repeat_emit_calcBlockSizeSmall
18547
18548three_bytes_repeat_emit_calcBlockSizeSmall:
18549 ADDQ $0x03, AX
18550 JMP memmove_long_repeat_emit_calcBlockSizeSmall
18551
18552two_bytes_repeat_emit_calcBlockSizeSmall:
18553 ADDQ $0x02, AX
18554 CMPL BX, $0x40
18555 JB memmove_repeat_emit_calcBlockSizeSmall
18556 JMP memmove_long_repeat_emit_calcBlockSizeSmall
18557
18558one_byte_repeat_emit_calcBlockSizeSmall:
18559 ADDQ $0x01, AX
18560
// Both "memmove" labels only add the literal length (DI) to AX here.
18561memmove_repeat_emit_calcBlockSizeSmall:
18562 LEAQ (AX)(DI*1), AX
18563 JMP emit_literal_done_repeat_emit_calcBlockSizeSmall
18564
18565memmove_long_repeat_emit_calcBlockSizeSmall:
18566 LEAQ (AX)(DI*1), AX
18567
// Advance CX by 5 (one byte to CX+1 plus the 4 bytes already compared) and
// extend the match forward. DI = bytes left in src, R8 = pointer at CX,
// BX = pointer 16(SP) bytes before it, R10 = count of additional equal bytes.
18568emit_literal_done_repeat_emit_calcBlockSizeSmall:
18569 ADDL $0x05, CX
18570 MOVL CX, BX
18571 SUBL 16(SP), BX
18572 MOVQ src_len+8(FP), DI
18573 SUBL CX, DI
18574 LEAQ (DX)(CX*1), R8
18575 LEAQ (DX)(BX*1), BX
18576
18577 // matchLen
18578 XORL R10, R10
18579
// Compare 16 bytes per iteration while at least 16 bytes remain.
18580matchlen_loopback_16_repeat_extend_calcBlockSizeSmall:
18581 CMPL DI, $0x10
18582 JB matchlen_match8_repeat_extend_calcBlockSizeSmall
18583 MOVQ (R8)(R10*1), R9
18584 MOVQ 8(R8)(R10*1), R11
18585 XORQ (BX)(R10*1), R9
18586 JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
18587 XORQ 8(BX)(R10*1), R11
18588 JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall
18589 LEAL -16(DI), DI
18590 LEAL 16(R10), R10
18591 JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall
18592
// Mismatch in the second 8 bytes: the index of the lowest set bit of the
// XOR, shifted right by 3, is the number of equal bytes in that word.
18593matchlen_bsf_16repeat_extend_calcBlockSizeSmall:
18594#ifdef GOAMD64_v3
18595 TZCNTQ R11, R11
18596
18597#else
18598 BSFQ R11, R11
18599
18600#endif
18601 SARQ $0x03, R11
18602 LEAL 8(R10)(R11*1), R10
18603 JMP repeat_extend_forward_end_calcBlockSizeSmall
18604
18605matchlen_match8_repeat_extend_calcBlockSizeSmall:
18606 CMPL DI, $0x08
18607 JB matchlen_match4_repeat_extend_calcBlockSizeSmall
18608 MOVQ (R8)(R10*1), R9
18609 XORQ (BX)(R10*1), R9
18610 JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
18611 LEAL -8(DI), DI
18612 LEAL 8(R10), R10
18613 JMP matchlen_match4_repeat_extend_calcBlockSizeSmall
18614
// Mismatch in the first 8 bytes: same bit-index-to-byte-count conversion.
18615matchlen_bsf_8_repeat_extend_calcBlockSizeSmall:
18616#ifdef GOAMD64_v3
18617 TZCNTQ R9, R9
18618
18619#else
18620 BSFQ R9, R9
18621
18622#endif
18623 SARQ $0x03, R9
18624 LEAL (R10)(R9*1), R10
18625 JMP repeat_extend_forward_end_calcBlockSizeSmall
18626
// Tail of matchLen: 4-, 2- and 1-byte comparisons for the last few bytes.
18627matchlen_match4_repeat_extend_calcBlockSizeSmall:
18628 CMPL DI, $0x04
18629 JB matchlen_match2_repeat_extend_calcBlockSizeSmall
18630 MOVL (R8)(R10*1), R9
18631 CMPL (BX)(R10*1), R9
18632 JNE matchlen_match2_repeat_extend_calcBlockSizeSmall
18633 LEAL -4(DI), DI
18634 LEAL 4(R10), R10
18635
18636matchlen_match2_repeat_extend_calcBlockSizeSmall:
18637 CMPL DI, $0x01
18638 JE matchlen_match1_repeat_extend_calcBlockSizeSmall
18639 JB repeat_extend_forward_end_calcBlockSizeSmall
18640 MOVW (R8)(R10*1), R9
18641 CMPW (BX)(R10*1), R9
18642 JNE matchlen_match1_repeat_extend_calcBlockSizeSmall
18643 LEAL 2(R10), R10
18644 SUBL $0x02, DI
18645 JZ repeat_extend_forward_end_calcBlockSizeSmall
18646
18647matchlen_match1_repeat_extend_calcBlockSizeSmall:
18648 MOVB (R8)(R10*1), R9
18649 CMPB (BX)(R10*1), R9
18650 JNE repeat_extend_forward_end_calcBlockSizeSmall
18651 LEAL 1(R10), R10
18652
// CX moves past the match; BX = total match length (CX minus the
// back-extended start in SI); SI = offset reloaded from 16(SP).
18653repeat_extend_forward_end_calcBlockSizeSmall:
18654 ADDL R10, CX
18655 MOVL CX, BX
18656 SUBL SI, BX
18657 MOVL 16(SP), SI
18658
// Copy cost: while more than 64 bytes remain, take off 60 and add 3. The
// final piece costs 3 bytes when 12 or more bytes remain, otherwise 2. The
// offset is not consulted in this variant.
18659 // emitCopy
18660two_byte_offset_repeat_as_copy_calcBlockSizeSmall:
18661 CMPL BX, $0x40
18662 JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
18663 LEAL -60(BX), BX
18664 ADDQ $0x03, AX
18665 JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall
18666
18667two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
18668 MOVL BX, SI
18669 SHLL $0x02, SI
18670 CMPL BX, $0x0c
18671 JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall
18672 ADDQ $0x02, AX
18673 JMP repeat_end_emit_calcBlockSizeSmall
18674
18675emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
18676 ADDQ $0x03, AX
18677
// Everything up to CX is now counted; resume searching.
18678repeat_end_emit_calcBlockSizeSmall:
18679 MOVL CX, 12(SP)
18680 JMP search_loop_calcBlockSizeSmall
18681
// No repeat. Test the three table candidates in turn, each against the 4
// bytes at its own position: BX for CX, DI for CX+1, then the entry for
// CX+2. The CX+2 slot is updated to CX+2 on both the hit and miss paths.
18682no_repeat_found_calcBlockSizeSmall:
18683 CMPL (DX)(BX*1), SI
18684 JEQ candidate_match_calcBlockSizeSmall
18685 SHRQ $0x08, SI
18686 MOVL 24(SP)(R9*4), BX
18687 LEAL 2(CX), R8
18688 CMPL (DX)(DI*1), SI
18689 JEQ candidate2_match_calcBlockSizeSmall
18690 MOVL R8, 24(SP)(R9*4)
18691 SHRQ $0x08, SI
18692 CMPL (DX)(BX*1), SI
18693 JEQ candidate3_match_calcBlockSizeSmall
// Nothing matched: jump ahead to the position saved in 20(SP).
18694 MOVL 20(SP), CX
18695 JMP search_loop_calcBlockSizeSmall
18696
18697candidate3_match_calcBlockSizeSmall:
18698 ADDL $0x02, CX
18699 JMP candidate_match_calcBlockSizeSmall
18700
18701candidate2_match_calcBlockSizeSmall:
18702 MOVL R8, 24(SP)(R9*4)
18703 INCL CX
18704 MOVL DI, BX
18705
// Here CX is the match position and BX the candidate position. Extend both
// backwards while the preceding bytes agree, CX stays above 12(SP) and BX
// is non-zero.
18706candidate_match_calcBlockSizeSmall:
18707 MOVL 12(SP), SI
18708 TESTL BX, BX
18709 JZ match_extend_back_end_calcBlockSizeSmall
18710
18711match_extend_back_loop_calcBlockSizeSmall:
18712 CMPL CX, SI
18713 JBE match_extend_back_end_calcBlockSizeSmall
18714 MOVB -1(DX)(BX*1), DI
18715 MOVB -1(DX)(CX*1), R8
18716 CMPB DI, R8
18717 JNE match_extend_back_end_calcBlockSizeSmall
18718 LEAL -1(CX), CX
18719 DECL BX
18720 JZ match_extend_back_end_calcBlockSizeSmall
18721 JMP match_extend_back_loop_calcBlockSizeSmall
18722
// Size check: if AX + pending literal length + 3 reaches the limit at
// 0(SP), store 0 as the result and return.
18723match_extend_back_end_calcBlockSizeSmall:
18724 MOVL CX, SI
18725 SUBL 12(SP), SI
18726 LEAQ 3(AX)(SI*1), SI
18727 CMPQ SI, (SP)
18728 JB match_dst_size_check_calcBlockSizeSmall
18729 MOVQ $0x00000000, ret+24(FP)
18730 RET
18731
// Count the literal run [12(SP), CX) using the same 1/2/3 byte header tiers
// as the repeat path; R8 holds the run length.
18732match_dst_size_check_calcBlockSizeSmall:
18733 MOVL CX, SI
18734 MOVL 12(SP), DI
18735 CMPL DI, SI
18736 JEQ emit_literal_done_match_emit_calcBlockSizeSmall
18737 MOVL SI, R8
18738 MOVL SI, 12(SP)
18739 LEAQ (DX)(DI*1), SI
18740 SUBL DI, R8
18741 LEAL -1(R8), SI
18742 CMPL SI, $0x3c
18743 JB one_byte_match_emit_calcBlockSizeSmall
18744 CMPL SI, $0x00000100
18745 JB two_bytes_match_emit_calcBlockSizeSmall
18746 JB three_bytes_match_emit_calcBlockSizeSmall
18747
18748three_bytes_match_emit_calcBlockSizeSmall:
18749 ADDQ $0x03, AX
18750 JMP memmove_long_match_emit_calcBlockSizeSmall
18751
18752two_bytes_match_emit_calcBlockSizeSmall:
18753 ADDQ $0x02, AX
18754 CMPL SI, $0x40
18755 JB memmove_match_emit_calcBlockSizeSmall
18756 JMP memmove_long_match_emit_calcBlockSizeSmall
18757
18758one_byte_match_emit_calcBlockSizeSmall:
18759 ADDQ $0x01, AX
18760
18761memmove_match_emit_calcBlockSizeSmall:
18762 LEAQ (AX)(R8*1), AX
18763 JMP emit_literal_done_match_emit_calcBlockSizeSmall
18764
18765memmove_long_match_emit_calcBlockSizeSmall:
18766 LEAQ (AX)(R8*1), AX
18767
// Match with no pending literal. Save the offset CX - BX in 16(SP), skip
// the 4 bytes already known to be equal, then measure the rest of the match.
18768emit_literal_done_match_emit_calcBlockSizeSmall:
18769match_nolit_loop_calcBlockSizeSmall:
18770 MOVL CX, SI
18771 SUBL BX, SI
18772 MOVL SI, 16(SP)
18773 ADDL $0x04, CX
18774 ADDL $0x04, BX
18775 MOVQ src_len+8(FP), SI
18776 SUBL CX, SI
18777 LEAQ (DX)(CX*1), DI
18778 LEAQ (DX)(BX*1), BX
18779
18780 // matchLen
18781 XORL R9, R9
18782
// Same matchLen structure as above: SI = bytes left, DI = pointer at CX,
// BX = pointer at the candidate, R9 = count of equal bytes.
18783matchlen_loopback_16_match_nolit_calcBlockSizeSmall:
18784 CMPL SI, $0x10
18785 JB matchlen_match8_match_nolit_calcBlockSizeSmall
18786 MOVQ (DI)(R9*1), R8
18787 MOVQ 8(DI)(R9*1), R10
18788 XORQ (BX)(R9*1), R8
18789 JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
18790 XORQ 8(BX)(R9*1), R10
18791 JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall
18792 LEAL -16(SI), SI
18793 LEAL 16(R9), R9
18794 JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall
18795
18796matchlen_bsf_16match_nolit_calcBlockSizeSmall:
18797#ifdef GOAMD64_v3
18798 TZCNTQ R10, R10
18799
18800#else
18801 BSFQ R10, R10
18802
18803#endif
18804 SARQ $0x03, R10
18805 LEAL 8(R9)(R10*1), R9
18806 JMP match_nolit_end_calcBlockSizeSmall
18807
18808matchlen_match8_match_nolit_calcBlockSizeSmall:
18809 CMPL SI, $0x08
18810 JB matchlen_match4_match_nolit_calcBlockSizeSmall
18811 MOVQ (DI)(R9*1), R8
18812 XORQ (BX)(R9*1), R8
18813 JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
18814 LEAL -8(SI), SI
18815 LEAL 8(R9), R9
18816 JMP matchlen_match4_match_nolit_calcBlockSizeSmall
18817
18818matchlen_bsf_8_match_nolit_calcBlockSizeSmall:
18819#ifdef GOAMD64_v3
18820 TZCNTQ R8, R8
18821
18822#else
18823 BSFQ R8, R8
18824
18825#endif
18826 SARQ $0x03, R8
18827 LEAL (R9)(R8*1), R9
18828 JMP match_nolit_end_calcBlockSizeSmall
18829
18830matchlen_match4_match_nolit_calcBlockSizeSmall:
18831 CMPL SI, $0x04
18832 JB matchlen_match2_match_nolit_calcBlockSizeSmall
18833 MOVL (DI)(R9*1), R8
18834 CMPL (BX)(R9*1), R8
18835 JNE matchlen_match2_match_nolit_calcBlockSizeSmall
18836 LEAL -4(SI), SI
18837 LEAL 4(R9), R9
18838
18839matchlen_match2_match_nolit_calcBlockSizeSmall:
18840 CMPL SI, $0x01
18841 JE matchlen_match1_match_nolit_calcBlockSizeSmall
18842 JB match_nolit_end_calcBlockSizeSmall
18843 MOVW (DI)(R9*1), R8
18844 CMPW (BX)(R9*1), R8
18845 JNE matchlen_match1_match_nolit_calcBlockSizeSmall
18846 LEAL 2(R9), R9
18847 SUBL $0x02, SI
18848 JZ match_nolit_end_calcBlockSizeSmall
18849
18850matchlen_match1_match_nolit_calcBlockSizeSmall:
18851 MOVB (DI)(R9*1), R8
18852 CMPB (BX)(R9*1), R8
18853 JNE match_nolit_end_calcBlockSizeSmall
18854 LEAL 1(R9), R9
18855
// CX moves past the match; BX = offset; R9 = full match length (the
// measured extension plus the 4 skipped bytes); 12(SP) is set to CX.
18856match_nolit_end_calcBlockSizeSmall:
18857 ADDL R9, CX
18858 MOVL 16(SP), BX
18859 ADDL $0x04, R9
18860 MOVL CX, 12(SP)
18861
// Copy cost, using the same 3-byte and 2-byte tiers as the repeat path.
18862 // emitCopy
18863two_byte_offset_match_nolit_calcBlockSizeSmall:
18864 CMPL R9, $0x40
18865 JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall
18866 LEAL -60(R9), R9
18867 ADDQ $0x03, AX
18868 JMP two_byte_offset_match_nolit_calcBlockSizeSmall
18869
18870two_byte_offset_short_match_nolit_calcBlockSizeSmall:
18871 MOVL R9, BX
18872 SHLL $0x02, BX
18873 CMPL R9, $0x0c
18874 JAE emit_copy_three_match_nolit_calcBlockSizeSmall
18875 ADDQ $0x02, AX
18876 JMP match_nolit_emitcopy_end_calcBlockSizeSmall
18877
18878emit_copy_three_match_nolit_calcBlockSizeSmall:
18879 ADDQ $0x03, AX
18880
// Stop searching once CX reaches 8(SP). Otherwise load the 8 bytes starting
// at CX-2 and return 0 if AX has reached the limit at 0(SP).
18881match_nolit_emitcopy_end_calcBlockSizeSmall:
18882 CMPL CX, 8(SP)
18883 JAE emit_remainder_calcBlockSizeSmall
18884 MOVQ -2(DX)(CX*1), SI
18885 CMPQ AX, (SP)
18886 JB match_nolit_dst_ok_calcBlockSizeSmall
18887 MOVQ $0x00000000, ret+24(FP)
18888 RET
18889
// Record positions CX-2 and CX in the table (DI and BX are their hashes).
// If the previous entry for CX's hash holds the same 4 bytes as CX, go
// straight back to the match path; otherwise advance one byte and search.
18890match_nolit_dst_ok_calcBlockSizeSmall:
18891 MOVQ $0x9e3779b1, R8
18892 MOVQ SI, DI
18893 SHRQ $0x10, SI
18894 MOVQ SI, BX
18895 SHLQ $0x20, DI
18896 IMULQ R8, DI
18897 SHRQ $0x37, DI
18898 SHLQ $0x20, BX
18899 IMULQ R8, BX
18900 SHRQ $0x37, BX
18901 LEAL -2(CX), R8
18902 LEAQ 24(SP)(BX*4), R9
18903 MOVL (R9), BX
18904 MOVL R8, 24(SP)(DI*4)
18905 MOVL CX, (R9)
18906 CMPL (DX)(BX*1), SI
18907 JEQ match_nolit_loop_calcBlockSizeSmall
18908 INCL CX
18909 JMP search_loop_calcBlockSizeSmall
18910
// Final literal run. If AX + (len(src) - 12(SP)) + 3 reaches the limit at
// 0(SP), store 0 as the result and return.
18911emit_remainder_calcBlockSizeSmall:
18912 MOVQ src_len+8(FP), CX
18913 SUBL 12(SP), CX
18914 LEAQ 3(AX)(CX*1), CX
18915 CMPQ CX, (SP)
18916 JB emit_remainder_ok_calcBlockSizeSmall
18917 MOVQ $0x00000000, ret+24(FP)
18918 RET
18919
// Count the trailing literal [12(SP), len(src)) with the same header tiers;
// SI holds its length.
18920emit_remainder_ok_calcBlockSizeSmall:
18921 MOVQ src_len+8(FP), CX
18922 MOVL 12(SP), BX
18923 CMPL BX, CX
18924 JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall
18925 MOVL CX, SI
18926 MOVL CX, 12(SP)
18927 LEAQ (DX)(BX*1), CX
18928 SUBL BX, SI
18929 LEAL -1(SI), CX
18930 CMPL CX, $0x3c
18931 JB one_byte_emit_remainder_calcBlockSizeSmall
18932 CMPL CX, $0x00000100
18933 JB two_bytes_emit_remainder_calcBlockSizeSmall
18934 JB three_bytes_emit_remainder_calcBlockSizeSmall
18935
18936three_bytes_emit_remainder_calcBlockSizeSmall:
18937 ADDQ $0x03, AX
18938 JMP memmove_long_emit_remainder_calcBlockSizeSmall
18939
18940two_bytes_emit_remainder_calcBlockSizeSmall:
18941 ADDQ $0x02, AX
18942 CMPL CX, $0x40
18943 JB memmove_emit_remainder_calcBlockSizeSmall
18944 JMP memmove_long_emit_remainder_calcBlockSizeSmall
18945
18946one_byte_emit_remainder_calcBlockSizeSmall:
18947 ADDQ $0x01, AX
18948
18949memmove_emit_remainder_calcBlockSizeSmall:
18950 LEAQ (AX)(SI*1), AX
18951 JMP emit_literal_done_emit_remainder_calcBlockSizeSmall
18952
18953memmove_long_emit_remainder_calcBlockSizeSmall:
18954 LEAQ (AX)(SI*1), AX
18955
// Store the accumulated size as the result.
18956emit_literal_done_emit_remainder_calcBlockSizeSmall:
18957 MOVQ AX, ret+24(FP)
18958 RET
18959
18960// func emitLiteral(dst []byte, lit []byte) int
18961// Requires: SSE2
//
// emitLiteral writes a literal header for len(lit) followed by the bytes of
// lit to dst. It returns the total number of bytes written (header plus
// literal bytes), or 0 when lit is empty.
//
// Register use: AX = dst write pointer, CX = lit read pointer, DX = len(lit),
// BX = return value (starts at len(lit); the header size is added to it),
// SI = len(lit)-1 while the header is built.
// dst_len/dst_cap are never read, so nothing here checks that dst is large
// enough.
// NOTE(review): the length arithmetic uses 32-bit operations (MOVL/LEAL/CMPL),
// so this presumably expects len(lit) to fit in 32 bits - confirm with callers.
18962TEXT ·emitLiteral(SB), NOSPLIT, $0-56
18963 MOVQ lit_len+32(FP), DX
18964 MOVQ dst_base+0(FP), AX
18965 MOVQ lit_base+24(FP), CX
// Empty literal: skip everything and return 0.
18966 TESTQ DX, DX
18967 JZ emit_literal_end_standalone_skip
18968 MOVL DX, BX
18969 LEAL -1(DX), SI
// Select the header size from n = len(lit)-1:
//   n < 60    -> 1 byte  (n<<2)
//   n < 1<<8  -> 2 bytes (0xf0, n)
//   n < 1<<16 -> 3 bytes (0xf4, n as 16 bits)
//   n < 1<<24 -> 4 bytes (0xf8, n as 24 bits)
//   otherwise -> 5 bytes (0xfc, n as 32 bits)
18970 CMPL SI, $0x3c
18971 JB one_byte_standalone
18972 CMPL SI, $0x00000100
18973 JB two_bytes_standalone
18974 CMPL SI, $0x00010000
18975 JB three_bytes_standalone
18976 CMPL SI, $0x01000000
18977 JB four_bytes_standalone
// 5-byte header.
18978 MOVB $0xfc, (AX)
18979 MOVL SI, 1(AX)
18980 ADDQ $0x05, BX
18981 ADDQ $0x05, AX
18982 JMP memmove_long_standalone
18983
18984four_bytes_standalone:
// 4-byte header: low 16 bits of n via MOVW, bits 16-23 via DI.
18985 MOVL SI, DI
18986 SHRL $0x10, DI
18987 MOVB $0xf8, (AX)
18988 MOVW SI, 1(AX)
18989 MOVB DI, 3(AX)
18990 ADDQ $0x04, BX
18991 ADDQ $0x04, AX
18992 JMP memmove_long_standalone
18993
18994three_bytes_standalone:
18995 MOVB $0xf4, (AX)
18996 MOVW SI, 1(AX)
18997 ADDQ $0x03, BX
18998 ADDQ $0x03, AX
18999 JMP memmove_long_standalone
19000
19001two_bytes_standalone:
19002 MOVB $0xf0, (AX)
19003 MOVB SI, 1(AX)
19004 ADDQ $0x02, BX
19005 ADDQ $0x02, AX
// With a 2-byte header the literal is 61..256 bytes; only n < 64
// (at most 64 bytes) may use the short copy below.
19006 CMPL SI, $0x40
19007 JB memmove_standalone
19008 JMP memmove_long_standalone
19009
19010one_byte_standalone:
19011 SHLB $0x02, SI
19012 MOVB SI, (AX)
19013 ADDQ $0x01, BX
19014 ADDQ $0x01, AX
19015
19016memmove_standalone:
19017 // genMemMoveShort
// Copy DX (1..64) bytes from (CX) to (AX). Each size class loads everything
// into registers first and then stores it, using a head and a tail access
// that may overlap each other.
19018 CMPQ DX, $0x03
19019 JB emit_lit_memmove_standalone_memmove_move_1or2
19020 JE emit_lit_memmove_standalone_memmove_move_3
19021 CMPQ DX, $0x08
19022 JB emit_lit_memmove_standalone_memmove_move_4through7
19023 CMPQ DX, $0x10
19024 JBE emit_lit_memmove_standalone_memmove_move_8through16
19025 CMPQ DX, $0x20
19026 JBE emit_lit_memmove_standalone_memmove_move_17through32
19027 JMP emit_lit_memmove_standalone_memmove_move_33through64
19028
19029emit_lit_memmove_standalone_memmove_move_1or2:
// First byte and last byte (the same byte when DX == 1). CX is clobbered.
19030 MOVB (CX), SI
19031 MOVB -1(CX)(DX*1), CL
19032 MOVB SI, (AX)
19033 MOVB CL, -1(AX)(DX*1)
19034 JMP emit_literal_end_standalone
19035
19036emit_lit_memmove_standalone_memmove_move_3:
19037 MOVW (CX), SI
19038 MOVB 2(CX), CL
19039 MOVW SI, (AX)
19040 MOVB CL, 2(AX)
19041 JMP emit_literal_end_standalone
19042
19043emit_lit_memmove_standalone_memmove_move_4through7:
19044 MOVL (CX), SI
19045 MOVL -4(CX)(DX*1), CX
19046 MOVL SI, (AX)
19047 MOVL CX, -4(AX)(DX*1)
19048 JMP emit_literal_end_standalone
19049
19050emit_lit_memmove_standalone_memmove_move_8through16:
19051 MOVQ (CX), SI
19052 MOVQ -8(CX)(DX*1), CX
19053 MOVQ SI, (AX)
19054 MOVQ CX, -8(AX)(DX*1)
19055 JMP emit_literal_end_standalone
19056
19057emit_lit_memmove_standalone_memmove_move_17through32:
19058 MOVOU (CX), X0
19059 MOVOU -16(CX)(DX*1), X1
19060 MOVOU X0, (AX)
19061 MOVOU X1, -16(AX)(DX*1)
19062 JMP emit_literal_end_standalone
19063
19064emit_lit_memmove_standalone_memmove_move_33through64:
19065 MOVOU (CX), X0
19066 MOVOU 16(CX), X1
19067 MOVOU -32(CX)(DX*1), X2
19068 MOVOU -16(CX)(DX*1), X3
19069 MOVOU X0, (AX)
19070 MOVOU X1, 16(AX)
19071 MOVOU X2, -32(AX)(DX*1)
19072 MOVOU X3, -16(AX)(DX*1)
19073 JMP emit_literal_end_standalone
// Unreachable: duplicate of the unconditional JMP directly above, with no
// label in between.
19074 JMP emit_literal_end_standalone
19075
19076memmove_long_standalone:
19077 // genMemMoveLong
// Bulk copy of DX bytes from (CX) to (AX). The first and the last 32 bytes
// are loaded into X0-X3 up front and stored with unaligned moves at the very
// end. The loops in between move 32 bytes per iteration using unaligned loads
// and aligned stores (MOVOA).
// DI = DX/32, SI = AX&31 (misalignment of dst), R8 = 64-SI, so the store
// address AX+R8-32 is 32-byte aligned.
19078 MOVOU (CX), X0
19079 MOVOU 16(CX), X1
19080 MOVOU -32(CX)(DX*1), X2
19081 MOVOU -16(CX)(DX*1), X3
19082 MOVQ DX, DI
19083 SHRQ $0x05, DI
19084 MOVQ AX, SI
19085 ANDL $0x0000001f, SI
19086 MOVQ $0x00000040, R8
19087 SUBQ SI, R8
// DECQ does not modify CF, so JA sees the CF left by SUBQ above together
// with the ZF produced by the decrement.
19088 DECQ DI
19089 JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
19090 LEAQ -32(CX)(R8*1), SI
19091 LEAQ -32(AX)(R8*1), R9
19092
19093emit_lit_memmove_long_standalonelarge_big_loop_back:
// Pointer-based variant: SI = source, R9 = aligned destination; R8 is
// advanced in step so the indexed loop below continues where this one stops.
19094 MOVOU (SI), X4
19095 MOVOU 16(SI), X5
19096 MOVOA X4, (R9)
19097 MOVOA X5, 16(R9)
19098 ADDQ $0x20, R9
19099 ADDQ $0x20, SI
19100 ADDQ $0x20, R8
19101 DECQ DI
19102 JNA emit_lit_memmove_long_standalonelarge_big_loop_back
19103
19104emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
// Index-based variant: copy the 32 bytes ending at offset R8, then advance
// R8 by 32; repeat while DX >= R8.
19105 MOVOU -32(CX)(R8*1), X4
19106 MOVOU -16(CX)(R8*1), X5
19107 MOVOA X4, -32(AX)(R8*1)
19108 MOVOA X5, -16(AX)(R8*1)
19109 ADDQ $0x20, R8
19110 CMPQ DX, R8
19111 JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
// Store the head and tail saved at the start; they cover the unaligned
// prefix and whatever the 32-byte loop did not reach.
19112 MOVOU X0, (AX)
19113 MOVOU X1, 16(AX)
19114 MOVOU X2, -32(AX)(DX*1)
19115 MOVOU X3, -16(AX)(DX*1)
19116 JMP emit_literal_end_standalone
// Unreachable: duplicate of the unconditional JMP directly above, with no
// label in between.
19117 JMP emit_literal_end_standalone
19118
19119emit_literal_end_standalone_skip:
19120 XORQ BX, BX
19121
19122emit_literal_end_standalone:
// Return BX: header size + len(lit), or 0 on the skip path.
19123 MOVQ BX, ret+48(FP)
19124 RET
19125
19126// func emitRepeat(dst []byte, offset int, length int) int
//
// emitRepeat writes one or more repeat operations for length to dst and
// returns the number of bytes written. offset is only used to pick the
// 2-byte form that carries offset bits (offset < 2048).
//
// Register use: AX = dst write pointer, BX = bytes written, CX = offset,
// DX = length (turned into n = length-4 at the top of every iteration).
// dst_len/dst_cap are never read. All comparisons are 32-bit.
19127TEXT ·emitRepeat(SB), NOSPLIT, $0-48
19128 XORQ BX, BX
19129 MOVQ dst_base+0(FP), AX
19130 MOVQ offset+24(FP), CX
19131 MOVQ length+32(FP), DX
19132
19133 // emitRepeat
19134emit_repeat_again_standalone:
// SI keeps the unmodified length for the two short-form checks; DX = n.
//   length <= 8                    -> 2 bytes, no offset bits
//   length < 12 and offset < 2048  -> 2 bytes, offset folded in
19135 MOVL DX, SI
19136 LEAL -4(DX), DX
19137 CMPL SI, $0x08
19138 JBE repeat_two_standalone
19139 CMPL SI, $0x0c
19140 JAE cant_repeat_two_offset_standalone
19141 CMPL CX, $0x00000800
19142 JB repeat_two_offset_standalone
19143
19144cant_repeat_two_offset_standalone:
// Longer forms, selected on n:
//   n < 260      -> 3 bytes
//   n < 65792    -> 4 bytes
//   n < 16842751 -> 5 bytes
// Otherwise emit the fixed 5 bytes 1d 00 fb ff ff, subtract 16842747 from DX
// and start over with what is left.
19145 CMPL DX, $0x00000104
19146 JB repeat_three_standalone
19147 CMPL DX, $0x00010100
19148 JB repeat_four_standalone
19149 CMPL DX, $0x0100ffff
19150 JB repeat_five_standalone
19151 LEAL -16842747(DX), DX
19152 MOVL $0xfffb001d, (AX)
19153 MOVB $0xff, 4(AX)
19154 ADDQ $0x05, AX
19155 ADDQ $0x05, BX
19156 JMP emit_repeat_again_standalone
19157
19158repeat_five_standalone:
// Bytes: 1d 00, then n-65536 as 24 bits little-endian. CX (offset) is reused
// as scratch for the top byte.
19159 LEAL -65536(DX), DX
19160 MOVL DX, CX
19161 MOVW $0x001d, (AX)
19162 MOVW DX, 2(AX)
19163 SARL $0x10, CX
19164 MOVB CL, 4(AX)
19165 ADDQ $0x05, BX
19166 ADDQ $0x05, AX
19167 JMP gen_emit_repeat_end
19168
19169repeat_four_standalone:
// Bytes: 19 00, then n-256 as 16 bits little-endian.
19170 LEAL -256(DX), DX
19171 MOVW $0x0019, (AX)
19172 MOVW DX, 2(AX)
19173 ADDQ $0x04, BX
19174 ADDQ $0x04, AX
19175 JMP gen_emit_repeat_end
19176
19177repeat_three_standalone:
// Bytes: 15 00, then n-4 as one byte.
19178 LEAL -4(DX), DX
19179 MOVW $0x0015, (AX)
19180 MOVB DL, 2(AX)
19181 ADDQ $0x03, BX
19182 ADDQ $0x03, AX
19183 JMP gen_emit_repeat_end
19184
19185repeat_two_standalone:
// 16-bit little-endian store of (n<<2)|1.
19186 SHLL $0x02, DX
19187 ORL $0x01, DX
19188 MOVW DX, (AX)
19189 ADDQ $0x02, BX
19190 ADDQ $0x02, AX
19191 JMP gen_emit_repeat_end
19192
19193repeat_two_offset_standalone:
// byte 0 = ((offset>>8)<<5) | (n<<2) | 1, byte 1 = low 8 bits of offset.
// SI is zeroed only to serve as the base of the LEAL.
19194 XORQ SI, SI
19195 LEAL 1(SI)(DX*4), DX
19196 MOVB CL, 1(AX)
19197 SARL $0x08, CX
19198 SHLL $0x05, CX
19199 ORL CX, DX
19200 MOVB DL, (AX)
19201 ADDQ $0x02, BX
19202 ADDQ $0x02, AX
19203
19204gen_emit_repeat_end:
// Return the number of bytes written.
19205 MOVQ BX, ret+40(FP)
19206 RET
19207
19208// func emitCopy(dst []byte, offset int, length int) int
//
// emitCopy writes a copy operation for (offset, length) to dst and returns
// the number of bytes written. When the length exceeds what the first copy
// operation encodes (64, 60 or 8 bytes depending on the path), the rest is
// written with one of the inlined emitRepeat sequences.
//
// Register use: AX = dst write pointer, BX = bytes written, CX = offset,
// DX = remaining length. dst_len/dst_cap are never read. All comparisons on
// offset and length are 32-bit.
19209TEXT ·emitCopy(SB), NOSPLIT, $0-48
19210 XORQ BX, BX
19211 MOVQ dst_base+0(FP), AX
19212 MOVQ offset+24(FP), CX
19213 MOVQ length+32(FP), DX
19214
19215 // emitCopy
// Offsets below 65536 use the 2-byte-offset forms further down.
19216 CMPL CX, $0x00010000
19217 JB two_byte_offset_standalone
// offset >= 65536: a single 5-byte copy is enough for length <= 64.
19218 CMPL DX, $0x40
19219 JBE four_bytes_remain_standalone
// Emit a 5-byte copy of 64 bytes (tag 0xff + 32-bit offset), then continue
// with the remainder: another 5-byte copy if fewer than 4 bytes are left,
// a repeat otherwise.
19220 MOVB $0xff, (AX)
19221 MOVL CX, 1(AX)
19222 LEAL -64(DX), DX
19223 ADDQ $0x05, BX
19224 ADDQ $0x05, AX
19225 CMPL DX, $0x04
19226 JB four_bytes_remain_standalone
19227
19228 // emitRepeat
// Same sequence as the standalone emitRepeat function: SI = remaining length,
// DX = n = length-4, then the shortest fitting form is chosen.
19229emit_repeat_again_standalone_emit_copy:
19230 MOVL DX, SI
19231 LEAL -4(DX), DX
19232 CMPL SI, $0x08
19233 JBE repeat_two_standalone_emit_copy
19234 CMPL SI, $0x0c
19235 JAE cant_repeat_two_offset_standalone_emit_copy
19236 CMPL CX, $0x00000800
19237 JB repeat_two_offset_standalone_emit_copy
19238
19239cant_repeat_two_offset_standalone_emit_copy:
19240 CMPL DX, $0x00000104
19241 JB repeat_three_standalone_emit_copy
19242 CMPL DX, $0x00010100
19243 JB repeat_four_standalone_emit_copy
19244 CMPL DX, $0x0100ffff
19245 JB repeat_five_standalone_emit_copy
// Too long for one repeat: emit the fixed bytes 1d 00 fb ff ff and loop.
19246 LEAL -16842747(DX), DX
19247 MOVL $0xfffb001d, (AX)
19248 MOVB $0xff, 4(AX)
19249 ADDQ $0x05, AX
19250 ADDQ $0x05, BX
19251 JMP emit_repeat_again_standalone_emit_copy
19252
19253repeat_five_standalone_emit_copy:
// 1d 00 + (n-65536) as 24 bits; CX is reused as scratch.
19254 LEAL -65536(DX), DX
19255 MOVL DX, CX
19256 MOVW $0x001d, (AX)
19257 MOVW DX, 2(AX)
19258 SARL $0x10, CX
19259 MOVB CL, 4(AX)
19260 ADDQ $0x05, BX
19261 ADDQ $0x05, AX
19262 JMP gen_emit_copy_end
19263
19264repeat_four_standalone_emit_copy:
// 19 00 + (n-256) as 16 bits.
19265 LEAL -256(DX), DX
19266 MOVW $0x0019, (AX)
19267 MOVW DX, 2(AX)
19268 ADDQ $0x04, BX
19269 ADDQ $0x04, AX
19270 JMP gen_emit_copy_end
19271
19272repeat_three_standalone_emit_copy:
// 15 00 + (n-4) as one byte.
19273 LEAL -4(DX), DX
19274 MOVW $0x0015, (AX)
19275 MOVB DL, 2(AX)
19276 ADDQ $0x03, BX
19277 ADDQ $0x03, AX
19278 JMP gen_emit_copy_end
19279
19280repeat_two_standalone_emit_copy:
// 16-bit store of (n<<2)|1.
19281 SHLL $0x02, DX
19282 ORL $0x01, DX
19283 MOVW DX, (AX)
19284 ADDQ $0x02, BX
19285 ADDQ $0x02, AX
19286 JMP gen_emit_copy_end
19287
19288repeat_two_offset_standalone_emit_copy:
// byte 0 = ((offset>>8)<<5) | (n<<2) | 1, byte 1 = low 8 bits of offset.
19289 XORQ SI, SI
19290 LEAL 1(SI)(DX*4), DX
19291 MOVB CL, 1(AX)
19292 SARL $0x08, CX
19293 SHLL $0x05, CX
19294 ORL CX, DX
19295 MOVB DL, (AX)
19296 ADDQ $0x02, BX
19297 ADDQ $0x02, AX
19298 JMP gen_emit_copy_end
19299
19300four_bytes_remain_standalone:
// Nothing left: done. Otherwise write a 5-byte copy whose tag is
// 4*length-1, i.e. ((length-1)<<2)|3, followed by the 32-bit offset.
19301 TESTL DX, DX
19302 JZ gen_emit_copy_end
19303 XORL SI, SI
19304 LEAL -1(SI)(DX*4), DX
19305 MOVB DL, (AX)
19306 MOVL CX, 1(AX)
19307 ADDQ $0x05, BX
19308 ADDQ $0x05, AX
19309 JMP gen_emit_copy_end
19310
19311two_byte_offset_standalone:
// offset < 65536. length <= 64 fits one copy; longer lengths start with a
// copy and continue with a repeat. The form of that first copy depends on
// whether offset < 2048.
19312 CMPL DX, $0x40
19313 JBE two_byte_offset_short_standalone
19314 CMPL CX, $0x00000800
19315 JAE long_offset_short_standalone
// length > 64 and offset < 2048: write a 2-byte copy with tag base 17
// (1 | 4<<2) combined with (offset>>8)<<5, and the low offset byte in byte 1.
// 8 is subtracted from the remaining length afterwards.
19316 MOVL $0x00000001, SI
19317 LEAL 16(SI), SI
19318 MOVB CL, 1(AX)
19319 MOVL CX, DI
19320 SHRL $0x08, DI
19321 SHLL $0x05, DI
19322 ORL DI, SI
19323 MOVB SI, (AX)
19324 ADDQ $0x02, BX
19325 ADDQ $0x02, AX
19326 SUBL $0x08, DX
19327
19328 // emitRepeat
// Enters the repeat sequence past its two short-form checks: DX becomes
// n = length-4 here and control jumps straight to the 3/4/5-byte selection.
19329 LEAL -4(DX), DX
19330 JMP cant_repeat_two_offset_standalone_emit_copy_short_2b
19331
19332emit_repeat_again_standalone_emit_copy_short_2b:
19333 MOVL DX, SI
19334 LEAL -4(DX), DX
19335 CMPL SI, $0x08
19336 JBE repeat_two_standalone_emit_copy_short_2b
19337 CMPL SI, $0x0c
19338 JAE cant_repeat_two_offset_standalone_emit_copy_short_2b
19339 CMPL CX, $0x00000800
19340 JB repeat_two_offset_standalone_emit_copy_short_2b
19341
19342cant_repeat_two_offset_standalone_emit_copy_short_2b:
19343 CMPL DX, $0x00000104
19344 JB repeat_three_standalone_emit_copy_short_2b
19345 CMPL DX, $0x00010100
19346 JB repeat_four_standalone_emit_copy_short_2b
19347 CMPL DX, $0x0100ffff
19348 JB repeat_five_standalone_emit_copy_short_2b
19349 LEAL -16842747(DX), DX
19350 MOVL $0xfffb001d, (AX)
19351 MOVB $0xff, 4(AX)
19352 ADDQ $0x05, AX
19353 ADDQ $0x05, BX
19354 JMP emit_repeat_again_standalone_emit_copy_short_2b
19355
19356repeat_five_standalone_emit_copy_short_2b:
19357 LEAL -65536(DX), DX
19358 MOVL DX, CX
19359 MOVW $0x001d, (AX)
19360 MOVW DX, 2(AX)
19361 SARL $0x10, CX
19362 MOVB CL, 4(AX)
19363 ADDQ $0x05, BX
19364 ADDQ $0x05, AX
19365 JMP gen_emit_copy_end
19366
19367repeat_four_standalone_emit_copy_short_2b:
19368 LEAL -256(DX), DX
19369 MOVW $0x0019, (AX)
19370 MOVW DX, 2(AX)
19371 ADDQ $0x04, BX
19372 ADDQ $0x04, AX
19373 JMP gen_emit_copy_end
19374
19375repeat_three_standalone_emit_copy_short_2b:
19376 LEAL -4(DX), DX
19377 MOVW $0x0015, (AX)
19378 MOVB DL, 2(AX)
19379 ADDQ $0x03, BX
19380 ADDQ $0x03, AX
19381 JMP gen_emit_copy_end
19382
19383repeat_two_standalone_emit_copy_short_2b:
19384 SHLL $0x02, DX
19385 ORL $0x01, DX
19386 MOVW DX, (AX)
19387 ADDQ $0x02, BX
19388 ADDQ $0x02, AX
19389 JMP gen_emit_copy_end
19390
19391repeat_two_offset_standalone_emit_copy_short_2b:
19392 XORQ SI, SI
19393 LEAL 1(SI)(DX*4), DX
19394 MOVB CL, 1(AX)
19395 SARL $0x08, CX
19396 SHLL $0x05, CX
19397 ORL CX, DX
19398 MOVB DL, (AX)
19399 ADDQ $0x02, BX
19400 ADDQ $0x02, AX
19401 JMP gen_emit_copy_end
19402
19403long_offset_short_standalone:
// length > 64 and 2048 <= offset < 65536: write a 3-byte copy (tag 0xee +
// 16-bit offset), subtract 60 from the remaining length and continue with a
// repeat.
19404 MOVB $0xee, (AX)
19405 MOVW CX, 1(AX)
19406 LEAL -60(DX), DX
19407 ADDQ $0x03, AX
19408 ADDQ $0x03, BX
19409
19410 // emitRepeat
19411emit_repeat_again_standalone_emit_copy_short:
19412 MOVL DX, SI
19413 LEAL -4(DX), DX
19414 CMPL SI, $0x08
19415 JBE repeat_two_standalone_emit_copy_short
19416 CMPL SI, $0x0c
19417 JAE cant_repeat_two_offset_standalone_emit_copy_short
19418 CMPL CX, $0x00000800
19419 JB repeat_two_offset_standalone_emit_copy_short
19420
19421cant_repeat_two_offset_standalone_emit_copy_short:
19422 CMPL DX, $0x00000104
19423 JB repeat_three_standalone_emit_copy_short
19424 CMPL DX, $0x00010100
19425 JB repeat_four_standalone_emit_copy_short
19426 CMPL DX, $0x0100ffff
19427 JB repeat_five_standalone_emit_copy_short
19428 LEAL -16842747(DX), DX
19429 MOVL $0xfffb001d, (AX)
19430 MOVB $0xff, 4(AX)
19431 ADDQ $0x05, AX
19432 ADDQ $0x05, BX
19433 JMP emit_repeat_again_standalone_emit_copy_short
19434
19435repeat_five_standalone_emit_copy_short:
19436 LEAL -65536(DX), DX
19437 MOVL DX, CX
19438 MOVW $0x001d, (AX)
19439 MOVW DX, 2(AX)
19440 SARL $0x10, CX
19441 MOVB CL, 4(AX)
19442 ADDQ $0x05, BX
19443 ADDQ $0x05, AX
19444 JMP gen_emit_copy_end
19445
19446repeat_four_standalone_emit_copy_short:
19447 LEAL -256(DX), DX
19448 MOVW $0x0019, (AX)
19449 MOVW DX, 2(AX)
19450 ADDQ $0x04, BX
19451 ADDQ $0x04, AX
19452 JMP gen_emit_copy_end
19453
19454repeat_three_standalone_emit_copy_short:
19455 LEAL -4(DX), DX
19456 MOVW $0x0015, (AX)
19457 MOVB DL, 2(AX)
19458 ADDQ $0x03, BX
19459 ADDQ $0x03, AX
19460 JMP gen_emit_copy_end
19461
19462repeat_two_standalone_emit_copy_short:
19463 SHLL $0x02, DX
19464 ORL $0x01, DX
19465 MOVW DX, (AX)
19466 ADDQ $0x02, BX
19467 ADDQ $0x02, AX
19468 JMP gen_emit_copy_end
19469
19470repeat_two_offset_standalone_emit_copy_short:
19471 XORQ SI, SI
19472 LEAL 1(SI)(DX*4), DX
19473 MOVB CL, 1(AX)
19474 SARL $0x08, CX
19475 SHLL $0x05, CX
19476 ORL CX, DX
19477 MOVB DL, (AX)
19478 ADDQ $0x02, BX
19479 ADDQ $0x02, AX
19480 JMP gen_emit_copy_end
19481
19482two_byte_offset_short_standalone:
// length <= 64 and offset < 65536. SI = length*4.
//   length < 12 and offset < 2048 -> 2 bytes: tag 4*length-15, i.e.
//     ((length-4)<<2)|1, combined with (offset>>8)<<5; byte 1 = low offset byte
//   otherwise                     -> 3 bytes (emit_copy_three_standalone)
19483 MOVL DX, SI
19484 SHLL $0x02, SI
19485 CMPL DX, $0x0c
19486 JAE emit_copy_three_standalone
19487 CMPL CX, $0x00000800
19488 JAE emit_copy_three_standalone
19489 LEAL -15(SI), SI
19490 MOVB CL, 1(AX)
19491 SHRL $0x08, CX
19492 SHLL $0x05, CX
19493 ORL CX, SI
19494 MOVB SI, (AX)
19495 ADDQ $0x02, BX
19496 ADDQ $0x02, AX
19497 JMP gen_emit_copy_end
19498
19499emit_copy_three_standalone:
// 3 bytes: tag 4*length-2, i.e. ((length-1)<<2)|2, then the 16-bit offset.
19500 LEAL -2(SI), SI
19501 MOVB SI, (AX)
19502 MOVW CX, 1(AX)
19503 ADDQ $0x03, BX
19504 ADDQ $0x03, AX
19505
19506gen_emit_copy_end:
// Return the number of bytes written.
19507 MOVQ BX, ret+40(FP)
19508 RET
19509
19510// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat writes copy operations for (offset, length) to dst and
// returns the number of bytes written. Unlike emitCopy it never emits repeat
// operations: lengths above 64 are split into several copies that each carry
// the full offset.
//
// Register use: AX = dst write pointer, BX = bytes written, CX = offset,
// DX = remaining length. dst_len/dst_cap are never read.
19511TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
19512 XORQ BX, BX
19513 MOVQ dst_base+0(FP), AX
19514 MOVQ offset+24(FP), CX
19515 MOVQ length+32(FP), DX
19516
19517 // emitCopy
// Offsets below 65536 use the 2-byte-offset forms further down.
19518 CMPL CX, $0x00010000
19519 JB two_byte_offset_standalone_snappy
19520
19521four_bytes_loop_back_standalone_snappy:
// offset >= 65536: while more than 64 bytes remain, emit a 5-byte copy of
// 64 bytes (tag 0xff + 32-bit offset). The loop also exits as soon as fewer
// than 4 bytes are left after such a copy.
19522 CMPL DX, $0x40
19523 JBE four_bytes_remain_standalone_snappy
19524 MOVB $0xff, (AX)
19525 MOVL CX, 1(AX)
19526 LEAL -64(DX), DX
19527 ADDQ $0x05, BX
19528 ADDQ $0x05, AX
19529 CMPL DX, $0x04
19530 JB four_bytes_remain_standalone_snappy
19531 JMP four_bytes_loop_back_standalone_snappy
19532
19533four_bytes_remain_standalone_snappy:
// Nothing left: done. Otherwise a final 5-byte copy with tag 4*length-1,
// i.e. ((length-1)<<2)|3, followed by the 32-bit offset.
19534 TESTL DX, DX
19535 JZ gen_emit_copy_end_snappy
19536 XORL SI, SI
19537 LEAL -1(SI)(DX*4), DX
19538 MOVB DL, (AX)
19539 MOVL CX, 1(AX)
19540 ADDQ $0x05, BX
19541 ADDQ $0x05, AX
19542 JMP gen_emit_copy_end_snappy
19543
19544two_byte_offset_standalone_snappy:
// offset < 65536: while more than 64 bytes remain, emit a 3-byte copy
// (tag 0xee + 16-bit offset) and subtract 60 from the remaining length.
19545 CMPL DX, $0x40
19546 JBE two_byte_offset_short_standalone_snappy
19547 MOVB $0xee, (AX)
19548 MOVW CX, 1(AX)
19549 LEAL -60(DX), DX
19550 ADDQ $0x03, AX
19551 ADDQ $0x03, BX
19552 JMP two_byte_offset_standalone_snappy
19553
19554two_byte_offset_short_standalone_snappy:
// Final copy of at most 64 bytes. SI = length*4.
//   length < 12 and offset < 2048 -> 2 bytes: tag 4*length-15, i.e.
//     ((length-4)<<2)|1, combined with (offset>>8)<<5; byte 1 = low offset byte
//   otherwise                     -> 3 bytes (emit_copy_three_standalone_snappy)
19555 MOVL DX, SI
19556 SHLL $0x02, SI
19557 CMPL DX, $0x0c
19558 JAE emit_copy_three_standalone_snappy
19559 CMPL CX, $0x00000800
19560 JAE emit_copy_three_standalone_snappy
19561 LEAL -15(SI), SI
19562 MOVB CL, 1(AX)
19563 SHRL $0x08, CX
19564 SHLL $0x05, CX
19565 ORL CX, SI
19566 MOVB SI, (AX)
19567 ADDQ $0x02, BX
19568 ADDQ $0x02, AX
19569 JMP gen_emit_copy_end_snappy
19570
19571emit_copy_three_standalone_snappy:
// 3 bytes: tag 4*length-2, i.e. ((length-1)<<2)|2, then the 16-bit offset.
19572 LEAL -2(SI), SI
19573 MOVB SI, (AX)
19574 MOVW CX, 1(AX)
19575 ADDQ $0x03, BX
19576 ADDQ $0x03, AX
19577
19578gen_emit_copy_end_snappy:
// Return the number of bytes written.
19579 MOVQ BX, ret+40(FP)
19580 RET
19581
19582// func matchLen(a []byte, b []byte) int
19583// Requires: BMI
//
// matchLen returns the number of leading bytes that are equal in a and b.
// Only a_len is read; b_len is never consulted, so b is read at every index
// that is read from a.
// NOTE(review): this presumably requires len(b) >= len(a) - confirm with
// callers.
//
// Register use: AX = a base, CX = b base, DX = bytes of a left to compare
// (32-bit compares), SI = number of matching bytes so far, also used as the
// index into both slices.
19584TEXT ·matchLen(SB), NOSPLIT, $0-56
19585 MOVQ a_base+0(FP), AX
19586 MOVQ b_base+24(FP), CX
19587 MOVQ a_len+8(FP), DX
19588
19589 // matchLen
19590 XORL SI, SI
19591
19592matchlen_loopback_16_standalone:
// Compare 16 bytes per iteration as two 8-byte words. XOR leaves a non-zero
// value in BX/DI exactly when the corresponding word differs.
19593 CMPL DX, $0x10
19594 JB matchlen_match8_standalone
19595 MOVQ (AX)(SI*1), BX
19596 MOVQ 8(AX)(SI*1), DI
19597 XORQ (CX)(SI*1), BX
19598 JNZ matchlen_bsf_8_standalone
19599 XORQ 8(CX)(SI*1), DI
19600 JNZ matchlen_bsf_16standalone
19601 LEAL -16(DX), DX
19602 LEAL 16(SI), SI
19603 JMP matchlen_loopback_16_standalone
19604
19605matchlen_bsf_16standalone:
// Difference is in the second word: the index of the lowest set bit divided
// by 8 is the number of equal bytes in that word (loads are little-endian),
// plus 8 for the first word.
19606#ifdef GOAMD64_v3
19607 TZCNTQ DI, DI
19608
19609#else
19610 BSFQ DI, DI
19611
19612#endif
19613 SARQ $0x03, DI
19614 LEAL 8(SI)(DI*1), SI
19615 JMP gen_match_len_end
19616
19617matchlen_match8_standalone:
// Fewer than 16 bytes left: try one more 8-byte word.
19618 CMPL DX, $0x08
19619 JB matchlen_match4_standalone
19620 MOVQ (AX)(SI*1), BX
19621 XORQ (CX)(SI*1), BX
19622 JNZ matchlen_bsf_8_standalone
19623 LEAL -8(DX), DX
19624 LEAL 8(SI), SI
19625 JMP matchlen_match4_standalone
19626
19627matchlen_bsf_8_standalone:
// Difference is in the word held in BX: add the count of equal low bytes.
19628#ifdef GOAMD64_v3
19629 TZCNTQ BX, BX
19630
19631#else
19632 BSFQ BX, BX
19633
19634#endif
19635 SARQ $0x03, BX
19636 LEAL (SI)(BX*1), SI
19637 JMP gen_match_len_end
19638
19639matchlen_match4_standalone:
// Tail handling in steps of 4, 2 and 1 bytes. A mismatch at one width falls
// through to the next narrower compare to locate the first differing byte.
19640 CMPL DX, $0x04
19641 JB matchlen_match2_standalone
19642 MOVL (AX)(SI*1), BX
19643 CMPL (CX)(SI*1), BX
19644 JNE matchlen_match2_standalone
19645 LEAL -4(DX), DX
19646 LEAL 4(SI), SI
19647
19648matchlen_match2_standalone:
// DX == 1: single byte compare. DX == 0: done. Otherwise compare 2 bytes.
19649 CMPL DX, $0x01
19650 JE matchlen_match1_standalone
19651 JB gen_match_len_end
19652 MOVW (AX)(SI*1), BX
19653 CMPW (CX)(SI*1), BX
19654 JNE matchlen_match1_standalone
19655 LEAL 2(SI), SI
19656 SUBL $0x02, DX
19657 JZ gen_match_len_end
19658
19659matchlen_match1_standalone:
19660 MOVB (AX)(SI*1), BL
19661 CMPB (CX)(SI*1), BL
19662 JNE gen_match_len_end
19663 LEAL 1(SI), SI
19664
19665gen_match_len_end:
// Return the match length.
19666 MOVQ SI, ret+48(FP)
19667 RET
19668
19669// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
19670// Requires: SSE2
//
// cvtLZ4BlockAsm walks the LZ4 sequences in src (token, literals, 16-bit
// offset, match length) and re-emits them into dst as literal, copy and
// repeat operations, using the same encodings as the standalone emitLiteral,
// emitCopy and emitRepeat routines.
//
// Results:
//   success:  uncompressed = sum of all literal and match lengths,
//             dstUsed = number of bytes written to dst
//   corrupt:  uncompressed = -1 (lz4_s2_corrupt)
//   dst full: uncompressed = -2 (lz4_s2_dstfull)
// On both error paths only uncompressed is stored; dstUsed is not written.
//
// Register use: AX = dst write pointer, CX = dst limit (dst_base+dst_len-10),
// DX = src read pointer, BX = src end, SI = uncompressed byte count,
// DI = offset of the previous match (0 initially), R9 = literal length and
// later the match offset, R10 = match length.
19671TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
19672 XORQ SI, SI
19673 MOVQ dst_base+0(FP), AX
19674 MOVQ dst_len+8(FP), CX
19675 MOVQ src_base+24(FP), DX
19676 MOVQ src_len+32(FP), BX
19677 LEAQ (DX)(BX*1), BX
19678 LEAQ -10(AX)(CX*1), CX
19679 XORQ DI, DI
19680
19681lz4_s2_loop:
// Start of a sequence: bail out if src is exhausted or dst reached its limit,
// then split the token byte into R9 = high nibble (literal length) and
// R10 = low nibble (match length).
19682 CMPQ DX, BX
19683 JAE lz4_s2_corrupt
19684 CMPQ AX, CX
19685 JAE lz4_s2_dstfull
19686 MOVBQZX (DX), R8
19687 MOVQ R8, R9
19688 MOVQ R8, R10
19689 SHRQ $0x04, R9
19690 ANDQ $0x0f, R10
// token < 0xf0 means the literal-length nibble is below 15: no extra bytes.
19691 CMPQ R8, $0xf0
19692 JB lz4_s2_ll_end
19693
19694lz4_s2_ll_loop:
// Extended literal length: keep adding bytes while the byte read is 0xff.
19695 INCQ DX
19696 CMPQ DX, BX
19697 JAE lz4_s2_corrupt
19698 MOVBQZX (DX), R8
19699 ADDQ R8, R9
19700 CMPQ R8, $0xff
19701 JEQ lz4_s2_ll_loop
19702
19703lz4_s2_ll_end:
// DX still points at the last length byte (or the token). R8 = DX+R9 is the
// address of the last literal byte and must lie inside src. R10 gets +4
// added to the match-length nibble. Afterwards DX = first literal byte and
// R8 = first byte after the literals.
19704 LEAQ (DX)(R9*1), R8
19705 ADDQ $0x04, R10
19706 CMPQ R8, BX
19707 JAE lz4_s2_corrupt
19708 INCQ DX
19709 INCQ R8
19710 TESTQ R9, R9
19711 JZ lz4_s2_lits_done
// The literals must end below the dst limit.
19712 LEAQ (AX)(R9*1), R11
19713 CMPQ R11, CX
19714 JAE lz4_s2_dstfull
19715 ADDQ R9, SI
// Literal header, chosen from n = R9-1 exactly as in emitLiteral:
// 1 byte (n<<2) for n < 60, otherwise tag 0xf0/0xf4/0xf8/0xfc followed by
// n as 1/2/3/4 bytes.
19716 LEAL -1(R9), R11
19717 CMPL R11, $0x3c
19718 JB one_byte_lz4_s2
19719 CMPL R11, $0x00000100
19720 JB two_bytes_lz4_s2
19721 CMPL R11, $0x00010000
19722 JB three_bytes_lz4_s2
19723 CMPL R11, $0x01000000
19724 JB four_bytes_lz4_s2
19725 MOVB $0xfc, (AX)
19726 MOVL R11, 1(AX)
19727 ADDQ $0x05, AX
19728 JMP memmove_long_lz4_s2
19729
19730four_bytes_lz4_s2:
19731 MOVL R11, R12
19732 SHRL $0x10, R12
19733 MOVB $0xf8, (AX)
19734 MOVW R11, 1(AX)
19735 MOVB R12, 3(AX)
19736 ADDQ $0x04, AX
19737 JMP memmove_long_lz4_s2
19738
19739three_bytes_lz4_s2:
19740 MOVB $0xf4, (AX)
19741 MOVW R11, 1(AX)
19742 ADDQ $0x03, AX
19743 JMP memmove_long_lz4_s2
19744
19745two_bytes_lz4_s2:
19746 MOVB $0xf0, (AX)
19747 MOVB R11, 1(AX)
19748 ADDQ $0x02, AX
// Only n < 64 (at most 64 literal bytes) may use the short copy.
19749 CMPL R11, $0x40
19750 JB memmove_lz4_s2
19751 JMP memmove_long_lz4_s2
19752
19753one_byte_lz4_s2:
19754 SHLB $0x02, R11
19755 MOVB R11, (AX)
19756 ADDQ $0x01, AX
19757
19758memmove_lz4_s2:
// R11 = dst pointer after the literals; AX is set to it once the copy is done.
19759 LEAQ (AX)(R9*1), R11
19760
19761 // genMemMoveShort
// Copy R9 (1..64) literal bytes from (DX) to (AX).
19762 CMPQ R9, $0x08
19763 JBE emit_lit_memmove_lz4_s2_memmove_move_8
19764 CMPQ R9, $0x10
19765 JBE emit_lit_memmove_lz4_s2_memmove_move_8through16
19766 CMPQ R9, $0x20
19767 JBE emit_lit_memmove_lz4_s2_memmove_move_17through32
19768 JMP emit_lit_memmove_lz4_s2_memmove_move_33through64
19769
19770emit_lit_memmove_lz4_s2_memmove_move_8:
// Always moves a full 8 bytes, even when R9 < 8; AX is then set from R11, so
// the extra bytes are overwritten by later output.
// NOTE(review): the 8-byte load can reach past the literals in src and the
// store past them in dst - presumably covered by the 10-byte dst margin and
// the layout of valid input; confirm.
19771 MOVQ (DX), R12
19772 MOVQ R12, (AX)
19773 JMP memmove_end_copy_lz4_s2
19774
19775emit_lit_memmove_lz4_s2_memmove_move_8through16:
// DX is clobbered here; it is reloaded from R8 at lz4_s2_lits_emit_done.
19776 MOVQ (DX), R12
19777 MOVQ -8(DX)(R9*1), DX
19778 MOVQ R12, (AX)
19779 MOVQ DX, -8(AX)(R9*1)
19780 JMP memmove_end_copy_lz4_s2
19781
19782emit_lit_memmove_lz4_s2_memmove_move_17through32:
19783 MOVOU (DX), X0
19784 MOVOU -16(DX)(R9*1), X1
19785 MOVOU X0, (AX)
19786 MOVOU X1, -16(AX)(R9*1)
19787 JMP memmove_end_copy_lz4_s2
19788
19789emit_lit_memmove_lz4_s2_memmove_move_33through64:
19790 MOVOU (DX), X0
19791 MOVOU 16(DX), X1
19792 MOVOU -32(DX)(R9*1), X2
19793 MOVOU -16(DX)(R9*1), X3
19794 MOVOU X0, (AX)
19795 MOVOU X1, 16(AX)
19796 MOVOU X2, -32(AX)(R9*1)
19797 MOVOU X3, -16(AX)(R9*1)
19798
19799memmove_end_copy_lz4_s2:
19800 MOVQ R11, AX
19801 JMP lz4_s2_lits_emit_done
19802
19803memmove_long_lz4_s2:
19804 LEAQ (AX)(R9*1), R11
19805
19806 // genMemMoveLong
// Bulk copy of R9 bytes from (DX) to (AX). The first and last 32 bytes are
// saved in X0-X3 and stored unaligned at the end; in between, 32 bytes are
// moved per iteration with aligned stores (MOVOA).
// R13 = R9/32, R12 = AX&31, R14 = 64-R12, so AX+R14-32 is 32-byte aligned.
19807 MOVOU (DX), X0
19808 MOVOU 16(DX), X1
19809 MOVOU -32(DX)(R9*1), X2
19810 MOVOU -16(DX)(R9*1), X3
19811 MOVQ R9, R13
19812 SHRQ $0x05, R13
19813 MOVQ AX, R12
19814 ANDL $0x0000001f, R12
19815 MOVQ $0x00000040, R14
19816 SUBQ R12, R14
// DECQ does not modify CF, so JA sees the CF left by SUBQ above together
// with the ZF produced by the decrement.
19817 DECQ R13
19818 JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
19819 LEAQ -32(DX)(R14*1), R12
19820 LEAQ -32(AX)(R14*1), R15
19821
19822emit_lit_memmove_long_lz4_s2large_big_loop_back:
19823 MOVOU (R12), X4
19824 MOVOU 16(R12), X5
19825 MOVOA X4, (R15)
19826 MOVOA X5, 16(R15)
19827 ADDQ $0x20, R15
19828 ADDQ $0x20, R12
19829 ADDQ $0x20, R14
19830 DECQ R13
19831 JNA emit_lit_memmove_long_lz4_s2large_big_loop_back
19832
19833emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
// Copy the 32 bytes ending at offset R14, advance by 32, repeat while
// R9 >= R14.
19834 MOVOU -32(DX)(R14*1), X4
19835 MOVOU -16(DX)(R14*1), X5
19836 MOVOA X4, -32(AX)(R14*1)
19837 MOVOA X5, -16(AX)(R14*1)
19838 ADDQ $0x20, R14
19839 CMPQ R9, R14
19840 JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
19841 MOVOU X0, (AX)
19842 MOVOU X1, 16(AX)
19843 MOVOU X2, -32(AX)(R9*1)
19844 MOVOU X3, -16(AX)(R9*1)
19845 MOVQ R11, AX
19846
19847lz4_s2_lits_emit_done:
// Advance src past the literals.
19848 MOVQ R8, DX
19849
19850lz4_s2_lits_done:
// Reaching the end of src exactly here ends the block. That is only accepted
// when the match-length nibble was 0 (R10 == 4 after the +4 above).
19851 CMPQ DX, BX
19852 JNE lz4_s2_match
19853 CMPQ R10, $0x04
19854 JEQ lz4_s2_done
19855 JMP lz4_s2_corrupt
19856
19857lz4_s2_match:
// Read the 16-bit offset into R9. It is rejected when the two offset bytes
// reach the end of src, when it is zero, or when it exceeds the number of
// bytes produced so far (SI).
19858 LEAQ 2(DX), R8
19859 CMPQ R8, BX
19860 JAE lz4_s2_corrupt
19861 MOVWQZX (DX), R9
19862 MOVQ R8, DX
19863 TESTQ R9, R9
19864 JZ lz4_s2_corrupt
19865 CMPQ R9, SI
19866 JA lz4_s2_corrupt
// R10 == 0x13 (nibble 15 + 4) means extension bytes follow.
19867 CMPQ R10, $0x13
19868 JNE lz4_s2_ml_done
19869
19870lz4_s2_ml_loop:
// Extended match length: keep adding bytes while the byte read is 0xff.
19871 MOVBQZX (DX), R8
19872 INCQ DX
19873 ADDQ R8, R10
19874 CMPQ DX, BX
19875 JAE lz4_s2_corrupt
19876 CMPQ R8, $0xff
19877 JEQ lz4_s2_ml_loop
19878
19879lz4_s2_ml_done:
// Account for the match. A match with the same offset as the previous one
// (DI) is written as a repeat, anything else as a copy.
19880 ADDQ R10, SI
19881 CMPQ R9, DI
19882 JNE lz4_s2_docopy
19883
19884 // emitRepeat
// Same form selection as the standalone emitRepeat, with R10 as length and
// R9 as offset; every terminal form jumps back to lz4_s2_loop.
19885emit_repeat_again_lz4_s2:
19886 MOVL R10, R8
19887 LEAL -4(R10), R10
19888 CMPL R8, $0x08
19889 JBE repeat_two_lz4_s2
19890 CMPL R8, $0x0c
19891 JAE cant_repeat_two_offset_lz4_s2
19892 CMPL R9, $0x00000800
19893 JB repeat_two_offset_lz4_s2
19894
19895cant_repeat_two_offset_lz4_s2:
19896 CMPL R10, $0x00000104
19897 JB repeat_three_lz4_s2
19898 CMPL R10, $0x00010100
19899 JB repeat_four_lz4_s2
19900 CMPL R10, $0x0100ffff
19901 JB repeat_five_lz4_s2
19902 LEAL -16842747(R10), R10
19903 MOVL $0xfffb001d, (AX)
19904 MOVB $0xff, 4(AX)
19905 ADDQ $0x05, AX
19906 JMP emit_repeat_again_lz4_s2
19907
19908repeat_five_lz4_s2:
19909 LEAL -65536(R10), R10
19910 MOVL R10, R9
19911 MOVW $0x001d, (AX)
19912 MOVW R10, 2(AX)
19913 SARL $0x10, R9
19914 MOVB R9, 4(AX)
19915 ADDQ $0x05, AX
19916 JMP lz4_s2_loop
19917
19918repeat_four_lz4_s2:
19919 LEAL -256(R10), R10
19920 MOVW $0x0019, (AX)
19921 MOVW R10, 2(AX)
19922 ADDQ $0x04, AX
19923 JMP lz4_s2_loop
19924
19925repeat_three_lz4_s2:
19926 LEAL -4(R10), R10
19927 MOVW $0x0015, (AX)
19928 MOVB R10, 2(AX)
19929 ADDQ $0x03, AX
19930 JMP lz4_s2_loop
19931
19932repeat_two_lz4_s2:
19933 SHLL $0x02, R10
19934 ORL $0x01, R10
19935 MOVW R10, (AX)
19936 ADDQ $0x02, AX
19937 JMP lz4_s2_loop
19938
19939repeat_two_offset_lz4_s2:
19940 XORQ R8, R8
19941 LEAL 1(R8)(R10*4), R10
19942 MOVB R9, 1(AX)
19943 SARL $0x08, R9
19944 SHLL $0x05, R9
19945 ORL R9, R10
19946 MOVB R10, (AX)
19947 ADDQ $0x02, AX
19948 JMP lz4_s2_loop
19949
19950lz4_s2_docopy:
// Remember this offset for the repeat check of the next match.
19951 MOVQ R9, DI
19952
19953 // emitCopy
// R9 was loaded as a 16-bit value, so only the 2-byte-offset forms appear:
//   length <= 64                 -> one 2- or 3-byte copy
//   length > 64, offset >= 2048  -> 3-byte copy (tag 0xee), length-60 as repeat
//   length > 64, offset < 2048   -> 2-byte copy (tag base 17), length-8 as repeat
19954 CMPL R10, $0x40
19955 JBE two_byte_offset_short_lz4_s2
19956 CMPL R9, $0x00000800
19957 JAE long_offset_short_lz4_s2
19958 MOVL $0x00000001, R8
19959 LEAL 16(R8), R8
19960 MOVB R9, 1(AX)
19961 MOVL R9, R11
19962 SHRL $0x08, R11
19963 SHLL $0x05, R11
19964 ORL R11, R8
19965 MOVB R8, (AX)
19966 ADDQ $0x02, AX
19967 SUBL $0x08, R10
19968
19969 // emitRepeat
// Enters the repeat sequence past its two short-form checks.
19970 LEAL -4(R10), R10
19971 JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
19972
19973emit_repeat_again_lz4_s2_emit_copy_short_2b:
19974 MOVL R10, R8
19975 LEAL -4(R10), R10
19976 CMPL R8, $0x08
19977 JBE repeat_two_lz4_s2_emit_copy_short_2b
19978 CMPL R8, $0x0c
19979 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
19980 CMPL R9, $0x00000800
19981 JB repeat_two_offset_lz4_s2_emit_copy_short_2b
19982
19983cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
19984 CMPL R10, $0x00000104
19985 JB repeat_three_lz4_s2_emit_copy_short_2b
19986 CMPL R10, $0x00010100
19987 JB repeat_four_lz4_s2_emit_copy_short_2b
19988 CMPL R10, $0x0100ffff
19989 JB repeat_five_lz4_s2_emit_copy_short_2b
19990 LEAL -16842747(R10), R10
19991 MOVL $0xfffb001d, (AX)
19992 MOVB $0xff, 4(AX)
19993 ADDQ $0x05, AX
19994 JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
19995
19996repeat_five_lz4_s2_emit_copy_short_2b:
19997 LEAL -65536(R10), R10
19998 MOVL R10, R9
19999 MOVW $0x001d, (AX)
20000 MOVW R10, 2(AX)
20001 SARL $0x10, R9
20002 MOVB R9, 4(AX)
20003 ADDQ $0x05, AX
20004 JMP lz4_s2_loop
20005
20006repeat_four_lz4_s2_emit_copy_short_2b:
20007 LEAL -256(R10), R10
20008 MOVW $0x0019, (AX)
20009 MOVW R10, 2(AX)
20010 ADDQ $0x04, AX
20011 JMP lz4_s2_loop
20012
20013repeat_three_lz4_s2_emit_copy_short_2b:
20014 LEAL -4(R10), R10
20015 MOVW $0x0015, (AX)
20016 MOVB R10, 2(AX)
20017 ADDQ $0x03, AX
20018 JMP lz4_s2_loop
20019
20020repeat_two_lz4_s2_emit_copy_short_2b:
20021 SHLL $0x02, R10
20022 ORL $0x01, R10
20023 MOVW R10, (AX)
20024 ADDQ $0x02, AX
20025 JMP lz4_s2_loop
20026
20027repeat_two_offset_lz4_s2_emit_copy_short_2b:
20028 XORQ R8, R8
20029 LEAL 1(R8)(R10*4), R10
20030 MOVB R9, 1(AX)
20031 SARL $0x08, R9
20032 SHLL $0x05, R9
20033 ORL R9, R10
20034 MOVB R10, (AX)
20035 ADDQ $0x02, AX
20036 JMP lz4_s2_loop
20037
20038long_offset_short_lz4_s2:
// 3-byte copy (tag 0xee + 16-bit offset); the remaining length-60 follows as
// a repeat.
20039 MOVB $0xee, (AX)
20040 MOVW R9, 1(AX)
20041 LEAL -60(R10), R10
20042 ADDQ $0x03, AX
20043
20044 // emitRepeat
20045emit_repeat_again_lz4_s2_emit_copy_short:
20046 MOVL R10, R8
20047 LEAL -4(R10), R10
20048 CMPL R8, $0x08
20049 JBE repeat_two_lz4_s2_emit_copy_short
20050 CMPL R8, $0x0c
20051 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
20052 CMPL R9, $0x00000800
20053 JB repeat_two_offset_lz4_s2_emit_copy_short
20054
20055cant_repeat_two_offset_lz4_s2_emit_copy_short:
20056 CMPL R10, $0x00000104
20057 JB repeat_three_lz4_s2_emit_copy_short
20058 CMPL R10, $0x00010100
20059 JB repeat_four_lz4_s2_emit_copy_short
20060 CMPL R10, $0x0100ffff
20061 JB repeat_five_lz4_s2_emit_copy_short
20062 LEAL -16842747(R10), R10
20063 MOVL $0xfffb001d, (AX)
20064 MOVB $0xff, 4(AX)
20065 ADDQ $0x05, AX
20066 JMP emit_repeat_again_lz4_s2_emit_copy_short
20067
20068repeat_five_lz4_s2_emit_copy_short:
20069 LEAL -65536(R10), R10
20070 MOVL R10, R9
20071 MOVW $0x001d, (AX)
20072 MOVW R10, 2(AX)
20073 SARL $0x10, R9
20074 MOVB R9, 4(AX)
20075 ADDQ $0x05, AX
20076 JMP lz4_s2_loop
20077
20078repeat_four_lz4_s2_emit_copy_short:
20079 LEAL -256(R10), R10
20080 MOVW $0x0019, (AX)
20081 MOVW R10, 2(AX)
20082 ADDQ $0x04, AX
20083 JMP lz4_s2_loop
20084
20085repeat_three_lz4_s2_emit_copy_short:
20086 LEAL -4(R10), R10
20087 MOVW $0x0015, (AX)
20088 MOVB R10, 2(AX)
20089 ADDQ $0x03, AX
20090 JMP lz4_s2_loop
20091
20092repeat_two_lz4_s2_emit_copy_short:
20093 SHLL $0x02, R10
20094 ORL $0x01, R10
20095 MOVW R10, (AX)
20096 ADDQ $0x02, AX
20097 JMP lz4_s2_loop
20098
20099repeat_two_offset_lz4_s2_emit_copy_short:
20100 XORQ R8, R8
20101 LEAL 1(R8)(R10*4), R10
20102 MOVB R9, 1(AX)
20103 SARL $0x08, R9
20104 SHLL $0x05, R9
20105 ORL R9, R10
20106 MOVB R10, (AX)
20107 ADDQ $0x02, AX
20108 JMP lz4_s2_loop
20109
20110two_byte_offset_short_lz4_s2:
// length <= 64. R8 = length*4.
//   length < 12 and offset < 2048 -> 2 bytes: tag 4*length-15, i.e.
//     ((length-4)<<2)|1, combined with (offset>>8)<<5; byte 1 = low offset byte
//   otherwise                     -> 3 bytes (emit_copy_three_lz4_s2)
20111 MOVL R10, R8
20112 SHLL $0x02, R8
20113 CMPL R10, $0x0c
20114 JAE emit_copy_three_lz4_s2
20115 CMPL R9, $0x00000800
20116 JAE emit_copy_three_lz4_s2
20117 LEAL -15(R8), R8
20118 MOVB R9, 1(AX)
20119 SHRL $0x08, R9
20120 SHLL $0x05, R9
20121 ORL R9, R8
20122 MOVB R8, (AX)
20123 ADDQ $0x02, AX
20124 JMP lz4_s2_loop
20125
20126emit_copy_three_lz4_s2:
// 3 bytes: tag 4*length-2, i.e. ((length-1)<<2)|2, then the 16-bit offset.
20127 LEAL -2(R8), R8
20128 MOVB R8, (AX)
20129 MOVW R9, 1(AX)
20130 ADDQ $0x03, AX
20131 JMP lz4_s2_loop
20132
20133lz4_s2_done:
// Success: dstUsed = AX - dst_base, uncompressed = SI.
20134 MOVQ dst_base+0(FP), CX
20135 SUBQ CX, AX
20136 MOVQ SI, uncompressed+48(FP)
20137 MOVQ AX, dstUsed+56(FP)
20138 RET
20139
20140lz4_s2_corrupt:
// uncompressed = -1 (built as 0-1 via XORQ/LEAQ); dstUsed is left unwritten.
20141 XORQ AX, AX
20142 LEAQ -1(AX), SI
20143 MOVQ SI, uncompressed+48(FP)
20144 RET
20145
20146lz4_s2_dstfull:
// uncompressed = -2; dstUsed is left unwritten.
20147 XORQ AX, AX
20148 LEAQ -2(AX), SI
20149 MOVQ SI, uncompressed+48(FP)
20150 RET
20151
20152// func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
20153// Requires: SSE2
20154TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
20155 XORQ SI, SI
20156 MOVQ dst_base+0(FP), AX
20157 MOVQ dst_len+8(FP), CX
20158 MOVQ src_base+24(FP), DX
20159 MOVQ src_len+32(FP), BX
20160 LEAQ (DX)(BX*1), BX
20161 LEAQ -10(AX)(CX*1), CX
20162 XORQ DI, DI
20163
20164lz4s_s2_loop:
20165 CMPQ DX, BX
20166 JAE lz4s_s2_corrupt
20167 CMPQ AX, CX
20168 JAE lz4s_s2_dstfull
20169 MOVBQZX (DX), R8
20170 MOVQ R8, R9
20171 MOVQ R8, R10
20172 SHRQ $0x04, R9
20173 ANDQ $0x0f, R10
20174 CMPQ R8, $0xf0
20175 JB lz4s_s2_ll_end
20176
20177lz4s_s2_ll_loop:
20178 INCQ DX
20179 CMPQ DX, BX
20180 JAE lz4s_s2_corrupt
20181 MOVBQZX (DX), R8
20182 ADDQ R8, R9
20183 CMPQ R8, $0xff
20184 JEQ lz4s_s2_ll_loop
20185
20186lz4s_s2_ll_end:
20187 LEAQ (DX)(R9*1), R8
20188 ADDQ $0x03, R10
20189 CMPQ R8, BX
20190 JAE lz4s_s2_corrupt
20191 INCQ DX
20192 INCQ R8
20193 TESTQ R9, R9
20194 JZ lz4s_s2_lits_done
20195 LEAQ (AX)(R9*1), R11
20196 CMPQ R11, CX
20197 JAE lz4s_s2_dstfull
20198 ADDQ R9, SI
20199 LEAL -1(R9), R11
20200 CMPL R11, $0x3c
20201 JB one_byte_lz4s_s2
20202 CMPL R11, $0x00000100
20203 JB two_bytes_lz4s_s2
20204 CMPL R11, $0x00010000
20205 JB three_bytes_lz4s_s2
20206 CMPL R11, $0x01000000
20207 JB four_bytes_lz4s_s2
20208 MOVB $0xfc, (AX)
20209 MOVL R11, 1(AX)
20210 ADDQ $0x05, AX
20211 JMP memmove_long_lz4s_s2
20212
20213four_bytes_lz4s_s2:
20214 MOVL R11, R12
20215 SHRL $0x10, R12
20216 MOVB $0xf8, (AX)
20217 MOVW R11, 1(AX)
20218 MOVB R12, 3(AX)
20219 ADDQ $0x04, AX
20220 JMP memmove_long_lz4s_s2
20221
20222three_bytes_lz4s_s2:
20223 MOVB $0xf4, (AX)
20224 MOVW R11, 1(AX)
20225 ADDQ $0x03, AX
20226 JMP memmove_long_lz4s_s2
20227
20228two_bytes_lz4s_s2:
20229 MOVB $0xf0, (AX)
20230 MOVB R11, 1(AX)
20231 ADDQ $0x02, AX
20232 CMPL R11, $0x40
20233 JB memmove_lz4s_s2
20234 JMP memmove_long_lz4s_s2
20235
20236one_byte_lz4s_s2:
20237 SHLB $0x02, R11
20238 MOVB R11, (AX)
20239 ADDQ $0x01, AX
20240
20241memmove_lz4s_s2:
20242 LEAQ (AX)(R9*1), R11
20243
20244 // genMemMoveShort
20245 CMPQ R9, $0x08
20246 JBE emit_lit_memmove_lz4s_s2_memmove_move_8
20247 CMPQ R9, $0x10
20248 JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16
20249 CMPQ R9, $0x20
20250 JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32
20251 JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64
20252
20253emit_lit_memmove_lz4s_s2_memmove_move_8:
20254 MOVQ (DX), R12
20255 MOVQ R12, (AX)
20256 JMP memmove_end_copy_lz4s_s2
20257
20258emit_lit_memmove_lz4s_s2_memmove_move_8through16:
20259 MOVQ (DX), R12
20260 MOVQ -8(DX)(R9*1), DX
20261 MOVQ R12, (AX)
20262 MOVQ DX, -8(AX)(R9*1)
20263 JMP memmove_end_copy_lz4s_s2
20264
20265emit_lit_memmove_lz4s_s2_memmove_move_17through32:
20266 MOVOU (DX), X0
20267 MOVOU -16(DX)(R9*1), X1
20268 MOVOU X0, (AX)
20269 MOVOU X1, -16(AX)(R9*1)
20270 JMP memmove_end_copy_lz4s_s2
20271
20272emit_lit_memmove_lz4s_s2_memmove_move_33through64:
20273 MOVOU (DX), X0
20274 MOVOU 16(DX), X1
20275 MOVOU -32(DX)(R9*1), X2
20276 MOVOU -16(DX)(R9*1), X3
20277 MOVOU X0, (AX)
20278 MOVOU X1, 16(AX)
20279 MOVOU X2, -32(AX)(R9*1)
20280 MOVOU X3, -16(AX)(R9*1)
20281
20282memmove_end_copy_lz4s_s2:
20283 MOVQ R11, AX
20284 JMP lz4s_s2_lits_emit_done
20285
20286memmove_long_lz4s_s2:
20287 LEAQ (AX)(R9*1), R11
20288
20289 // genMemMoveLong
20290 MOVOU (DX), X0
20291 MOVOU 16(DX), X1
20292 MOVOU -32(DX)(R9*1), X2
20293 MOVOU -16(DX)(R9*1), X3
20294 MOVQ R9, R13
20295 SHRQ $0x05, R13
20296 MOVQ AX, R12
20297 ANDL $0x0000001f, R12
20298 MOVQ $0x00000040, R14
20299 SUBQ R12, R14
20300 DECQ R13
20301 JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
20302 LEAQ -32(DX)(R14*1), R12
20303 LEAQ -32(AX)(R14*1), R15
20304
20305emit_lit_memmove_long_lz4s_s2large_big_loop_back:
20306 MOVOU (R12), X4
20307 MOVOU 16(R12), X5
20308 MOVOA X4, (R15)
20309 MOVOA X5, 16(R15)
20310 ADDQ $0x20, R15
20311 ADDQ $0x20, R12
20312 ADDQ $0x20, R14
20313 DECQ R13
20314 JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back
20315
20316emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
20317 MOVOU -32(DX)(R14*1), X4
20318 MOVOU -16(DX)(R14*1), X5
20319 MOVOA X4, -32(AX)(R14*1)
20320 MOVOA X5, -16(AX)(R14*1)
20321 ADDQ $0x20, R14
20322 CMPQ R9, R14
20323 JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
20324 MOVOU X0, (AX)
20325 MOVOU X1, 16(AX)
20326 MOVOU X2, -32(AX)(R9*1)
20327 MOVOU X3, -16(AX)(R9*1)
20328 MOVQ R11, AX
20329
20330lz4s_s2_lits_emit_done:
20331 MOVQ R8, DX
20332
20333lz4s_s2_lits_done:
20334 CMPQ DX, BX
20335 JNE lz4s_s2_match
20336 CMPQ R10, $0x03
20337 JEQ lz4s_s2_done
20338 JMP lz4s_s2_corrupt
20339
20340lz4s_s2_match:
20341 CMPQ R10, $0x03
20342 JEQ lz4s_s2_loop
20343 LEAQ 2(DX), R8
20344 CMPQ R8, BX
20345 JAE lz4s_s2_corrupt
20346 MOVWQZX (DX), R9
20347 MOVQ R8, DX
20348 TESTQ R9, R9
20349 JZ lz4s_s2_corrupt
20350 CMPQ R9, SI
20351 JA lz4s_s2_corrupt
20352 CMPQ R10, $0x12
20353 JNE lz4s_s2_ml_done
20354
20355lz4s_s2_ml_loop:
20356 MOVBQZX (DX), R8
20357 INCQ DX
20358 ADDQ R8, R10
20359 CMPQ DX, BX
20360 JAE lz4s_s2_corrupt
20361 CMPQ R8, $0xff
20362 JEQ lz4s_s2_ml_loop
20363
20364lz4s_s2_ml_done:
20365 ADDQ R10, SI
20366 CMPQ R9, DI
20367 JNE lz4s_s2_docopy
20368
20369 // emitRepeat
20370emit_repeat_again_lz4_s2:
20371 MOVL R10, R8
20372 LEAL -4(R10), R10
20373 CMPL R8, $0x08
20374 JBE repeat_two_lz4_s2
20375 CMPL R8, $0x0c
20376 JAE cant_repeat_two_offset_lz4_s2
20377 CMPL R9, $0x00000800
20378 JB repeat_two_offset_lz4_s2
20379
20380cant_repeat_two_offset_lz4_s2:
20381 CMPL R10, $0x00000104
20382 JB repeat_three_lz4_s2
20383 CMPL R10, $0x00010100
20384 JB repeat_four_lz4_s2
20385 CMPL R10, $0x0100ffff
20386 JB repeat_five_lz4_s2
20387 LEAL -16842747(R10), R10
20388 MOVL $0xfffb001d, (AX)
20389 MOVB $0xff, 4(AX)
20390 ADDQ $0x05, AX
20391 JMP emit_repeat_again_lz4_s2
20392
20393repeat_five_lz4_s2:
20394 LEAL -65536(R10), R10
20395 MOVL R10, R9
20396 MOVW $0x001d, (AX)
20397 MOVW R10, 2(AX)
20398 SARL $0x10, R9
20399 MOVB R9, 4(AX)
20400 ADDQ $0x05, AX
20401 JMP lz4s_s2_loop
20402
20403repeat_four_lz4_s2:
20404 LEAL -256(R10), R10
20405 MOVW $0x0019, (AX)
20406 MOVW R10, 2(AX)
20407 ADDQ $0x04, AX
20408 JMP lz4s_s2_loop
20409
20410repeat_three_lz4_s2:
20411 LEAL -4(R10), R10
20412 MOVW $0x0015, (AX)
20413 MOVB R10, 2(AX)
20414 ADDQ $0x03, AX
20415 JMP lz4s_s2_loop
20416
20417repeat_two_lz4_s2:
20418 SHLL $0x02, R10
20419 ORL $0x01, R10
20420 MOVW R10, (AX)
20421 ADDQ $0x02, AX
20422 JMP lz4s_s2_loop
20423
20424repeat_two_offset_lz4_s2:
20425 XORQ R8, R8
20426 LEAL 1(R8)(R10*4), R10
20427 MOVB R9, 1(AX)
20428 SARL $0x08, R9
20429 SHLL $0x05, R9
20430 ORL R9, R10
20431 MOVB R10, (AX)
20432 ADDQ $0x02, AX
20433 JMP lz4s_s2_loop
20434
20435lz4s_s2_docopy:
20436 MOVQ R9, DI
20437
20438 // emitCopy
20439 CMPL R10, $0x40
20440 JBE two_byte_offset_short_lz4_s2
20441 CMPL R9, $0x00000800
20442 JAE long_offset_short_lz4_s2
20443 MOVL $0x00000001, R8
20444 LEAL 16(R8), R8
20445 MOVB R9, 1(AX)
20446 MOVL R9, R11
20447 SHRL $0x08, R11
20448 SHLL $0x05, R11
20449 ORL R11, R8
20450 MOVB R8, (AX)
20451 ADDQ $0x02, AX
20452 SUBL $0x08, R10
20453
20454 // emitRepeat
20455 LEAL -4(R10), R10
20456 JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
20457
20458emit_repeat_again_lz4_s2_emit_copy_short_2b:
20459 MOVL R10, R8
20460 LEAL -4(R10), R10
20461 CMPL R8, $0x08
20462 JBE repeat_two_lz4_s2_emit_copy_short_2b
20463 CMPL R8, $0x0c
20464 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
20465 CMPL R9, $0x00000800
20466 JB repeat_two_offset_lz4_s2_emit_copy_short_2b
20467
20468cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
20469 CMPL R10, $0x00000104
20470 JB repeat_three_lz4_s2_emit_copy_short_2b
20471 CMPL R10, $0x00010100
20472 JB repeat_four_lz4_s2_emit_copy_short_2b
20473 CMPL R10, $0x0100ffff
20474 JB repeat_five_lz4_s2_emit_copy_short_2b
20475 LEAL -16842747(R10), R10
20476 MOVL $0xfffb001d, (AX)
20477 MOVB $0xff, 4(AX)
20478 ADDQ $0x05, AX
20479 JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
20480
20481repeat_five_lz4_s2_emit_copy_short_2b:
20482 LEAL -65536(R10), R10
20483 MOVL R10, R9
20484 MOVW $0x001d, (AX)
20485 MOVW R10, 2(AX)
20486 SARL $0x10, R9
20487 MOVB R9, 4(AX)
20488 ADDQ $0x05, AX
20489 JMP lz4s_s2_loop
20490
20491repeat_four_lz4_s2_emit_copy_short_2b:
20492 LEAL -256(R10), R10
20493 MOVW $0x0019, (AX)
20494 MOVW R10, 2(AX)
20495 ADDQ $0x04, AX
20496 JMP lz4s_s2_loop
20497
20498repeat_three_lz4_s2_emit_copy_short_2b:
20499 LEAL -4(R10), R10
20500 MOVW $0x0015, (AX)
20501 MOVB R10, 2(AX)
20502 ADDQ $0x03, AX
20503 JMP lz4s_s2_loop
20504
20505repeat_two_lz4_s2_emit_copy_short_2b:
20506 SHLL $0x02, R10
20507 ORL $0x01, R10
20508 MOVW R10, (AX)
20509 ADDQ $0x02, AX
20510 JMP lz4s_s2_loop
20511
20512repeat_two_offset_lz4_s2_emit_copy_short_2b:
20513 XORQ R8, R8
20514 LEAL 1(R8)(R10*4), R10
20515 MOVB R9, 1(AX)
20516 SARL $0x08, R9
20517 SHLL $0x05, R9
20518 ORL R9, R10
20519 MOVB R10, (AX)
20520 ADDQ $0x02, AX
20521 JMP lz4s_s2_loop
20522
20523long_offset_short_lz4_s2:
20524 MOVB $0xee, (AX)
20525 MOVW R9, 1(AX)
20526 LEAL -60(R10), R10
20527 ADDQ $0x03, AX
20528
20529 // emitRepeat
20530emit_repeat_again_lz4_s2_emit_copy_short:
20531 MOVL R10, R8
20532 LEAL -4(R10), R10
20533 CMPL R8, $0x08
20534 JBE repeat_two_lz4_s2_emit_copy_short
20535 CMPL R8, $0x0c
20536 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
20537 CMPL R9, $0x00000800
20538 JB repeat_two_offset_lz4_s2_emit_copy_short
20539
20540cant_repeat_two_offset_lz4_s2_emit_copy_short:
20541 CMPL R10, $0x00000104
20542 JB repeat_three_lz4_s2_emit_copy_short
20543 CMPL R10, $0x00010100
20544 JB repeat_four_lz4_s2_emit_copy_short
20545 CMPL R10, $0x0100ffff
20546 JB repeat_five_lz4_s2_emit_copy_short
20547 LEAL -16842747(R10), R10
20548 MOVL $0xfffb001d, (AX)
20549 MOVB $0xff, 4(AX)
20550 ADDQ $0x05, AX
20551 JMP emit_repeat_again_lz4_s2_emit_copy_short
20552
20553repeat_five_lz4_s2_emit_copy_short:
20554 LEAL -65536(R10), R10
20555 MOVL R10, R9
20556 MOVW $0x001d, (AX)
20557 MOVW R10, 2(AX)
20558 SARL $0x10, R9
20559 MOVB R9, 4(AX)
20560 ADDQ $0x05, AX
20561 JMP lz4s_s2_loop
20562
20563repeat_four_lz4_s2_emit_copy_short:
20564 LEAL -256(R10), R10
20565 MOVW $0x0019, (AX)
20566 MOVW R10, 2(AX)
20567 ADDQ $0x04, AX
20568 JMP lz4s_s2_loop
20569
20570repeat_three_lz4_s2_emit_copy_short:
20571 LEAL -4(R10), R10
20572 MOVW $0x0015, (AX)
20573 MOVB R10, 2(AX)
20574 ADDQ $0x03, AX
20575 JMP lz4s_s2_loop
20576
20577repeat_two_lz4_s2_emit_copy_short:
20578 SHLL $0x02, R10
20579 ORL $0x01, R10
20580 MOVW R10, (AX)
20581 ADDQ $0x02, AX
20582 JMP lz4s_s2_loop
20583
20584repeat_two_offset_lz4_s2_emit_copy_short:
20585 XORQ R8, R8
20586 LEAL 1(R8)(R10*4), R10
20587 MOVB R9, 1(AX)
20588 SARL $0x08, R9
20589 SHLL $0x05, R9
20590 ORL R9, R10
20591 MOVB R10, (AX)
20592 ADDQ $0x02, AX
20593 JMP lz4s_s2_loop
20594
20595two_byte_offset_short_lz4_s2:
20596 MOVL R10, R8
20597 SHLL $0x02, R8
20598 CMPL R10, $0x0c
20599 JAE emit_copy_three_lz4_s2
20600 CMPL R9, $0x00000800
20601 JAE emit_copy_three_lz4_s2
20602 LEAL -15(R8), R8
20603 MOVB R9, 1(AX)
20604 SHRL $0x08, R9
20605 SHLL $0x05, R9
20606 ORL R9, R8
20607 MOVB R8, (AX)
20608 ADDQ $0x02, AX
20609 JMP lz4s_s2_loop
20610
20611emit_copy_three_lz4_s2:
20612 LEAL -2(R8), R8
20613 MOVB R8, (AX)
20614 MOVW R9, 1(AX)
20615 ADDQ $0x03, AX
20616 JMP lz4s_s2_loop
20617
20618lz4s_s2_done:
20619 MOVQ dst_base+0(FP), CX
20620 SUBQ CX, AX
20621 MOVQ SI, uncompressed+48(FP)
20622 MOVQ AX, dstUsed+56(FP)
20623 RET
20624
20625lz4s_s2_corrupt:
20626 XORQ AX, AX
20627 LEAQ -1(AX), SI
20628 MOVQ SI, uncompressed+48(FP)
20629 RET
20630
20631lz4s_s2_dstfull:
20632 XORQ AX, AX
20633 LEAQ -2(AX), SI
20634 MOVQ SI, uncompressed+48(FP)
20635 RET
20636
20637// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
20638// Requires: SSE2
//
// Walks the LZ4 block in src one sequence at a time and writes the
// corresponding literal and copy operations to dst.
//
// Register roles, as set up and used below:
//   AX = dst write pointer        CX = dst_base+dst_len-10 (write limit)
//   DX = src read pointer         BX = src_base+src_len (end of input)
//   SI = running total of uncompressed bytes represented so far
// NOTE(review): the 10 bytes of slack in CX presumably cover the widest
// header/copy store issued without its own bounds check - confirm.
//
// Results: on success uncompressed = SI and dstUsed = AX-dst_base.
// On failure uncompressed is -1 (lz4_snappy_corrupt) or -2
// (lz4_snappy_dstfull) and dstUsed is not written.
20639TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
20640 XORQ SI, SI
20641 MOVQ dst_base+0(FP), AX
20642 MOVQ dst_len+8(FP), CX
20643 MOVQ src_base+24(FP), DX
20644 MOVQ src_len+32(FP), BX
20645 LEAQ (DX)(BX*1), BX
20646 LEAQ -10(AX)(CX*1), CX
20647
// Start of one sequence: fail if the input is exhausted or dst reached its
// limit, then split the token byte into R8 = high nibble (literal length)
// and R9 = low nibble (match length code).
20648lz4_snappy_loop:
20649 CMPQ DX, BX
20650 JAE lz4_snappy_corrupt
20651 CMPQ AX, CX
20652 JAE lz4_snappy_dstfull
20653 MOVBQZX (DX), DI
20654 MOVQ DI, R8
20655 MOVQ DI, R9
20656 SHRQ $0x04, R8
20657 ANDQ $0x0f, R9
20658 CMPQ DI, $0xf0
20659 JB lz4_snappy_ll_end
20660
// Token >= 0xf0 (high nibble 15): keep adding extension bytes to R8 until a
// byte other than 0xff is read.
20661lz4_snappy_ll_loop:
20662 INCQ DX
20663 CMPQ DX, BX
20664 JAE lz4_snappy_corrupt
20665 MOVBQZX (DX), DI
20666 ADDQ DI, R8
20667 CMPQ DI, $0xff
20668 JEQ lz4_snappy_ll_loop
20669
// R9 becomes nibble+4. DI = DX+R8 must stay below the end of input; after
// the two increments DX points at the first literal byte and DI just past
// the last one. With no literals (R8 == 0) the emit step is skipped.
20670lz4_snappy_ll_end:
20671 LEAQ (DX)(R8*1), DI
20672 ADDQ $0x04, R9
20673 CMPQ DI, BX
20674 JAE lz4_snappy_corrupt
20675 INCQ DX
20676 INCQ DI
20677 TESTQ R8, R8
20678 JZ lz4_snappy_lits_done
20679 LEAQ (AX)(R8*1), R10
20680 CMPQ R10, CX
20681 JAE lz4_snappy_dstfull
20682 ADDQ R8, SI
// Literal header: R10 = R8-1 selects the form. Values below 60 fit in one
// tag byte (R10<<2); otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by
// 1/2/3/4 bytes of R10.
20683 LEAL -1(R8), R10
20684 CMPL R10, $0x3c
20685 JB one_byte_lz4_snappy
20686 CMPL R10, $0x00000100
20687 JB two_bytes_lz4_snappy
20688 CMPL R10, $0x00010000
20689 JB three_bytes_lz4_snappy
20690 CMPL R10, $0x01000000
20691 JB four_bytes_lz4_snappy
20692 MOVB $0xfc, (AX)
20693 MOVL R10, 1(AX)
20694 ADDQ $0x05, AX
20695 JMP memmove_long_lz4_snappy
20696
20697four_bytes_lz4_snappy:
20698 MOVL R10, R11
20699 SHRL $0x10, R11
20700 MOVB $0xf8, (AX)
20701 MOVW R10, 1(AX)
20702 MOVB R11, 3(AX)
20703 ADDQ $0x04, AX
20704 JMP memmove_long_lz4_snappy
20705
20706three_bytes_lz4_snappy:
20707 MOVB $0xf4, (AX)
20708 MOVW R10, 1(AX)
20709 ADDQ $0x03, AX
20710 JMP memmove_long_lz4_snappy
20711
// One length byte. R10 < 64 still takes the short copy path, anything
// larger takes the long copy path.
20712two_bytes_lz4_snappy:
20713 MOVB $0xf0, (AX)
20714 MOVB R10, 1(AX)
20715 ADDQ $0x02, AX
20716 CMPL R10, $0x40
20717 JB memmove_lz4_snappy
20718 JMP memmove_long_lz4_snappy
20719
20720one_byte_lz4_snappy:
20721 SHLB $0x02, R10
20722 MOVB R10, (AX)
20723 ADDQ $0x01, AX
20724
// Short literal copy of R8 bytes from (DX) to (AX). R10 = AX+R8 is the write
// pointer once the literals are in place. Each size class uses a pair of
// possibly overlapping fixed-width moves anchored at the start and the end.
20725memmove_lz4_snappy:
20726 LEAQ (AX)(R8*1), R10
20727
20728 // genMemMoveShort
20729 CMPQ R8, $0x08
20730 JBE emit_lit_memmove_lz4_snappy_memmove_move_8
20731 CMPQ R8, $0x10
20732 JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
20733 CMPQ R8, $0x20
20734 JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
20735 JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64
20736
// R8 <= 8: a single 8-byte load/store is issued regardless of the exact
// count; AX is then set from R10, so only R8 bytes are accounted for.
20737emit_lit_memmove_lz4_snappy_memmove_move_8:
20738 MOVQ (DX), R11
20739 MOVQ R11, (AX)
20740 JMP memmove_end_copy_lz4_snappy
20741
// DX is overwritten with data here; it is reloaded from DI at
// lz4_snappy_lits_emit_done.
20742emit_lit_memmove_lz4_snappy_memmove_move_8through16:
20743 MOVQ (DX), R11
20744 MOVQ -8(DX)(R8*1), DX
20745 MOVQ R11, (AX)
20746 MOVQ DX, -8(AX)(R8*1)
20747 JMP memmove_end_copy_lz4_snappy
20748
20749emit_lit_memmove_lz4_snappy_memmove_move_17through32:
20750 MOVOU (DX), X0
20751 MOVOU -16(DX)(R8*1), X1
20752 MOVOU X0, (AX)
20753 MOVOU X1, -16(AX)(R8*1)
20754 JMP memmove_end_copy_lz4_snappy
20755
20756emit_lit_memmove_lz4_snappy_memmove_move_33through64:
20757 MOVOU (DX), X0
20758 MOVOU 16(DX), X1
20759 MOVOU -32(DX)(R8*1), X2
20760 MOVOU -16(DX)(R8*1), X3
20761 MOVOU X0, (AX)
20762 MOVOU X1, 16(AX)
20763 MOVOU X2, -32(AX)(R8*1)
20764 MOVOU X3, -16(AX)(R8*1)
20765
20766memmove_end_copy_lz4_snappy:
20767 MOVQ R10, AX
20768 JMP lz4_snappy_lits_emit_done
20769
// Long literal copy. The first and last 32 source bytes are loaded into
// X0-X3 up front and stored last. R13 = 64-(AX&31) is the running offset, so
// the MOVOA stores in between hit 32-byte aligned destinations.
20770memmove_long_lz4_snappy:
20771 LEAQ (AX)(R8*1), R10
20772
20773 // genMemMoveLong
20774 MOVOU (DX), X0
20775 MOVOU 16(DX), X1
20776 MOVOU -32(DX)(R8*1), X2
20777 MOVOU -16(DX)(R8*1), X3
20778 MOVQ R8, R12
20779 SHRQ $0x05, R12
20780 MOVQ AX, R11
20781 ANDL $0x0000001f, R11
20782 MOVQ $0x00000040, R13
20783 SUBQ R11, R13
20784 DECQ R12
20785 JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
20786 LEAQ -32(DX)(R13*1), R11
20787 LEAQ -32(AX)(R13*1), R14
20788
20789emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
20790 MOVOU (R11), X4
20791 MOVOU 16(R11), X5
20792 MOVOA X4, (R14)
20793 MOVOA X5, 16(R14)
20794 ADDQ $0x20, R14
20795 ADDQ $0x20, R11
20796 ADDQ $0x20, R13
20797 DECQ R12
20798 JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back
20799
20800emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
20801 MOVOU -32(DX)(R13*1), X4
20802 MOVOU -16(DX)(R13*1), X5
20803 MOVOA X4, -32(AX)(R13*1)
20804 MOVOA X5, -16(AX)(R13*1)
20805 ADDQ $0x20, R13
20806 CMPQ R8, R13
20807 JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
20808 MOVOU X0, (AX)
20809 MOVOU X1, 16(AX)
20810 MOVOU X2, -32(AX)(R8*1)
20811 MOVOU X3, -16(AX)(R8*1)
20812 MOVQ R10, AX
20813
// Move the read pointer past the literals.
20814lz4_snappy_lits_emit_done:
20815 MOVQ DI, DX
20816
// Input that ends right after the literals is accepted only when the match
// nibble was 0 (R9 == 4); any other value is reported as corrupt.
20817lz4_snappy_lits_done:
20818 CMPQ DX, BX
20819 JNE lz4_snappy_match
20820 CMPQ R9, $0x04
20821 JEQ lz4_snappy_done
20822 JMP lz4_snappy_corrupt
20823
// Read the 16-bit match offset into R8. An offset of zero, or one larger
// than the number of bytes represented so far (SI), is corrupt.
// R9 == 0x13 means the nibble was 15, so length extension bytes follow.
20824lz4_snappy_match:
20825 LEAQ 2(DX), DI
20826 CMPQ DI, BX
20827 JAE lz4_snappy_corrupt
20828 MOVWQZX (DX), R8
20829 MOVQ DI, DX
20830 TESTQ R8, R8
20831 JZ lz4_snappy_corrupt
20832 CMPQ R8, SI
20833 JA lz4_snappy_corrupt
20834 CMPQ R9, $0x13
20835 JNE lz4_snappy_ml_done
20836
// Add extension bytes to R9 until a byte other than 0xff is read.
20837lz4_snappy_ml_loop:
20838 MOVBQZX (DX), DI
20839 INCQ DX
20840 ADDQ DI, R9
20841 CMPQ DX, BX
20842 JAE lz4_snappy_corrupt
20843 CMPQ DI, $0xff
20844 JEQ lz4_snappy_ml_loop
20845
20846lz4_snappy_ml_done:
20847 ADDQ R9, SI
20848
20849 // emitCopy
// While more than 64 bytes of match remain, emit a 3-byte copy (tag 0xee
// plus the 16-bit offset) and take 60 off R9. If AX reaches the write limit
// control goes back to lz4_snappy_loop instead of emitting the remainder.
20850two_byte_offset_lz4_s2:
20851 CMPL R9, $0x40
20852 JBE two_byte_offset_short_lz4_s2
20853 MOVB $0xee, (AX)
20854 MOVW R8, 1(AX)
20855 LEAL -60(R9), R9
20856 ADDQ $0x03, AX
20857 CMPQ AX, CX
20858 JAE lz4_snappy_loop
20859 JMP two_byte_offset_lz4_s2
20860
// Remaining length R9 <= 64, DI = 4*R9. Lengths below 12 with offsets below
// 2048 use the 2-byte form: tag = (4*len-15) | (offset>>8)<<5, followed by
// the low offset byte. Everything else goes to emit_copy_three_lz4_s2.
20861two_byte_offset_short_lz4_s2:
20862 MOVL R9, DI
20863 SHLL $0x02, DI
20864 CMPL R9, $0x0c
20865 JAE emit_copy_three_lz4_s2
20866 CMPL R8, $0x00000800
20867 JAE emit_copy_three_lz4_s2
20868 LEAL -15(DI), DI
20869 MOVB R8, 1(AX)
20870 SHRL $0x08, R8
20871 SHLL $0x05, R8
20872 ORL R8, DI
20873 MOVB DI, (AX)
20874 ADDQ $0x02, AX
20875 JMP lz4_snappy_loop
20876
// 3-byte form: tag = 4*len-2, followed by the 16-bit offset.
20877emit_copy_three_lz4_s2:
20878 LEAL -2(DI), DI
20879 MOVB DI, (AX)
20880 MOVW R8, 1(AX)
20881 ADDQ $0x03, AX
20882 JMP lz4_snappy_loop
20883
// Success: dstUsed = AX - dst_base, uncompressed = SI.
20884lz4_snappy_done:
20885 MOVQ dst_base+0(FP), CX
20886 SUBQ CX, AX
20887 MOVQ SI, uncompressed+48(FP)
20888 MOVQ AX, dstUsed+56(FP)
20889 RET
20890
// Malformed input: uncompressed = -1.
20891lz4_snappy_corrupt:
20892 XORQ AX, AX
20893 LEAQ -1(AX), SI
20894 MOVQ SI, uncompressed+48(FP)
20895 RET
20896
// Destination too small: uncompressed = -2.
20897lz4_snappy_dstfull:
20898 XORQ AX, AX
20899 LEAQ -2(AX), SI
20900 MOVQ SI, uncompressed+48(FP)
20901 RET
20902
20903// func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
20904// Requires: SSE2
//
// Walks the block in src one sequence at a time and writes the
// corresponding literal and copy operations to dst. The structure mirrors
// the routine for plain LZ4 blocks, with two visible differences: the match
// nibble is biased by 3 instead of 4, and a nibble of 0 (R9 == 3) means the
// sequence carries no match at all.
//
// Register roles, as set up and used below:
//   AX = dst write pointer        CX = dst_base+dst_len-10 (write limit)
//   DX = src read pointer         BX = src_base+src_len (end of input)
//   SI = running total of uncompressed bytes represented so far
//
// Results: on success uncompressed = SI and dstUsed = AX-dst_base.
// On failure uncompressed is -1 (lz4s_snappy_corrupt) or -2
// (lz4s_snappy_dstfull) and dstUsed is not written.
20905TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
20906 XORQ SI, SI
20907 MOVQ dst_base+0(FP), AX
20908 MOVQ dst_len+8(FP), CX
20909 MOVQ src_base+24(FP), DX
20910 MOVQ src_len+32(FP), BX
20911 LEAQ (DX)(BX*1), BX
20912 LEAQ -10(AX)(CX*1), CX
20913
// Start of one sequence: fail if the input is exhausted or dst reached its
// limit, then split the token byte into R8 = high nibble (literal length)
// and R9 = low nibble (match length code).
20914lz4s_snappy_loop:
20915 CMPQ DX, BX
20916 JAE lz4s_snappy_corrupt
20917 CMPQ AX, CX
20918 JAE lz4s_snappy_dstfull
20919 MOVBQZX (DX), DI
20920 MOVQ DI, R8
20921 MOVQ DI, R9
20922 SHRQ $0x04, R8
20923 ANDQ $0x0f, R9
20924 CMPQ DI, $0xf0
20925 JB lz4s_snappy_ll_end
20926
// Token >= 0xf0 (high nibble 15): keep adding extension bytes to R8 until a
// byte other than 0xff is read.
20927lz4s_snappy_ll_loop:
20928 INCQ DX
20929 CMPQ DX, BX
20930 JAE lz4s_snappy_corrupt
20931 MOVBQZX (DX), DI
20932 ADDQ DI, R8
20933 CMPQ DI, $0xff
20934 JEQ lz4s_snappy_ll_loop
20935
// R9 becomes nibble+3. DI = DX+R8 must stay below the end of input; after
// the two increments DX points at the first literal byte and DI just past
// the last one. With no literals (R8 == 0) the emit step is skipped.
20936lz4s_snappy_ll_end:
20937 LEAQ (DX)(R8*1), DI
20938 ADDQ $0x03, R9
20939 CMPQ DI, BX
20940 JAE lz4s_snappy_corrupt
20941 INCQ DX
20942 INCQ DI
20943 TESTQ R8, R8
20944 JZ lz4s_snappy_lits_done
20945 LEAQ (AX)(R8*1), R10
20946 CMPQ R10, CX
20947 JAE lz4s_snappy_dstfull
20948 ADDQ R8, SI
// Literal header: R10 = R8-1 selects the form. Values below 60 fit in one
// tag byte (R10<<2); otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by
// 1/2/3/4 bytes of R10.
20949 LEAL -1(R8), R10
20950 CMPL R10, $0x3c
20951 JB one_byte_lz4s_snappy
20952 CMPL R10, $0x00000100
20953 JB two_bytes_lz4s_snappy
20954 CMPL R10, $0x00010000
20955 JB three_bytes_lz4s_snappy
20956 CMPL R10, $0x01000000
20957 JB four_bytes_lz4s_snappy
20958 MOVB $0xfc, (AX)
20959 MOVL R10, 1(AX)
20960 ADDQ $0x05, AX
20961 JMP memmove_long_lz4s_snappy
20962
20963four_bytes_lz4s_snappy:
20964 MOVL R10, R11
20965 SHRL $0x10, R11
20966 MOVB $0xf8, (AX)
20967 MOVW R10, 1(AX)
20968 MOVB R11, 3(AX)
20969 ADDQ $0x04, AX
20970 JMP memmove_long_lz4s_snappy
20971
20972three_bytes_lz4s_snappy:
20973 MOVB $0xf4, (AX)
20974 MOVW R10, 1(AX)
20975 ADDQ $0x03, AX
20976 JMP memmove_long_lz4s_snappy
20977
// One length byte. R10 < 64 still takes the short copy path, anything
// larger takes the long copy path.
20978two_bytes_lz4s_snappy:
20979 MOVB $0xf0, (AX)
20980 MOVB R10, 1(AX)
20981 ADDQ $0x02, AX
20982 CMPL R10, $0x40
20983 JB memmove_lz4s_snappy
20984 JMP memmove_long_lz4s_snappy
20985
20986one_byte_lz4s_snappy:
20987 SHLB $0x02, R10
20988 MOVB R10, (AX)
20989 ADDQ $0x01, AX
20990
// Short literal copy of R8 bytes from (DX) to (AX). R10 = AX+R8 is the write
// pointer once the literals are in place. Each size class uses a pair of
// possibly overlapping fixed-width moves anchored at the start and the end.
20991memmove_lz4s_snappy:
20992 LEAQ (AX)(R8*1), R10
20993
20994 // genMemMoveShort
20995 CMPQ R8, $0x08
20996 JBE emit_lit_memmove_lz4s_snappy_memmove_move_8
20997 CMPQ R8, $0x10
20998 JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
20999 CMPQ R8, $0x20
21000 JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
21001 JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64
21002
// R8 <= 8: a single 8-byte load/store is issued regardless of the exact
// count; AX is then set from R10, so only R8 bytes are accounted for.
21003emit_lit_memmove_lz4s_snappy_memmove_move_8:
21004 MOVQ (DX), R11
21005 MOVQ R11, (AX)
21006 JMP memmove_end_copy_lz4s_snappy
21007
// DX is overwritten with data here; it is reloaded from DI at
// lz4s_snappy_lits_emit_done.
21008emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
21009 MOVQ (DX), R11
21010 MOVQ -8(DX)(R8*1), DX
21011 MOVQ R11, (AX)
21012 MOVQ DX, -8(AX)(R8*1)
21013 JMP memmove_end_copy_lz4s_snappy
21014
21015emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
21016 MOVOU (DX), X0
21017 MOVOU -16(DX)(R8*1), X1
21018 MOVOU X0, (AX)
21019 MOVOU X1, -16(AX)(R8*1)
21020 JMP memmove_end_copy_lz4s_snappy
21021
21022emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
21023 MOVOU (DX), X0
21024 MOVOU 16(DX), X1
21025 MOVOU -32(DX)(R8*1), X2
21026 MOVOU -16(DX)(R8*1), X3
21027 MOVOU X0, (AX)
21028 MOVOU X1, 16(AX)
21029 MOVOU X2, -32(AX)(R8*1)
21030 MOVOU X3, -16(AX)(R8*1)
21031
21032memmove_end_copy_lz4s_snappy:
21033 MOVQ R10, AX
21034 JMP lz4s_snappy_lits_emit_done
21035
// Long literal copy. The first and last 32 source bytes are loaded into
// X0-X3 up front and stored last. R13 = 64-(AX&31) is the running offset, so
// the MOVOA stores in between hit 32-byte aligned destinations.
21036memmove_long_lz4s_snappy:
21037 LEAQ (AX)(R8*1), R10
21038
21039 // genMemMoveLong
21040 MOVOU (DX), X0
21041 MOVOU 16(DX), X1
21042 MOVOU -32(DX)(R8*1), X2
21043 MOVOU -16(DX)(R8*1), X3
21044 MOVQ R8, R12
21045 SHRQ $0x05, R12
21046 MOVQ AX, R11
21047 ANDL $0x0000001f, R11
21048 MOVQ $0x00000040, R13
21049 SUBQ R11, R13
21050 DECQ R12
21051 JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
21052 LEAQ -32(DX)(R13*1), R11
21053 LEAQ -32(AX)(R13*1), R14
21054
21055emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
21056 MOVOU (R11), X4
21057 MOVOU 16(R11), X5
21058 MOVOA X4, (R14)
21059 MOVOA X5, 16(R14)
21060 ADDQ $0x20, R14
21061 ADDQ $0x20, R11
21062 ADDQ $0x20, R13
21063 DECQ R12
21064 JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back
21065
21066emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
21067 MOVOU -32(DX)(R13*1), X4
21068 MOVOU -16(DX)(R13*1), X5
21069 MOVOA X4, -32(AX)(R13*1)
21070 MOVOA X5, -16(AX)(R13*1)
21071 ADDQ $0x20, R13
21072 CMPQ R8, R13
21073 JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
21074 MOVOU X0, (AX)
21075 MOVOU X1, 16(AX)
21076 MOVOU X2, -32(AX)(R8*1)
21077 MOVOU X3, -16(AX)(R8*1)
21078 MOVQ R10, AX
21079
// Move the read pointer past the literals.
21080lz4s_snappy_lits_emit_done:
21081 MOVQ DI, DX
21082
// Input that ends right after the literals is accepted only when the match
// nibble was 0 (R9 == 3); any other value is reported as corrupt.
21083lz4s_snappy_lits_done:
21084 CMPQ DX, BX
21085 JNE lz4s_snappy_match
21086 CMPQ R9, $0x03
21087 JEQ lz4s_snappy_done
21088 JMP lz4s_snappy_corrupt
21089
// R9 == 3 (nibble 0): this sequence has no match, continue with the next
// token. Otherwise read the 16-bit match offset into R8; an offset of zero,
// or one larger than the bytes represented so far (SI), is corrupt.
// R9 == 0x12 means the nibble was 15, so length extension bytes follow.
21090lz4s_snappy_match:
21091 CMPQ R9, $0x03
21092 JEQ lz4s_snappy_loop
21093 LEAQ 2(DX), DI
21094 CMPQ DI, BX
21095 JAE lz4s_snappy_corrupt
21096 MOVWQZX (DX), R8
21097 MOVQ DI, DX
21098 TESTQ R8, R8
21099 JZ lz4s_snappy_corrupt
21100 CMPQ R8, SI
21101 JA lz4s_snappy_corrupt
21102 CMPQ R9, $0x12
21103 JNE lz4s_snappy_ml_done
21104
// Add extension bytes to R9 until a byte other than 0xff is read.
21105lz4s_snappy_ml_loop:
21106 MOVBQZX (DX), DI
21107 INCQ DX
21108 ADDQ DI, R9
21109 CMPQ DX, BX
21110 JAE lz4s_snappy_corrupt
21111 CMPQ DI, $0xff
21112 JEQ lz4s_snappy_ml_loop
21113
21114lz4s_snappy_ml_done:
21115 ADDQ R9, SI
21116
21117 // emitCopy
// While more than 64 bytes of match remain, emit a 3-byte copy (tag 0xee
// plus the 16-bit offset) and take 60 off R9. If AX reaches the write limit
// control goes back to lz4s_snappy_loop instead of emitting the remainder.
21118two_byte_offset_lz4_s2:
21119 CMPL R9, $0x40
21120 JBE two_byte_offset_short_lz4_s2
21121 MOVB $0xee, (AX)
21122 MOVW R8, 1(AX)
21123 LEAL -60(R9), R9
21124 ADDQ $0x03, AX
21125 CMPQ AX, CX
21126 JAE lz4s_snappy_loop
21127 JMP two_byte_offset_lz4_s2
21128
// Remaining length R9 <= 64, DI = 4*R9. Lengths below 12 with offsets below
// 2048 use the 2-byte form: tag = (4*len-15) | (offset>>8)<<5, followed by
// the low offset byte. Everything else goes to emit_copy_three_lz4_s2.
21129two_byte_offset_short_lz4_s2:
21130 MOVL R9, DI
21131 SHLL $0x02, DI
21132 CMPL R9, $0x0c
21133 JAE emit_copy_three_lz4_s2
21134 CMPL R8, $0x00000800
21135 JAE emit_copy_three_lz4_s2
21136 LEAL -15(DI), DI
21137 MOVB R8, 1(AX)
21138 SHRL $0x08, R8
21139 SHLL $0x05, R8
21140 ORL R8, DI
21141 MOVB DI, (AX)
21142 ADDQ $0x02, AX
21143 JMP lz4s_snappy_loop
21144
// 3-byte form: tag = 4*len-2, followed by the 16-bit offset.
21145emit_copy_three_lz4_s2:
21146 LEAL -2(DI), DI
21147 MOVB DI, (AX)
21148 MOVW R8, 1(AX)
21149 ADDQ $0x03, AX
21150 JMP lz4s_snappy_loop
21151
// Success: dstUsed = AX - dst_base, uncompressed = SI.
21152lz4s_snappy_done:
21153 MOVQ dst_base+0(FP), CX
21154 SUBQ CX, AX
21155 MOVQ SI, uncompressed+48(FP)
21156 MOVQ AX, dstUsed+56(FP)
21157 RET
21158
// Malformed input: uncompressed = -1.
21159lz4s_snappy_corrupt:
21160 XORQ AX, AX
21161 LEAQ -1(AX), SI
21162 MOVQ SI, uncompressed+48(FP)
21163 RET
21164
// Destination too small: uncompressed = -2.
21165lz4s_snappy_dstfull:
21166 XORQ AX, AX
21167 LEAQ -2(AX), SI
21168 MOVQ SI, uncompressed+48(FP)
21169 RET
diff --git a/vendor/github.com/klauspost/compress/s2/index.go b/vendor/github.com/klauspost/compress/s2/index.go
new file mode 100644
index 0000000..18a4f7a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/index.go
@@ -0,0 +1,596 @@
1// Copyright (c) 2022+ Klaus Post. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package s2
6
7import (
8 "bytes"
9 "encoding/binary"
10 "encoding/json"
11 "fmt"
12 "io"
13 "sort"
14)
15
const (
	// S2IndexHeader is the magic string that follows the 4-byte chunk
	// header at the start of a serialized index.
	S2IndexHeader = "s2idx\x00"
	// S2IndexTrailer is the magic string that ends a serialized index.
	S2IndexTrailer = "\x00xdi2s"
	// maxIndexEntries is the largest number of entries an index may hold.
	maxIndexEntries = 1 << 16
)

// Index represents an S2/Snappy index.
type Index struct {
	TotalUncompressed int64 // Total Uncompressed size if known. Will be -1 if unknown.
	TotalCompressed   int64 // Total Compressed size if known. Will be -1 if unknown.

	// info holds one entry per indexed block, in stream order.
	info []struct {
		compressedOffset   int64
		uncompressedOffset int64
	}
	// estBlockUncomp is the estimated uncompressed size of one block.
	estBlockUncomp int64
}

// reset prepares the index for reuse: the block size estimate is set to
// maxBlock, both totals are marked unknown (-1) and all entries are dropped
// while keeping the allocated capacity.
func (i *Index) reset(maxBlock int) {
	i.estBlockUncomp = int64(maxBlock)
	i.TotalCompressed = -1
	i.TotalUncompressed = -1
	// Truncating a nil slice yields nil, so no length check is needed.
	i.info = i.info[:0]
}

// allocInfos will allocate an empty slice of infos with room for n entries.
// It panics if n exceeds maxIndexEntries.
func (i *Index) allocInfos(n int) {
	if n > maxIndexEntries {
		panic("n > maxIndexEntries")
	}
	i.info = make([]struct {
		compressedOffset   int64
		uncompressedOffset int64
	}, 0, n)
}

// add an uncompressed and compressed pair.
// Entries must be sent in order.
//
// A nil receiver is a no-op. When uncompressedOffset equals that of the most
// recent entry, no entry is added; the compressed offset of that entry is
// overwritten instead. An error is returned when either offset is lower than
// the one recorded in the most recent entry.
func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
	if i == nil {
		return nil
	}
	if n := len(i.info); n > 0 {
		latest := &i.info[n-1]
		switch {
		case latest.uncompressedOffset == uncompressedOffset:
			// Uncompressed didn't change, don't add entry,
			// but update start index.
			latest.compressedOffset = compressedOffset
			return nil
		case latest.uncompressedOffset > uncompressedOffset:
			return fmt.Errorf("internal error: Earlier uncompressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
		case latest.compressedOffset > compressedOffset:
			// Report the compressed offsets that actually triggered the error.
			return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.compressedOffset, compressedOffset)
		}
	}
	i.info = append(i.info, struct {
		compressedOffset   int64
		uncompressedOffset int64
	}{compressedOffset: compressedOffset, uncompressedOffset: uncompressedOffset})
	return nil
}
82
83// Find the offset at or before the wanted (uncompressed) offset.
84// If offset is 0 or positive it is the offset from the beginning of the file.
85// If the uncompressed size is known, the offset must be within the file.
86// If an offset outside the file is requested io.ErrUnexpectedEOF is returned.
87// If the offset is negative, it is interpreted as the distance from the end of the file,
88// where -1 represents the last byte.
89// If offset from the end of the file is requested, but size is unknown,
90// ErrUnsupported will be returned.
91func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err error) {
92 if i.TotalUncompressed < 0 {
93 return 0, 0, ErrCorrupt
94 }
95 if offset < 0 {
96 offset = i.TotalUncompressed + offset
97 if offset < 0 {
98 return 0, 0, io.ErrUnexpectedEOF
99 }
100 }
101 if offset > i.TotalUncompressed {
102 return 0, 0, io.ErrUnexpectedEOF
103 }
104 if len(i.info) > 200 {
105 n := sort.Search(len(i.info), func(n int) bool {
106 return i.info[n].uncompressedOffset > offset
107 })
108 if n == 0 {
109 n = 1
110 }
111 return i.info[n-1].compressedOffset, i.info[n-1].uncompressedOffset, nil
112 }
113 for _, info := range i.info {
114 if info.uncompressedOffset > offset {
115 break
116 }
117 compressedOff = info.compressedOffset
118 uncompressedOff = info.uncompressedOffset
119 }
120 return compressedOff, uncompressedOff, nil
121}
122
123// reduce to stay below maxIndexEntries
124func (i *Index) reduce() {
125 if len(i.info) < maxIndexEntries && i.estBlockUncomp >= 1<<20 {
126 return
127 }
128
129 // Algorithm, keep 1, remove removeN entries...
130 removeN := (len(i.info) + 1) / maxIndexEntries
131 src := i.info
132 j := 0
133
134 // Each block should be at least 1MB, but don't reduce below 1000 entries.
135 for i.estBlockUncomp*(int64(removeN)+1) < 1<<20 && len(i.info)/(removeN+1) > 1000 {
136 removeN++
137 }
138 for idx := 0; idx < len(src); idx++ {
139 i.info[j] = src[idx]
140 j++
141 idx += removeN
142 }
143 i.info = i.info[:j]
144 // Update maxblock estimate.
145 i.estBlockUncomp += i.estBlockUncomp * int64(removeN)
146}
147
148func (i *Index) appendTo(b []byte, uncompTotal, compTotal int64) []byte {
149 i.reduce()
150 var tmp [binary.MaxVarintLen64]byte
151
152 initSize := len(b)
153 // We make the start a skippable header+size.
154 b = append(b, ChunkTypeIndex, 0, 0, 0)
155 b = append(b, []byte(S2IndexHeader)...)
156 // Total Uncompressed size
157 n := binary.PutVarint(tmp[:], uncompTotal)
158 b = append(b, tmp[:n]...)
159 // Total Compressed size
160 n = binary.PutVarint(tmp[:], compTotal)
161 b = append(b, tmp[:n]...)
162 // Put EstBlockUncomp size
163 n = binary.PutVarint(tmp[:], i.estBlockUncomp)
164 b = append(b, tmp[:n]...)
165 // Put length
166 n = binary.PutVarint(tmp[:], int64(len(i.info)))
167 b = append(b, tmp[:n]...)
168
169 // Check if we should add uncompressed offsets
170 var hasUncompressed byte
171 for idx, info := range i.info {
172 if idx == 0 {
173 if info.uncompressedOffset != 0 {
174 hasUncompressed = 1
175 break
176 }
177 continue
178 }
179 if info.uncompressedOffset != i.info[idx-1].uncompressedOffset+i.estBlockUncomp {
180 hasUncompressed = 1
181 break
182 }
183 }
184 b = append(b, hasUncompressed)
185
186 // Add each entry
187 if hasUncompressed == 1 {
188 for idx, info := range i.info {
189 uOff := info.uncompressedOffset
190 if idx > 0 {
191 prev := i.info[idx-1]
192 uOff -= prev.uncompressedOffset + (i.estBlockUncomp)
193 }
194 n = binary.PutVarint(tmp[:], uOff)
195 b = append(b, tmp[:n]...)
196 }
197 }
198
199 // Initial compressed size estimate.
200 cPredict := i.estBlockUncomp / 2
201
202 for idx, info := range i.info {
203 cOff := info.compressedOffset
204 if idx > 0 {
205 prev := i.info[idx-1]
206 cOff -= prev.compressedOffset + cPredict
207 // Update compressed size prediction, with half the error.
208 cPredict += cOff / 2
209 }
210 n = binary.PutVarint(tmp[:], cOff)
211 b = append(b, tmp[:n]...)
212 }
213
214 // Add Total Size.
215 // Stored as fixed size for easier reading.
216 binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)-initSize+4+len(S2IndexTrailer)))
217 b = append(b, tmp[:4]...)
218 // Trailer
219 b = append(b, []byte(S2IndexTrailer)...)
220
221 // Update size
222 chunkLen := len(b) - initSize - skippableFrameHeader
223 b[initSize+1] = uint8(chunkLen >> 0)
224 b[initSize+2] = uint8(chunkLen >> 8)
225 b[initSize+3] = uint8(chunkLen >> 16)
226 //fmt.Printf("chunklen: 0x%x Uncomp:%d, Comp:%d\n", chunkLen, uncompTotal, compTotal)
227 return b
228}
229
230// Load a binary index.
231// A zero value Index can be used or a previous one can be reused.
232func (i *Index) Load(b []byte) ([]byte, error) {
233 if len(b) <= 4+len(S2IndexHeader)+len(S2IndexTrailer) {
234 return b, io.ErrUnexpectedEOF
235 }
236 if b[0] != ChunkTypeIndex {
237 return b, ErrCorrupt
238 }
239 chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
240 b = b[4:]
241
242 // Validate we have enough...
243 if len(b) < chunkLen {
244 return b, io.ErrUnexpectedEOF
245 }
246 if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
247 return b, ErrUnsupported
248 }
249 b = b[len(S2IndexHeader):]
250
251 // Total Uncompressed
252 if v, n := binary.Varint(b); n <= 0 || v < 0 {
253 return b, ErrCorrupt
254 } else {
255 i.TotalUncompressed = v
256 b = b[n:]
257 }
258
259 // Total Compressed
260 if v, n := binary.Varint(b); n <= 0 {
261 return b, ErrCorrupt
262 } else {
263 i.TotalCompressed = v
264 b = b[n:]
265 }
266
267 // Read EstBlockUncomp
268 if v, n := binary.Varint(b); n <= 0 {
269 return b, ErrCorrupt
270 } else {
271 if v < 0 {
272 return b, ErrCorrupt
273 }
274 i.estBlockUncomp = v
275 b = b[n:]
276 }
277
278 var entries int
279 if v, n := binary.Varint(b); n <= 0 {
280 return b, ErrCorrupt
281 } else {
282 if v < 0 || v > maxIndexEntries {
283 return b, ErrCorrupt
284 }
285 entries = int(v)
286 b = b[n:]
287 }
288 if cap(i.info) < entries {
289 i.allocInfos(entries)
290 }
291 i.info = i.info[:entries]
292
293 if len(b) < 1 {
294 return b, io.ErrUnexpectedEOF
295 }
296 hasUncompressed := b[0]
297 b = b[1:]
298 if hasUncompressed&1 != hasUncompressed {
299 return b, ErrCorrupt
300 }
301
302 // Add each uncompressed entry
303 for idx := range i.info {
304 var uOff int64
305 if hasUncompressed != 0 {
306 // Load delta
307 if v, n := binary.Varint(b); n <= 0 {
308 return b, ErrCorrupt
309 } else {
310 uOff = v
311 b = b[n:]
312 }
313 }
314
315 if idx > 0 {
316 prev := i.info[idx-1].uncompressedOffset
317 uOff += prev + (i.estBlockUncomp)
318 if uOff <= prev {
319 return b, ErrCorrupt
320 }
321 }
322 if uOff < 0 {
323 return b, ErrCorrupt
324 }
325 i.info[idx].uncompressedOffset = uOff
326 }
327
328 // Initial compressed size estimate.
329 cPredict := i.estBlockUncomp / 2
330
331 // Add each compressed entry
332 for idx := range i.info {
333 var cOff int64
334 if v, n := binary.Varint(b); n <= 0 {
335 return b, ErrCorrupt
336 } else {
337 cOff = v
338 b = b[n:]
339 }
340
341 if idx > 0 {
342 // Update compressed size prediction, with half the error.
343 cPredictNew := cPredict + cOff/2
344
345 prev := i.info[idx-1].compressedOffset
346 cOff += prev + cPredict
347 if cOff <= prev {
348 return b, ErrCorrupt
349 }
350 cPredict = cPredictNew
351 }
352 if cOff < 0 {
353 return b, ErrCorrupt
354 }
355 i.info[idx].compressedOffset = cOff
356 }
357 if len(b) < 4+len(S2IndexTrailer) {
358 return b, io.ErrUnexpectedEOF
359 }
360 // Skip size...
361 b = b[4:]
362
363 // Check trailer...
364 if !bytes.Equal(b[:len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
365 return b, ErrCorrupt
366 }
367 return b[len(S2IndexTrailer):], nil
368}
369
370// LoadStream will load an index from the end of the supplied stream.
371// ErrUnsupported will be returned if the signature cannot be found.
372// ErrCorrupt will be returned if unexpected values are found.
373// io.ErrUnexpectedEOF is returned if there are too few bytes.
374// IO errors are returned as-is.
375func (i *Index) LoadStream(rs io.ReadSeeker) error {
376 // Go to end.
377 _, err := rs.Seek(-10, io.SeekEnd)
378 if err != nil {
379 return err
380 }
381 var tmp [10]byte
382 _, err = io.ReadFull(rs, tmp[:])
383 if err != nil {
384 return err
385 }
386 // Check trailer...
387 if !bytes.Equal(tmp[4:4+len(S2IndexTrailer)], []byte(S2IndexTrailer)) {
388 return ErrUnsupported
389 }
390 sz := binary.LittleEndian.Uint32(tmp[:4])
391 if sz > maxChunkSize+skippableFrameHeader {
392 return ErrCorrupt
393 }
394 _, err = rs.Seek(-int64(sz), io.SeekEnd)
395 if err != nil {
396 return err
397 }
398
399 // Read index.
400 buf := make([]byte, sz)
401 _, err = io.ReadFull(rs, buf)
402 if err != nil {
403 return err
404 }
405 _, err = i.Load(buf)
406 return err
407}
408
409// IndexStream will return an index for a stream.
410// The stream structure will be checked, but
411// data within blocks is not verified.
412// The returned index can either be appended to the end of the stream
413// or stored separately.
414func IndexStream(r io.Reader) ([]byte, error) {
415 var i Index
416 var buf [maxChunkSize]byte
417 var readHeader bool
418 for {
419 _, err := io.ReadFull(r, buf[:4])
420 if err != nil {
421 if err == io.EOF {
422 return i.appendTo(nil, i.TotalUncompressed, i.TotalCompressed), nil
423 }
424 return nil, err
425 }
426 // Start of this chunk.
427 startChunk := i.TotalCompressed
428 i.TotalCompressed += 4
429
430 chunkType := buf[0]
431 if !readHeader {
432 if chunkType != chunkTypeStreamIdentifier {
433 return nil, ErrCorrupt
434 }
435 readHeader = true
436 }
437 chunkLen := int(buf[1]) | int(buf[2])<<8 | int(buf[3])<<16
438 if chunkLen < checksumSize {
439 return nil, ErrCorrupt
440 }
441
442 i.TotalCompressed += int64(chunkLen)
443 _, err = io.ReadFull(r, buf[:chunkLen])
444 if err != nil {
445 return nil, io.ErrUnexpectedEOF
446 }
447 // The chunk types are specified at
448 // https://github.com/google/snappy/blob/master/framing_format.txt
449 switch chunkType {
450 case chunkTypeCompressedData:
451 // Section 4.2. Compressed data (chunk type 0x00).
452 // Skip checksum.
453 dLen, err := DecodedLen(buf[checksumSize:])
454 if err != nil {
455 return nil, err
456 }
457 if dLen > maxBlockSize {
458 return nil, ErrCorrupt
459 }
460 if i.estBlockUncomp == 0 {
461 // Use first block for estimate...
462 i.estBlockUncomp = int64(dLen)
463 }
464 err = i.add(startChunk, i.TotalUncompressed)
465 if err != nil {
466 return nil, err
467 }
468 i.TotalUncompressed += int64(dLen)
469 continue
470 case chunkTypeUncompressedData:
471 n2 := chunkLen - checksumSize
472 if n2 > maxBlockSize {
473 return nil, ErrCorrupt
474 }
475 if i.estBlockUncomp == 0 {
476 // Use first block for estimate...
477 i.estBlockUncomp = int64(n2)
478 }
479 err = i.add(startChunk, i.TotalUncompressed)
480 if err != nil {
481 return nil, err
482 }
483 i.TotalUncompressed += int64(n2)
484 continue
485 case chunkTypeStreamIdentifier:
486 // Section 4.1. Stream identifier (chunk type 0xff).
487 if chunkLen != len(magicBody) {
488 return nil, ErrCorrupt
489 }
490
491 if string(buf[:len(magicBody)]) != magicBody {
492 if string(buf[:len(magicBody)]) != magicBodySnappy {
493 return nil, ErrCorrupt
494 }
495 }
496
497 continue
498 }
499
500 if chunkType <= 0x7f {
501 // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
502 return nil, ErrUnsupported
503 }
504 if chunkLen > maxChunkSize {
505 return nil, ErrUnsupported
506 }
507 // Section 4.4 Padding (chunk type 0xfe).
508 // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
509 }
510}
511
512// JSON returns the index as JSON text.
513func (i *Index) JSON() []byte {
514 type offset struct {
515 CompressedOffset int64 `json:"compressed"`
516 UncompressedOffset int64 `json:"uncompressed"`
517 }
518 x := struct {
519 TotalUncompressed int64 `json:"total_uncompressed"` // Total Uncompressed size if known. Will be -1 if unknown.
520 TotalCompressed int64 `json:"total_compressed"` // Total Compressed size if known. Will be -1 if unknown.
521 Offsets []offset `json:"offsets"`
522 EstBlockUncomp int64 `json:"est_block_uncompressed"`
523 }{
524 TotalUncompressed: i.TotalUncompressed,
525 TotalCompressed: i.TotalCompressed,
526 EstBlockUncomp: i.estBlockUncomp,
527 }
528 for _, v := range i.info {
529 x.Offsets = append(x.Offsets, offset{CompressedOffset: v.compressedOffset, UncompressedOffset: v.uncompressedOffset})
530 }
531 b, _ := json.MarshalIndent(x, "", " ")
532 return b
533}
534
535// RemoveIndexHeaders will trim all headers and trailers from a given index.
536// This is expected to save 20 bytes.
537// These can be restored using RestoreIndexHeaders.
538// This removes a layer of security, but is the most compact representation.
539// Returns nil if headers contains errors.
540// The returned slice references the provided slice.
541func RemoveIndexHeaders(b []byte) []byte {
542 const save = 4 + len(S2IndexHeader) + len(S2IndexTrailer) + 4
543 if len(b) <= save {
544 return nil
545 }
546 if b[0] != ChunkTypeIndex {
547 return nil
548 }
549 chunkLen := int(b[1]) | int(b[2])<<8 | int(b[3])<<16
550 b = b[4:]
551
552 // Validate we have enough...
553 if len(b) < chunkLen {
554 return nil
555 }
556 b = b[:chunkLen]
557
558 if !bytes.Equal(b[:len(S2IndexHeader)], []byte(S2IndexHeader)) {
559 return nil
560 }
561 b = b[len(S2IndexHeader):]
562 if !bytes.HasSuffix(b, []byte(S2IndexTrailer)) {
563 return nil
564 }
565 b = bytes.TrimSuffix(b, []byte(S2IndexTrailer))
566
567 if len(b) < 4 {
568 return nil
569 }
570 return b[:len(b)-4]
571}
572
573// RestoreIndexHeaders will index restore headers removed by RemoveIndexHeaders.
574// No error checking is performed on the input.
575// If a 0 length slice is sent, it is returned without modification.
576func RestoreIndexHeaders(in []byte) []byte {
577 if len(in) == 0 {
578 return in
579 }
580 b := make([]byte, 0, 4+len(S2IndexHeader)+len(in)+len(S2IndexTrailer)+4)
581 b = append(b, ChunkTypeIndex, 0, 0, 0)
582 b = append(b, []byte(S2IndexHeader)...)
583 b = append(b, in...)
584
585 var tmp [4]byte
586 binary.LittleEndian.PutUint32(tmp[:], uint32(len(b)+4+len(S2IndexTrailer)))
587 b = append(b, tmp[:4]...)
588 // Trailer
589 b = append(b, []byte(S2IndexTrailer)...)
590
591 chunkLen := len(b) - skippableFrameHeader
592 b[1] = uint8(chunkLen >> 0)
593 b[2] = uint8(chunkLen >> 8)
594 b[3] = uint8(chunkLen >> 16)
595 return b
596}
diff --git a/vendor/github.com/klauspost/compress/s2/lz4convert.go b/vendor/github.com/klauspost/compress/s2/lz4convert.go
new file mode 100644
index 0000000..46ed908
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/lz4convert.go
@@ -0,0 +1,585 @@
1// Copyright (c) 2022 Klaus Post. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package s2
6
7import (
8 "encoding/binary"
9 "errors"
10 "fmt"
11)
12
// LZ4Converter converts LZ4 blocks, in the block format documented at
// https://github.com/lz4/lz4/blob/dev/doc/lz4_Block_format.md
// It has no fields; the zero value is ready to use.
type LZ4Converter struct{}
17
// ErrDstTooSmall is the error reported when the supplied destination
// does not have enough room for the converted output.
var ErrDstTooSmall error = errors.New("s2: destination too small")
20
21// ConvertBlock will convert an LZ4 block and append it as an S2
22// block without block length to dst.
23// The uncompressed size is returned as well.
24// dst must have capacity to contain the entire compressed block.
25func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) {
26 if len(src) == 0 {
27 return dst, 0, nil
28 }
29 const debug = false
30 const inline = true
31 const lz4MinMatch = 4
32
33 s, d := 0, len(dst)
34 dst = dst[:cap(dst)]
35 if !debug && hasAmd64Asm {
36 res, sz := cvtLZ4BlockAsm(dst[d:], src)
37 if res < 0 {
38 const (
39 errCorrupt = -1
40 errDstTooSmall = -2
41 )
42 switch res {
43 case errCorrupt:
44 return nil, 0, ErrCorrupt
45 case errDstTooSmall:
46 return nil, 0, ErrDstTooSmall
47 default:
48 return nil, 0, fmt.Errorf("unexpected result: %d", res)
49 }
50 }
51 if d+sz > len(dst) {
52 return nil, 0, ErrDstTooSmall
53 }
54 return dst[:d+sz], res, nil
55 }
56
57 dLimit := len(dst) - 10
58 var lastOffset uint16
59 var uncompressed int
60 if debug {
61 fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
62 }
63
64 for {
65 if s >= len(src) {
66 return dst[:d], 0, ErrCorrupt
67 }
68 // Read literal info
69 token := src[s]
70 ll := int(token >> 4)
71 ml := int(lz4MinMatch + (token & 0xf))
72
73 // If upper nibble is 15, literal length is extended
74 if token >= 0xf0 {
75 for {
76 s++
77 if s >= len(src) {
78 if debug {
79 fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
80 }
81 return dst[:d], 0, ErrCorrupt
82 }
83 val := src[s]
84 ll += int(val)
85 if val != 255 {
86 break
87 }
88 }
89 }
90 // Skip past token
91 if s+ll >= len(src) {
92 if debug {
93 fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
94 }
95 return nil, 0, ErrCorrupt
96 }
97 s++
98 if ll > 0 {
99 if d+ll > dLimit {
100 return nil, 0, ErrDstTooSmall
101 }
102 if debug {
103 fmt.Printf("emit %d literals\n", ll)
104 }
105 d += emitLiteralGo(dst[d:], src[s:s+ll])
106 s += ll
107 uncompressed += ll
108 }
109
110 // Check if we are done...
111 if s == len(src) && ml == lz4MinMatch {
112 break
113 }
114 // 2 byte offset
115 if s >= len(src)-2 {
116 if debug {
117 fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
118 }
119 return nil, 0, ErrCorrupt
120 }
121 offset := binary.LittleEndian.Uint16(src[s:])
122 s += 2
123 if offset == 0 {
124 if debug {
125 fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
126 }
127 return nil, 0, ErrCorrupt
128 }
129 if int(offset) > uncompressed {
130 if debug {
131 fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
132 }
133 return nil, 0, ErrCorrupt
134 }
135
136 if ml == lz4MinMatch+15 {
137 for {
138 if s >= len(src) {
139 if debug {
140 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
141 }
142 return nil, 0, ErrCorrupt
143 }
144 val := src[s]
145 s++
146 ml += int(val)
147 if val != 255 {
148 if s >= len(src) {
149 if debug {
150 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
151 }
152 return nil, 0, ErrCorrupt
153 }
154 break
155 }
156 }
157 }
158 if offset == lastOffset {
159 if debug {
160 fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset)
161 }
162 if !inline {
163 d += emitRepeat16(dst[d:], offset, ml)
164 } else {
165 length := ml
166 dst := dst[d:]
167 for len(dst) > 5 {
168 // Repeat offset, make length cheaper
169 length -= 4
170 if length <= 4 {
171 dst[0] = uint8(length)<<2 | tagCopy1
172 dst[1] = 0
173 d += 2
174 break
175 }
176 if length < 8 && offset < 2048 {
177 // Encode WITH offset
178 dst[1] = uint8(offset)
179 dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
180 d += 2
181 break
182 }
183 if length < (1<<8)+4 {
184 length -= 4
185 dst[2] = uint8(length)
186 dst[1] = 0
187 dst[0] = 5<<2 | tagCopy1
188 d += 3
189 break
190 }
191 if length < (1<<16)+(1<<8) {
192 length -= 1 << 8
193 dst[3] = uint8(length >> 8)
194 dst[2] = uint8(length >> 0)
195 dst[1] = 0
196 dst[0] = 6<<2 | tagCopy1
197 d += 4
198 break
199 }
200 const maxRepeat = (1 << 24) - 1
201 length -= 1 << 16
202 left := 0
203 if length > maxRepeat {
204 left = length - maxRepeat + 4
205 length = maxRepeat - 4
206 }
207 dst[4] = uint8(length >> 16)
208 dst[3] = uint8(length >> 8)
209 dst[2] = uint8(length >> 0)
210 dst[1] = 0
211 dst[0] = 7<<2 | tagCopy1
212 if left > 0 {
213 d += 5 + emitRepeat16(dst[5:], offset, left)
214 break
215 }
216 d += 5
217 break
218 }
219 }
220 } else {
221 if debug {
222 fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
223 }
224 if !inline {
225 d += emitCopy16(dst[d:], offset, ml)
226 } else {
227 length := ml
228 dst := dst[d:]
229 for len(dst) > 5 {
230 // Offset no more than 2 bytes.
231 if length > 64 {
232 off := 3
233 if offset < 2048 {
234 // emit 8 bytes as tagCopy1, rest as repeats.
235 dst[1] = uint8(offset)
236 dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
237 length -= 8
238 off = 2
239 } else {
240 // Emit a length 60 copy, encoded as 3 bytes.
241 // Emit remaining as repeat value (minimum 4 bytes).
242 dst[2] = uint8(offset >> 8)
243 dst[1] = uint8(offset)
244 dst[0] = 59<<2 | tagCopy2
245 length -= 60
246 }
247 // Emit remaining as repeats, at least 4 bytes remain.
248 d += off + emitRepeat16(dst[off:], offset, length)
249 break
250 }
251 if length >= 12 || offset >= 2048 {
252 // Emit the remaining copy, encoded as 3 bytes.
253 dst[2] = uint8(offset >> 8)
254 dst[1] = uint8(offset)
255 dst[0] = uint8(length-1)<<2 | tagCopy2
256 d += 3
257 break
258 }
259 // Emit the remaining copy, encoded as 2 bytes.
260 dst[1] = uint8(offset)
261 dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
262 d += 2
263 break
264 }
265 }
266 lastOffset = offset
267 }
268 uncompressed += ml
269 if d > dLimit {
270 return nil, 0, ErrDstTooSmall
271 }
272 }
273
274 return dst[:d], uncompressed, nil
275}
276
// ConvertBlockSnappy will convert an LZ4 block and append it
// as a Snappy block without block length to dst.
// The uncompressed size is returned as well.
// dst must have capacity to contain the entire compressed block.
//
// Output is written into the spare capacity of dst, after its current
// length. An empty src returns dst unchanged with size 0.
// Matches are emitted as plain copies of at most 64 bytes each; no repeat
// codes are produced.
//
// ErrCorrupt is returned for malformed input and ErrDstTooSmall when dst
// does not have enough spare capacity. On error the returned slice is nil.
func (l *LZ4Converter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) {
	if len(src) == 0 {
		return dst, 0, nil
	}
	const debug = false
	const lz4MinMatch = 4

	// s is the read position in src, d the write position in dst.
	s, d := 0, len(dst)
	dst = dst[:cap(dst)]
	// Use assembly when possible
	if !debug && hasAmd64Asm {
		// res is the uncompressed size, or a negative error code.
		// sz is the number of bytes written to dst[d:].
		res, sz := cvtLZ4BlockSnappyAsm(dst[d:], src)
		if res < 0 {
			const (
				errCorrupt     = -1
				errDstTooSmall = -2
			)
			switch res {
			case errCorrupt:
				return nil, 0, ErrCorrupt
			case errDstTooSmall:
				return nil, 0, ErrDstTooSmall
			default:
				return nil, 0, fmt.Errorf("unexpected result: %d", res)
			}
		}
		if d+sz > len(dst) {
			return nil, 0, ErrDstTooSmall
		}
		return dst[:d+sz], res, nil
	}

	// Keep a 10 byte margin at the end of dst for the emit code below.
	dLimit := len(dst) - 10
	var uncompressed int
	if debug {
		fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
	}

	for {
		if s >= len(src) {
			return nil, 0, ErrCorrupt
		}
		// Read literal info
		token := src[s]
		ll := int(token >> 4)
		ml := int(lz4MinMatch + (token & 0xf))

		// If upper nibble is 15, literal length is extended
		if token >= 0xf0 {
			for {
				s++
				if s >= len(src) {
					if debug {
						fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
					}
					return nil, 0, ErrCorrupt
				}
				val := src[s]
				ll += int(val)
				// A byte below 255 terminates the extended length.
				if val != 255 {
					break
				}
			}
		}
		// Skip past token
		if s+ll >= len(src) {
			if debug {
				fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
			}
			return nil, 0, ErrCorrupt
		}
		s++
		if ll > 0 {
			if d+ll > dLimit {
				return nil, 0, ErrDstTooSmall
			}
			if debug {
				fmt.Printf("emit %d literals\n", ll)
			}
			d += emitLiteralGo(dst[d:], src[s:s+ll])
			s += ll
			uncompressed += ll
		}

		// Check if we are done...
		// The block ends when all input is consumed and the token carried
		// no match length.
		if s == len(src) && ml == lz4MinMatch {
			break
		}
		// 2 byte offset
		if s >= len(src)-2 {
			if debug {
				fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
			}
			return nil, 0, ErrCorrupt
		}
		offset := binary.LittleEndian.Uint16(src[s:])
		s += 2
		if offset == 0 {
			if debug {
				fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
			}
			return nil, 0, ErrCorrupt
		}
		// The match may not reference data before the start of the output.
		if int(offset) > uncompressed {
			if debug {
				fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
			}
			return nil, 0, ErrCorrupt
		}

		// A match length nibble of 15 means the length is extended.
		if ml == lz4MinMatch+15 {
			for {
				if s >= len(src) {
					if debug {
						fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
					}
					return nil, 0, ErrCorrupt
				}
				val := src[s]
				s++
				ml += int(val)
				if val != 255 {
					// More input must follow a match.
					if s >= len(src) {
						if debug {
							fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
						}
						return nil, 0, ErrCorrupt
					}
					break
				}
			}
		}
		if debug {
			fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
		}
		length := ml
		// d += emitCopyNoRepeat(dst[d:], int(offset), ml)
		// Each pass writes at most 3 bytes and is preceded by a space check.
		for length > 0 {
			if d >= dLimit {
				return nil, 0, ErrDstTooSmall
			}

			// Offset no more than 2 bytes.
			if length > 64 {
				// Emit a length 64 copy, encoded as 3 bytes.
				dst[d+2] = uint8(offset >> 8)
				dst[d+1] = uint8(offset)
				dst[d+0] = 63<<2 | tagCopy2
				length -= 64
				d += 3
				continue
			}
			if length >= 12 || offset >= 2048 || length < 4 {
				// Emit the remaining copy, encoded as 3 bytes.
				dst[d+2] = uint8(offset >> 8)
				dst[d+1] = uint8(offset)
				dst[d+0] = uint8(length-1)<<2 | tagCopy2
				d += 3
				break
			}
			// Emit the remaining copy, encoded as 2 bytes.
			dst[d+1] = uint8(offset)
			dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
			d += 2
			break
		}
		uncompressed += ml
		if d > dLimit {
			return nil, 0, ErrDstTooSmall
		}
	}

	return dst[:d], uncompressed, nil
}
455
456// emitRepeat writes a repeat chunk and returns the number of bytes written.
457// Length must be at least 4 and < 1<<24
458func emitRepeat16(dst []byte, offset uint16, length int) int {
459 // Repeat offset, make length cheaper
460 length -= 4
461 if length <= 4 {
462 dst[0] = uint8(length)<<2 | tagCopy1
463 dst[1] = 0
464 return 2
465 }
466 if length < 8 && offset < 2048 {
467 // Encode WITH offset
468 dst[1] = uint8(offset)
469 dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
470 return 2
471 }
472 if length < (1<<8)+4 {
473 length -= 4
474 dst[2] = uint8(length)
475 dst[1] = 0
476 dst[0] = 5<<2 | tagCopy1
477 return 3
478 }
479 if length < (1<<16)+(1<<8) {
480 length -= 1 << 8
481 dst[3] = uint8(length >> 8)
482 dst[2] = uint8(length >> 0)
483 dst[1] = 0
484 dst[0] = 6<<2 | tagCopy1
485 return 4
486 }
487 const maxRepeat = (1 << 24) - 1
488 length -= 1 << 16
489 left := 0
490 if length > maxRepeat {
491 left = length - maxRepeat + 4
492 length = maxRepeat - 4
493 }
494 dst[4] = uint8(length >> 16)
495 dst[3] = uint8(length >> 8)
496 dst[2] = uint8(length >> 0)
497 dst[1] = 0
498 dst[0] = 7<<2 | tagCopy1
499 if left > 0 {
500 return 5 + emitRepeat16(dst[5:], offset, left)
501 }
502 return 5
503}
504
505// emitCopy writes a copy chunk and returns the number of bytes written.
506//
507// It assumes that:
508//
509// dst is long enough to hold the encoded bytes
510// 1 <= offset && offset <= math.MaxUint16
511// 4 <= length && length <= math.MaxUint32
512func emitCopy16(dst []byte, offset uint16, length int) int {
513 // Offset no more than 2 bytes.
514 if length > 64 {
515 off := 3
516 if offset < 2048 {
517 // emit 8 bytes as tagCopy1, rest as repeats.
518 dst[1] = uint8(offset)
519 dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
520 length -= 8
521 off = 2
522 } else {
523 // Emit a length 60 copy, encoded as 3 bytes.
524 // Emit remaining as repeat value (minimum 4 bytes).
525 dst[2] = uint8(offset >> 8)
526 dst[1] = uint8(offset)
527 dst[0] = 59<<2 | tagCopy2
528 length -= 60
529 }
530 // Emit remaining as repeats, at least 4 bytes remain.
531 return off + emitRepeat16(dst[off:], offset, length)
532 }
533 if length >= 12 || offset >= 2048 {
534 // Emit the remaining copy, encoded as 3 bytes.
535 dst[2] = uint8(offset >> 8)
536 dst[1] = uint8(offset)
537 dst[0] = uint8(length-1)<<2 | tagCopy2
538 return 3
539 }
540 // Emit the remaining copy, encoded as 2 bytes.
541 dst[1] = uint8(offset)
542 dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
543 return 2
544}
545
546// emitLiteral writes a literal chunk and returns the number of bytes written.
547//
548// It assumes that:
549//
550// dst is long enough to hold the encoded bytes
551// 0 <= len(lit) && len(lit) <= math.MaxUint32
552func emitLiteralGo(dst, lit []byte) int {
553 if len(lit) == 0 {
554 return 0
555 }
556 i, n := 0, uint(len(lit)-1)
557 switch {
558 case n < 60:
559 dst[0] = uint8(n)<<2 | tagLiteral
560 i = 1
561 case n < 1<<8:
562 dst[1] = uint8(n)
563 dst[0] = 60<<2 | tagLiteral
564 i = 2
565 case n < 1<<16:
566 dst[2] = uint8(n >> 8)
567 dst[1] = uint8(n)
568 dst[0] = 61<<2 | tagLiteral
569 i = 3
570 case n < 1<<24:
571 dst[3] = uint8(n >> 16)
572 dst[2] = uint8(n >> 8)
573 dst[1] = uint8(n)
574 dst[0] = 62<<2 | tagLiteral
575 i = 4
576 default:
577 dst[4] = uint8(n >> 24)
578 dst[3] = uint8(n >> 16)
579 dst[2] = uint8(n >> 8)
580 dst[1] = uint8(n)
581 dst[0] = 63<<2 | tagLiteral
582 i = 5
583 }
584 return i + copy(dst[i:], lit)
585}
diff --git a/vendor/github.com/klauspost/compress/s2/lz4sconvert.go b/vendor/github.com/klauspost/compress/s2/lz4sconvert.go
new file mode 100644
index 0000000..000f397
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/lz4sconvert.go
@@ -0,0 +1,467 @@
1// Copyright (c) 2022 Klaus Post. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package s2
6
7import (
8 "encoding/binary"
9 "fmt"
10)
11
// LZ4sConverter provides conversion from LZ4s
// (Intel modified LZ4 blocks), described in
// https://cdrdv2-public.intel.com/743912/743912-qat-programmers-guide-v2.0.pdf
//
// LZ4s is a variant of the LZ4 block format and should be considered an
// intermediate compressed block format. It is selected when the application
// sets the compType to CPA_DC_LZ4S in CpaDcSessionSetupData.
// The LZ4s block returned by the Intel® QAT hardware can be used by external
// software post-processing to generate other compressed data formats.
//
// LZ4s uses the same high-level formatting as the LZ4 block format, with the
// following encoding change:
// For Min Match of 4 bytes, Copy length value 1-15 means length 4-18 with 18 bytes adding an extra byte.
// ONLY "Min match of 4 bytes" is supported.
//
// The type has no fields; the zero value is ready to use.
type LZ4sConverter struct{}
25
26// ConvertBlock will convert an LZ4s block and append it as an S2
27// block without block length to dst.
28// The uncompressed size is returned as well.
29// dst must have capacity to contain the entire compressed block.
30func (l *LZ4sConverter) ConvertBlock(dst, src []byte) ([]byte, int, error) {
31 if len(src) == 0 {
32 return dst, 0, nil
33 }
34 const debug = false
35 const inline = true
36 const lz4MinMatch = 3
37
38 s, d := 0, len(dst)
39 dst = dst[:cap(dst)]
40 if !debug && hasAmd64Asm {
41 res, sz := cvtLZ4sBlockAsm(dst[d:], src)
42 if res < 0 {
43 const (
44 errCorrupt = -1
45 errDstTooSmall = -2
46 )
47 switch res {
48 case errCorrupt:
49 return nil, 0, ErrCorrupt
50 case errDstTooSmall:
51 return nil, 0, ErrDstTooSmall
52 default:
53 return nil, 0, fmt.Errorf("unexpected result: %d", res)
54 }
55 }
56 if d+sz > len(dst) {
57 return nil, 0, ErrDstTooSmall
58 }
59 return dst[:d+sz], res, nil
60 }
61
62 dLimit := len(dst) - 10
63 var lastOffset uint16
64 var uncompressed int
65 if debug {
66 fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
67 }
68
69 for {
70 if s >= len(src) {
71 return dst[:d], 0, ErrCorrupt
72 }
73 // Read literal info
74 token := src[s]
75 ll := int(token >> 4)
76 ml := int(lz4MinMatch + (token & 0xf))
77
78 // If upper nibble is 15, literal length is extended
79 if token >= 0xf0 {
80 for {
81 s++
82 if s >= len(src) {
83 if debug {
84 fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
85 }
86 return dst[:d], 0, ErrCorrupt
87 }
88 val := src[s]
89 ll += int(val)
90 if val != 255 {
91 break
92 }
93 }
94 }
95 // Skip past token
96 if s+ll >= len(src) {
97 if debug {
98 fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
99 }
100 return nil, 0, ErrCorrupt
101 }
102 s++
103 if ll > 0 {
104 if d+ll > dLimit {
105 return nil, 0, ErrDstTooSmall
106 }
107 if debug {
108 fmt.Printf("emit %d literals\n", ll)
109 }
110 d += emitLiteralGo(dst[d:], src[s:s+ll])
111 s += ll
112 uncompressed += ll
113 }
114
115 // Check if we are done...
116 if ml == lz4MinMatch {
117 if s == len(src) {
118 break
119 }
120 // 0 bytes.
121 continue
122 }
123 // 2 byte offset
124 if s >= len(src)-2 {
125 if debug {
126 fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
127 }
128 return nil, 0, ErrCorrupt
129 }
130 offset := binary.LittleEndian.Uint16(src[s:])
131 s += 2
132 if offset == 0 {
133 if debug {
134 fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
135 }
136 return nil, 0, ErrCorrupt
137 }
138 if int(offset) > uncompressed {
139 if debug {
140 fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
141 }
142 return nil, 0, ErrCorrupt
143 }
144
145 if ml == lz4MinMatch+15 {
146 for {
147 if s >= len(src) {
148 if debug {
149 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
150 }
151 return nil, 0, ErrCorrupt
152 }
153 val := src[s]
154 s++
155 ml += int(val)
156 if val != 255 {
157 if s >= len(src) {
158 if debug {
159 fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
160 }
161 return nil, 0, ErrCorrupt
162 }
163 break
164 }
165 }
166 }
167 if offset == lastOffset {
168 if debug {
169 fmt.Printf("emit repeat, length: %d, offset: %d\n", ml, offset)
170 }
171 if !inline {
172 d += emitRepeat16(dst[d:], offset, ml)
173 } else {
174 length := ml
175 dst := dst[d:]
176 for len(dst) > 5 {
177 // Repeat offset, make length cheaper
178 length -= 4
179 if length <= 4 {
180 dst[0] = uint8(length)<<2 | tagCopy1
181 dst[1] = 0
182 d += 2
183 break
184 }
185 if length < 8 && offset < 2048 {
186 // Encode WITH offset
187 dst[1] = uint8(offset)
188 dst[0] = uint8(offset>>8)<<5 | uint8(length)<<2 | tagCopy1
189 d += 2
190 break
191 }
192 if length < (1<<8)+4 {
193 length -= 4
194 dst[2] = uint8(length)
195 dst[1] = 0
196 dst[0] = 5<<2 | tagCopy1
197 d += 3
198 break
199 }
200 if length < (1<<16)+(1<<8) {
201 length -= 1 << 8
202 dst[3] = uint8(length >> 8)
203 dst[2] = uint8(length >> 0)
204 dst[1] = 0
205 dst[0] = 6<<2 | tagCopy1
206 d += 4
207 break
208 }
209 const maxRepeat = (1 << 24) - 1
210 length -= 1 << 16
211 left := 0
212 if length > maxRepeat {
213 left = length - maxRepeat + 4
214 length = maxRepeat - 4
215 }
216 dst[4] = uint8(length >> 16)
217 dst[3] = uint8(length >> 8)
218 dst[2] = uint8(length >> 0)
219 dst[1] = 0
220 dst[0] = 7<<2 | tagCopy1
221 if left > 0 {
222 d += 5 + emitRepeat16(dst[5:], offset, left)
223 break
224 }
225 d += 5
226 break
227 }
228 }
229 } else {
230 if debug {
231 fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
232 }
233 if !inline {
234 d += emitCopy16(dst[d:], offset, ml)
235 } else {
236 length := ml
237 dst := dst[d:]
238 for len(dst) > 5 {
239 // Offset no more than 2 bytes.
240 if length > 64 {
241 off := 3
242 if offset < 2048 {
243 // emit 8 bytes as tagCopy1, rest as repeats.
244 dst[1] = uint8(offset)
245 dst[0] = uint8(offset>>8)<<5 | uint8(8-4)<<2 | tagCopy1
246 length -= 8
247 off = 2
248 } else {
249 // Emit a length 60 copy, encoded as 3 bytes.
250 // Emit remaining as repeat value (minimum 4 bytes).
251 dst[2] = uint8(offset >> 8)
252 dst[1] = uint8(offset)
253 dst[0] = 59<<2 | tagCopy2
254 length -= 60
255 }
256 // Emit remaining as repeats, at least 4 bytes remain.
257 d += off + emitRepeat16(dst[off:], offset, length)
258 break
259 }
260 if length >= 12 || offset >= 2048 {
261 // Emit the remaining copy, encoded as 3 bytes.
262 dst[2] = uint8(offset >> 8)
263 dst[1] = uint8(offset)
264 dst[0] = uint8(length-1)<<2 | tagCopy2
265 d += 3
266 break
267 }
268 // Emit the remaining copy, encoded as 2 bytes.
269 dst[1] = uint8(offset)
270 dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
271 d += 2
272 break
273 }
274 }
275 lastOffset = offset
276 }
277 uncompressed += ml
278 if d > dLimit {
279 return nil, 0, ErrDstTooSmall
280 }
281 }
282
283 return dst[:d], uncompressed, nil
284}
285
// ConvertBlockSnappy will convert an LZ4s block and append it
// as a Snappy block without block length to dst.
// The uncompressed size is returned as well.
// dst must have capacity to contain the entire compressed block.
//
// Output is written into the spare capacity of dst, after its current
// length. An empty src returns dst unchanged with size 0.
// Matches are emitted as plain copies of at most 64 bytes each; no repeat
// codes are produced.
//
// ErrCorrupt is returned for malformed input and ErrDstTooSmall when dst
// does not have enough spare capacity. On error the returned slice is nil.
func (l *LZ4sConverter) ConvertBlockSnappy(dst, src []byte) ([]byte, int, error) {
	if len(src) == 0 {
		return dst, 0, nil
	}
	const debug = false
	const lz4MinMatch = 3

	// s is the read position in src, d the write position in dst.
	s, d := 0, len(dst)
	dst = dst[:cap(dst)]
	// Use assembly when possible
	if !debug && hasAmd64Asm {
		// res is the uncompressed size, or a negative error code.
		// sz is the number of bytes written to dst[d:].
		res, sz := cvtLZ4sBlockSnappyAsm(dst[d:], src)
		if res < 0 {
			const (
				errCorrupt     = -1
				errDstTooSmall = -2
			)
			switch res {
			case errCorrupt:
				return nil, 0, ErrCorrupt
			case errDstTooSmall:
				return nil, 0, ErrDstTooSmall
			default:
				return nil, 0, fmt.Errorf("unexpected result: %d", res)
			}
		}
		if d+sz > len(dst) {
			return nil, 0, ErrDstTooSmall
		}
		return dst[:d+sz], res, nil
	}

	// Keep a 10 byte margin at the end of dst for the emit code below.
	dLimit := len(dst) - 10
	var uncompressed int
	if debug {
		fmt.Printf("convert block start: len(src): %d, len(dst):%d \n", len(src), len(dst))
	}

	for {
		if s >= len(src) {
			return nil, 0, ErrCorrupt
		}
		// Read literal info
		token := src[s]
		ll := int(token >> 4)
		ml := int(lz4MinMatch + (token & 0xf))

		// If upper nibble is 15, literal length is extended
		if token >= 0xf0 {
			for {
				s++
				if s >= len(src) {
					if debug {
						fmt.Printf("error reading ll: s (%d) >= len(src) (%d)\n", s, len(src))
					}
					return nil, 0, ErrCorrupt
				}
				val := src[s]
				ll += int(val)
				// A byte below 255 terminates the extended length.
				if val != 255 {
					break
				}
			}
		}
		// Skip past token
		if s+ll >= len(src) {
			if debug {
				fmt.Printf("error literals: s+ll (%d+%d) >= len(src) (%d)\n", s, ll, len(src))
			}
			return nil, 0, ErrCorrupt
		}
		s++
		if ll > 0 {
			if d+ll > dLimit {
				return nil, 0, ErrDstTooSmall
			}
			if debug {
				fmt.Printf("emit %d literals\n", ll)
			}
			d += emitLiteralGo(dst[d:], src[s:s+ll])
			s += ll
			uncompressed += ll
		}

		// Check if we are done...
		// A zero match length nibble means this token has no match: stop if
		// the input is exhausted, otherwise go on to the next token.
		if ml == lz4MinMatch {
			if s == len(src) {
				break
			}
			// 0 bytes.
			continue
		}
		// 2 byte offset
		if s >= len(src)-2 {
			if debug {
				fmt.Printf("s (%d) >= len(src)-2 (%d)", s, len(src)-2)
			}
			return nil, 0, ErrCorrupt
		}
		offset := binary.LittleEndian.Uint16(src[s:])
		s += 2
		if offset == 0 {
			if debug {
				fmt.Printf("error: offset 0, ml: %d, len(src)-s: %d\n", ml, len(src)-s)
			}
			return nil, 0, ErrCorrupt
		}
		// The match may not reference data before the start of the output.
		if int(offset) > uncompressed {
			if debug {
				fmt.Printf("error: offset (%d)> uncompressed (%d)\n", offset, uncompressed)
			}
			return nil, 0, ErrCorrupt
		}

		// A match length nibble of 15 means the length is extended.
		if ml == lz4MinMatch+15 {
			for {
				if s >= len(src) {
					if debug {
						fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
					}
					return nil, 0, ErrCorrupt
				}
				val := src[s]
				s++
				ml += int(val)
				if val != 255 {
					// More input must follow a match.
					if s >= len(src) {
						if debug {
							fmt.Printf("error reading ml: s (%d) >= len(src) (%d)\n", s, len(src))
						}
						return nil, 0, ErrCorrupt
					}
					break
				}
			}
		}
		if debug {
			fmt.Printf("emit copy, length: %d, offset: %d\n", ml, offset)
		}
		length := ml
		// d += emitCopyNoRepeat(dst[d:], int(offset), ml)
		// Each pass writes at most 3 bytes and is preceded by a space check.
		for length > 0 {
			if d >= dLimit {
				return nil, 0, ErrDstTooSmall
			}

			// Offset no more than 2 bytes.
			if length > 64 {
				// Emit a length 64 copy, encoded as 3 bytes.
				dst[d+2] = uint8(offset >> 8)
				dst[d+1] = uint8(offset)
				dst[d+0] = 63<<2 | tagCopy2
				length -= 64
				d += 3
				continue
			}
			if length >= 12 || offset >= 2048 || length < 4 {
				// Emit the remaining copy, encoded as 3 bytes.
				dst[d+2] = uint8(offset >> 8)
				dst[d+1] = uint8(offset)
				dst[d+0] = uint8(length-1)<<2 | tagCopy2
				d += 3
				break
			}
			// Emit the remaining copy, encoded as 2 bytes.
			dst[d+1] = uint8(offset)
			dst[d+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
			d += 2
			break
		}
		uncompressed += ml
		if d > dLimit {
			return nil, 0, ErrDstTooSmall
		}
	}

	return dst[:d], uncompressed, nil
}
diff --git a/vendor/github.com/klauspost/compress/s2/reader.go b/vendor/github.com/klauspost/compress/s2/reader.go
new file mode 100644
index 0000000..2f01a39
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/reader.go
@@ -0,0 +1,1062 @@
1// Copyright 2011 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019+ Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6package s2
7
8import (
9 "errors"
10 "fmt"
11 "io"
12 "io/ioutil"
13 "math"
14 "runtime"
15 "sync"
16)
17
// ErrCantSeek is the error type reported when a stream cannot be seeked.
type ErrCantSeek struct {
	// Reason is the explanation appended to the error message.
	Reason string
}

// Error implements the error interface and includes the Reason in the message.
func (e ErrCantSeek) Error() string {
	msg := fmt.Sprintf("s2: Can't seek because %s", e.Reason)
	return msg
}
27
28// NewReader returns a new Reader that decompresses from r, using the framing
29// format described at
30// https://github.com/google/snappy/blob/master/framing_format.txt with S2 changes.
31func NewReader(r io.Reader, opts ...ReaderOption) *Reader {
32 nr := Reader{
33 r: r,
34 maxBlock: maxBlockSize,
35 }
36 for _, opt := range opts {
37 if err := opt(&nr); err != nil {
38 nr.err = err
39 return &nr
40 }
41 }
42 nr.maxBufSize = MaxEncodedLen(nr.maxBlock) + checksumSize
43 if nr.lazyBuf > 0 {
44 nr.buf = make([]byte, MaxEncodedLen(nr.lazyBuf)+checksumSize)
45 } else {
46 nr.buf = make([]byte, MaxEncodedLen(defaultBlockSize)+checksumSize)
47 }
48 nr.readHeader = nr.ignoreStreamID
49 nr.paramsOK = true
50 return &nr
51}
52
53// ReaderOption is an option for creating a decoder.
54type ReaderOption func(*Reader) error
55
56// ReaderMaxBlockSize allows to control allocations if the stream
57// has been compressed with a smaller WriterBlockSize, or with the default 1MB.
58// Blocks must be this size or smaller to decompress,
59// otherwise the decoder will return ErrUnsupported.
60//
61// For streams compressed with Snappy this can safely be set to 64KB (64 << 10).
62//
63// Default is the maximum limit of 4MB.
64func ReaderMaxBlockSize(blockSize int) ReaderOption {
65 return func(r *Reader) error {
66 if blockSize > maxBlockSize || blockSize <= 0 {
67 return errors.New("s2: block size too large. Must be <= 4MB and > 0")
68 }
69 if r.lazyBuf == 0 && blockSize < defaultBlockSize {
70 r.lazyBuf = blockSize
71 }
72 r.maxBlock = blockSize
73 return nil
74 }
75}
76
77// ReaderAllocBlock allows to control upfront stream allocations
78// and not allocate for frames bigger than this initially.
79// If frames bigger than this is seen a bigger buffer will be allocated.
80//
81// Default is 1MB, which is default output size.
82func ReaderAllocBlock(blockSize int) ReaderOption {
83 return func(r *Reader) error {
84 if blockSize > maxBlockSize || blockSize < 1024 {
85 return errors.New("s2: invalid ReaderAllocBlock. Must be <= 4MB and >= 1024")
86 }
87 r.lazyBuf = blockSize
88 return nil
89 }
90}
91
92// ReaderIgnoreStreamIdentifier will make the reader skip the expected
93// stream identifier at the beginning of the stream.
94// This can be used when serving a stream that has been forwarded to a specific point.
95func ReaderIgnoreStreamIdentifier() ReaderOption {
96 return func(r *Reader) error {
97 r.ignoreStreamID = true
98 return nil
99 }
100}
101
102// ReaderSkippableCB will register a callback for chuncks with the specified ID.
103// ID must be a Reserved skippable chunks ID, 0x80-0xfd (inclusive).
104// For each chunk with the ID, the callback is called with the content.
105// Any returned non-nil error will abort decompression.
106// Only one callback per ID is supported, latest sent will be used.
107func ReaderSkippableCB(id uint8, fn func(r io.Reader) error) ReaderOption {
108 return func(r *Reader) error {
109 if id < 0x80 || id > 0xfd {
110 return fmt.Errorf("ReaderSkippableCB: Invalid id provided, must be 0x80-0xfd (inclusive)")
111 }
112 r.skippableCB[id] = fn
113 return nil
114 }
115}
116
117// ReaderIgnoreCRC will make the reader skip CRC calculation and checks.
118func ReaderIgnoreCRC() ReaderOption {
119 return func(r *Reader) error {
120 r.ignoreCRC = true
121 return nil
122 }
123}
124
125// Reader is an io.Reader that can read Snappy-compressed bytes.
126type Reader struct {
127 r io.Reader
128 err error
129 decoded []byte
130 buf []byte
131 skippableCB [0x80]func(r io.Reader) error
132 blockStart int64 // Uncompressed offset at start of current.
133 index *Index
134
135 // decoded[i:j] contains decoded bytes that have not yet been passed on.
136 i, j int
137 // maximum block size allowed.
138 maxBlock int
139 // maximum expected buffer size.
140 maxBufSize int
141 // alloc a buffer this size if > 0.
142 lazyBuf int
143 readHeader bool
144 paramsOK bool
145 snappyFrame bool
146 ignoreStreamID bool
147 ignoreCRC bool
148}
149
150// GetBufferCapacity returns the capacity of the internal buffer.
151// This might be useful to know when reusing the same reader in combination
152// with the lazy buffer option.
153func (r *Reader) GetBufferCapacity() int {
154 return cap(r.buf)
155}
156
157// ensureBufferSize will ensure that the buffer can take at least n bytes.
158// If false is returned the buffer exceeds maximum allowed size.
159func (r *Reader) ensureBufferSize(n int) bool {
160 if n > r.maxBufSize {
161 r.err = ErrCorrupt
162 return false
163 }
164 if cap(r.buf) >= n {
165 return true
166 }
167 // Realloc buffer.
168 r.buf = make([]byte, n)
169 return true
170}
171
172// Reset discards any buffered data, resets all state, and switches the Snappy
173// reader to read from r. This permits reusing a Reader rather than allocating
174// a new one.
175func (r *Reader) Reset(reader io.Reader) {
176 if !r.paramsOK {
177 return
178 }
179 r.index = nil
180 r.r = reader
181 r.err = nil
182 r.i = 0
183 r.j = 0
184 r.blockStart = 0
185 r.readHeader = r.ignoreStreamID
186}
187
188func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
189 if _, r.err = io.ReadFull(r.r, p); r.err != nil {
190 if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
191 r.err = ErrCorrupt
192 }
193 return false
194 }
195 return true
196}
197
198// skippable will skip n bytes.
199// If the supplied reader supports seeking that is used.
200// tmp is used as a temporary buffer for reading.
201// The supplied slice does not need to be the size of the read.
202func (r *Reader) skippable(tmp []byte, n int, allowEOF bool, id uint8) (ok bool) {
203 if id < 0x80 {
204 r.err = fmt.Errorf("interbal error: skippable id < 0x80")
205 return false
206 }
207 if fn := r.skippableCB[id-0x80]; fn != nil {
208 rd := io.LimitReader(r.r, int64(n))
209 r.err = fn(rd)
210 if r.err != nil {
211 return false
212 }
213 _, r.err = io.CopyBuffer(ioutil.Discard, rd, tmp)
214 return r.err == nil
215 }
216 if rs, ok := r.r.(io.ReadSeeker); ok {
217 _, err := rs.Seek(int64(n), io.SeekCurrent)
218 if err == nil {
219 return true
220 }
221 if err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
222 r.err = ErrCorrupt
223 return false
224 }
225 }
226 for n > 0 {
227 if n < len(tmp) {
228 tmp = tmp[:n]
229 }
230 if _, r.err = io.ReadFull(r.r, tmp); r.err != nil {
231 if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
232 r.err = ErrCorrupt
233 }
234 return false
235 }
236 n -= len(tmp)
237 }
238 return true
239}
240
241// Read satisfies the io.Reader interface.
242func (r *Reader) Read(p []byte) (int, error) {
243 if r.err != nil {
244 return 0, r.err
245 }
246 for {
247 if r.i < r.j {
248 n := copy(p, r.decoded[r.i:r.j])
249 r.i += n
250 return n, nil
251 }
252 if !r.readFull(r.buf[:4], true) {
253 return 0, r.err
254 }
255 chunkType := r.buf[0]
256 if !r.readHeader {
257 if chunkType != chunkTypeStreamIdentifier {
258 r.err = ErrCorrupt
259 return 0, r.err
260 }
261 r.readHeader = true
262 }
263 chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
264
265 // The chunk types are specified at
266 // https://github.com/google/snappy/blob/master/framing_format.txt
267 switch chunkType {
268 case chunkTypeCompressedData:
269 r.blockStart += int64(r.j)
270 // Section 4.2. Compressed data (chunk type 0x00).
271 if chunkLen < checksumSize {
272 r.err = ErrCorrupt
273 return 0, r.err
274 }
275 if !r.ensureBufferSize(chunkLen) {
276 if r.err == nil {
277 r.err = ErrUnsupported
278 }
279 return 0, r.err
280 }
281 buf := r.buf[:chunkLen]
282 if !r.readFull(buf, false) {
283 return 0, r.err
284 }
285 checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
286 buf = buf[checksumSize:]
287
288 n, err := DecodedLen(buf)
289 if err != nil {
290 r.err = err
291 return 0, r.err
292 }
293 if r.snappyFrame && n > maxSnappyBlockSize {
294 r.err = ErrCorrupt
295 return 0, r.err
296 }
297
298 if n > len(r.decoded) {
299 if n > r.maxBlock {
300 r.err = ErrCorrupt
301 return 0, r.err
302 }
303 r.decoded = make([]byte, n)
304 }
305 if _, err := Decode(r.decoded, buf); err != nil {
306 r.err = err
307 return 0, r.err
308 }
309 if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {
310 r.err = ErrCRC
311 return 0, r.err
312 }
313 r.i, r.j = 0, n
314 continue
315
316 case chunkTypeUncompressedData:
317 r.blockStart += int64(r.j)
318 // Section 4.3. Uncompressed data (chunk type 0x01).
319 if chunkLen < checksumSize {
320 r.err = ErrCorrupt
321 return 0, r.err
322 }
323 if !r.ensureBufferSize(chunkLen) {
324 if r.err == nil {
325 r.err = ErrUnsupported
326 }
327 return 0, r.err
328 }
329 buf := r.buf[:checksumSize]
330 if !r.readFull(buf, false) {
331 return 0, r.err
332 }
333 checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
334 // Read directly into r.decoded instead of via r.buf.
335 n := chunkLen - checksumSize
336 if r.snappyFrame && n > maxSnappyBlockSize {
337 r.err = ErrCorrupt
338 return 0, r.err
339 }
340 if n > len(r.decoded) {
341 if n > r.maxBlock {
342 r.err = ErrCorrupt
343 return 0, r.err
344 }
345 r.decoded = make([]byte, n)
346 }
347 if !r.readFull(r.decoded[:n], false) {
348 return 0, r.err
349 }
350 if !r.ignoreCRC && crc(r.decoded[:n]) != checksum {
351 r.err = ErrCRC
352 return 0, r.err
353 }
354 r.i, r.j = 0, n
355 continue
356
357 case chunkTypeStreamIdentifier:
358 // Section 4.1. Stream identifier (chunk type 0xff).
359 if chunkLen != len(magicBody) {
360 r.err = ErrCorrupt
361 return 0, r.err
362 }
363 if !r.readFull(r.buf[:len(magicBody)], false) {
364 return 0, r.err
365 }
366 if string(r.buf[:len(magicBody)]) != magicBody {
367 if string(r.buf[:len(magicBody)]) != magicBodySnappy {
368 r.err = ErrCorrupt
369 return 0, r.err
370 } else {
371 r.snappyFrame = true
372 }
373 } else {
374 r.snappyFrame = false
375 }
376 continue
377 }
378
379 if chunkType <= 0x7f {
380 // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
381 // fmt.Printf("ERR chunktype: 0x%x\n", chunkType)
382 r.err = ErrUnsupported
383 return 0, r.err
384 }
385 // Section 4.4 Padding (chunk type 0xfe).
386 // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
387 if chunkLen > maxChunkSize {
388 // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen)
389 r.err = ErrUnsupported
390 return 0, r.err
391 }
392
393 // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen)
394 if !r.skippable(r.buf, chunkLen, false, chunkType) {
395 return 0, r.err
396 }
397 }
398}
399
400// DecodeConcurrent will decode the full stream to w.
401// This function should not be combined with reading, seeking or other operations.
402// Up to 'concurrent' goroutines will be used.
403// If <= 0, runtime.NumCPU will be used.
404// On success the number of bytes decompressed nil and is returned.
405// This is mainly intended for bigger streams.
406func (r *Reader) DecodeConcurrent(w io.Writer, concurrent int) (written int64, err error) {
407 if r.i > 0 || r.j > 0 || r.blockStart > 0 {
408 return 0, errors.New("DecodeConcurrent called after ")
409 }
410 if concurrent <= 0 {
411 concurrent = runtime.NumCPU()
412 }
413
414 // Write to output
415 var errMu sync.Mutex
416 var aErr error
417 setErr := func(e error) (ok bool) {
418 errMu.Lock()
419 defer errMu.Unlock()
420 if e == nil {
421 return aErr == nil
422 }
423 if aErr == nil {
424 aErr = e
425 }
426 return false
427 }
428 hasErr := func() (ok bool) {
429 errMu.Lock()
430 v := aErr != nil
431 errMu.Unlock()
432 return v
433 }
434
435 var aWritten int64
436 toRead := make(chan []byte, concurrent)
437 writtenBlocks := make(chan []byte, concurrent)
438 queue := make(chan chan []byte, concurrent)
439 reUse := make(chan chan []byte, concurrent)
440 for i := 0; i < concurrent; i++ {
441 toRead <- make([]byte, 0, r.maxBufSize)
442 writtenBlocks <- make([]byte, 0, r.maxBufSize)
443 reUse <- make(chan []byte, 1)
444 }
445 // Writer
446 var wg sync.WaitGroup
447 wg.Add(1)
448 go func() {
449 defer wg.Done()
450 for toWrite := range queue {
451 entry := <-toWrite
452 reUse <- toWrite
453 if hasErr() {
454 writtenBlocks <- entry
455 continue
456 }
457 n, err := w.Write(entry)
458 want := len(entry)
459 writtenBlocks <- entry
460 if err != nil {
461 setErr(err)
462 continue
463 }
464 if n != want {
465 setErr(io.ErrShortWrite)
466 continue
467 }
468 aWritten += int64(n)
469 }
470 }()
471
472 // Reader
473 defer func() {
474 close(queue)
475 if r.err != nil {
476 err = r.err
477 setErr(r.err)
478 }
479 wg.Wait()
480 if err == nil {
481 err = aErr
482 }
483 written = aWritten
484 }()
485
486 for !hasErr() {
487 if !r.readFull(r.buf[:4], true) {
488 if r.err == io.EOF {
489 r.err = nil
490 }
491 return 0, r.err
492 }
493 chunkType := r.buf[0]
494 if !r.readHeader {
495 if chunkType != chunkTypeStreamIdentifier {
496 r.err = ErrCorrupt
497 return 0, r.err
498 }
499 r.readHeader = true
500 }
501 chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
502
503 // The chunk types are specified at
504 // https://github.com/google/snappy/blob/master/framing_format.txt
505 switch chunkType {
506 case chunkTypeCompressedData:
507 r.blockStart += int64(r.j)
508 // Section 4.2. Compressed data (chunk type 0x00).
509 if chunkLen < checksumSize {
510 r.err = ErrCorrupt
511 return 0, r.err
512 }
513 if chunkLen > r.maxBufSize {
514 r.err = ErrCorrupt
515 return 0, r.err
516 }
517 orgBuf := <-toRead
518 buf := orgBuf[:chunkLen]
519
520 if !r.readFull(buf, false) {
521 return 0, r.err
522 }
523
524 checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
525 buf = buf[checksumSize:]
526
527 n, err := DecodedLen(buf)
528 if err != nil {
529 r.err = err
530 return 0, r.err
531 }
532 if r.snappyFrame && n > maxSnappyBlockSize {
533 r.err = ErrCorrupt
534 return 0, r.err
535 }
536
537 if n > r.maxBlock {
538 r.err = ErrCorrupt
539 return 0, r.err
540 }
541 wg.Add(1)
542
543 decoded := <-writtenBlocks
544 entry := <-reUse
545 queue <- entry
546 go func() {
547 defer wg.Done()
548 decoded = decoded[:n]
549 _, err := Decode(decoded, buf)
550 toRead <- orgBuf
551 if err != nil {
552 writtenBlocks <- decoded
553 setErr(err)
554 return
555 }
556 if !r.ignoreCRC && crc(decoded) != checksum {
557 writtenBlocks <- decoded
558 setErr(ErrCRC)
559 return
560 }
561 entry <- decoded
562 }()
563 continue
564
565 case chunkTypeUncompressedData:
566
567 // Section 4.3. Uncompressed data (chunk type 0x01).
568 if chunkLen < checksumSize {
569 r.err = ErrCorrupt
570 return 0, r.err
571 }
572 if chunkLen > r.maxBufSize {
573 r.err = ErrCorrupt
574 return 0, r.err
575 }
576 // Grab write buffer
577 orgBuf := <-writtenBlocks
578 buf := orgBuf[:checksumSize]
579 if !r.readFull(buf, false) {
580 return 0, r.err
581 }
582 checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
583 // Read content.
584 n := chunkLen - checksumSize
585
586 if r.snappyFrame && n > maxSnappyBlockSize {
587 r.err = ErrCorrupt
588 return 0, r.err
589 }
590 if n > r.maxBlock {
591 r.err = ErrCorrupt
592 return 0, r.err
593 }
594 // Read uncompressed
595 buf = orgBuf[:n]
596 if !r.readFull(buf, false) {
597 return 0, r.err
598 }
599
600 if !r.ignoreCRC && crc(buf) != checksum {
601 r.err = ErrCRC
602 return 0, r.err
603 }
604 entry := <-reUse
605 queue <- entry
606 entry <- buf
607 continue
608
609 case chunkTypeStreamIdentifier:
610 // Section 4.1. Stream identifier (chunk type 0xff).
611 if chunkLen != len(magicBody) {
612 r.err = ErrCorrupt
613 return 0, r.err
614 }
615 if !r.readFull(r.buf[:len(magicBody)], false) {
616 return 0, r.err
617 }
618 if string(r.buf[:len(magicBody)]) != magicBody {
619 if string(r.buf[:len(magicBody)]) != magicBodySnappy {
620 r.err = ErrCorrupt
621 return 0, r.err
622 } else {
623 r.snappyFrame = true
624 }
625 } else {
626 r.snappyFrame = false
627 }
628 continue
629 }
630
631 if chunkType <= 0x7f {
632 // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
633 // fmt.Printf("ERR chunktype: 0x%x\n", chunkType)
634 r.err = ErrUnsupported
635 return 0, r.err
636 }
637 // Section 4.4 Padding (chunk type 0xfe).
638 // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
639 if chunkLen > maxChunkSize {
640 // fmt.Printf("ERR chunkLen: 0x%x\n", chunkLen)
641 r.err = ErrUnsupported
642 return 0, r.err
643 }
644
645 // fmt.Printf("skippable: ID: 0x%x, len: 0x%x\n", chunkType, chunkLen)
646 if !r.skippable(r.buf, chunkLen, false, chunkType) {
647 return 0, r.err
648 }
649 }
650 return 0, r.err
651}
652
653// Skip will skip n bytes forward in the decompressed output.
654// For larger skips this consumes less CPU and is faster than reading output and discarding it.
655// CRC is not checked on skipped blocks.
656// io.ErrUnexpectedEOF is returned if the stream ends before all bytes have been skipped.
657// If a decoding error is encountered subsequent calls to Read will also fail.
658func (r *Reader) Skip(n int64) error {
659 if n < 0 {
660 return errors.New("attempted negative skip")
661 }
662 if r.err != nil {
663 return r.err
664 }
665
666 for n > 0 {
667 if r.i < r.j {
668 // Skip in buffer.
669 // decoded[i:j] contains decoded bytes that have not yet been passed on.
670 left := int64(r.j - r.i)
671 if left >= n {
672 tmp := int64(r.i) + n
673 if tmp > math.MaxInt32 {
674 return errors.New("s2: internal overflow in skip")
675 }
676 r.i = int(tmp)
677 return nil
678 }
679 n -= int64(r.j - r.i)
680 r.i = r.j
681 }
682
683 // Buffer empty; read blocks until we have content.
684 if !r.readFull(r.buf[:4], true) {
685 if r.err == io.EOF {
686 r.err = io.ErrUnexpectedEOF
687 }
688 return r.err
689 }
690 chunkType := r.buf[0]
691 if !r.readHeader {
692 if chunkType != chunkTypeStreamIdentifier {
693 r.err = ErrCorrupt
694 return r.err
695 }
696 r.readHeader = true
697 }
698 chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
699
700 // The chunk types are specified at
701 // https://github.com/google/snappy/blob/master/framing_format.txt
702 switch chunkType {
703 case chunkTypeCompressedData:
704 r.blockStart += int64(r.j)
705 // Section 4.2. Compressed data (chunk type 0x00).
706 if chunkLen < checksumSize {
707 r.err = ErrCorrupt
708 return r.err
709 }
710 if !r.ensureBufferSize(chunkLen) {
711 if r.err == nil {
712 r.err = ErrUnsupported
713 }
714 return r.err
715 }
716 buf := r.buf[:chunkLen]
717 if !r.readFull(buf, false) {
718 return r.err
719 }
720 checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
721 buf = buf[checksumSize:]
722
723 dLen, err := DecodedLen(buf)
724 if err != nil {
725 r.err = err
726 return r.err
727 }
728 if dLen > r.maxBlock {
729 r.err = ErrCorrupt
730 return r.err
731 }
732 // Check if destination is within this block
733 if int64(dLen) > n {
734 if len(r.decoded) < dLen {
735 r.decoded = make([]byte, dLen)
736 }
737 if _, err := Decode(r.decoded, buf); err != nil {
738 r.err = err
739 return r.err
740 }
741 if crc(r.decoded[:dLen]) != checksum {
742 r.err = ErrCorrupt
743 return r.err
744 }
745 } else {
746 // Skip block completely
747 n -= int64(dLen)
748 r.blockStart += int64(dLen)
749 dLen = 0
750 }
751 r.i, r.j = 0, dLen
752 continue
753 case chunkTypeUncompressedData:
754 r.blockStart += int64(r.j)
755 // Section 4.3. Uncompressed data (chunk type 0x01).
756 if chunkLen < checksumSize {
757 r.err = ErrCorrupt
758 return r.err
759 }
760 if !r.ensureBufferSize(chunkLen) {
761 if r.err != nil {
762 r.err = ErrUnsupported
763 }
764 return r.err
765 }
766 buf := r.buf[:checksumSize]
767 if !r.readFull(buf, false) {
768 return r.err
769 }
770 checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
771 // Read directly into r.decoded instead of via r.buf.
772 n2 := chunkLen - checksumSize
773 if n2 > len(r.decoded) {
774 if n2 > r.maxBlock {
775 r.err = ErrCorrupt
776 return r.err
777 }
778 r.decoded = make([]byte, n2)
779 }
780 if !r.readFull(r.decoded[:n2], false) {
781 return r.err
782 }
783 if int64(n2) < n {
784 if crc(r.decoded[:n2]) != checksum {
785 r.err = ErrCorrupt
786 return r.err
787 }
788 }
789 r.i, r.j = 0, n2
790 continue
791 case chunkTypeStreamIdentifier:
792 // Section 4.1. Stream identifier (chunk type 0xff).
793 if chunkLen != len(magicBody) {
794 r.err = ErrCorrupt
795 return r.err
796 }
797 if !r.readFull(r.buf[:len(magicBody)], false) {
798 return r.err
799 }
800 if string(r.buf[:len(magicBody)]) != magicBody {
801 if string(r.buf[:len(magicBody)]) != magicBodySnappy {
802 r.err = ErrCorrupt
803 return r.err
804 }
805 }
806
807 continue
808 }
809
810 if chunkType <= 0x7f {
811 // Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
812 r.err = ErrUnsupported
813 return r.err
814 }
815 if chunkLen > maxChunkSize {
816 r.err = ErrUnsupported
817 return r.err
818 }
819 // Section 4.4 Padding (chunk type 0xfe).
820 // Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
821 if !r.skippable(r.buf, chunkLen, false, chunkType) {
822 return r.err
823 }
824 }
825 return nil
826}
827
828// ReadSeeker provides random or forward seeking in compressed content.
829// See Reader.ReadSeeker
830type ReadSeeker struct {
831 *Reader
832 readAtMu sync.Mutex
833}
834
835// ReadSeeker will return an io.ReadSeeker and io.ReaderAt
836// compatible version of the reader.
837// If 'random' is specified the returned io.Seeker can be used for
838// random seeking, otherwise only forward seeking is supported.
839// Enabling random seeking requires the original input to support
840// the io.Seeker interface.
841// A custom index can be specified which will be used if supplied.
842// When using a custom index, it will not be read from the input stream.
843// The ReadAt position will affect regular reads and the current position of Seek.
844// So using Read after ReadAt will continue from where the ReadAt stopped.
845// No functions should be used concurrently.
846// The returned ReadSeeker contains a shallow reference to the existing Reader,
847// meaning changes performed to one is reflected in the other.
848func (r *Reader) ReadSeeker(random bool, index []byte) (*ReadSeeker, error) {
849 // Read index if provided.
850 if len(index) != 0 {
851 if r.index == nil {
852 r.index = &Index{}
853 }
854 if _, err := r.index.Load(index); err != nil {
855 return nil, ErrCantSeek{Reason: "loading index returned: " + err.Error()}
856 }
857 }
858
859 // Check if input is seekable
860 rs, ok := r.r.(io.ReadSeeker)
861 if !ok {
862 if !random {
863 return &ReadSeeker{Reader: r}, nil
864 }
865 return nil, ErrCantSeek{Reason: "input stream isn't seekable"}
866 }
867
868 if r.index != nil {
869 // Seekable and index, ok...
870 return &ReadSeeker{Reader: r}, nil
871 }
872
873 // Load from stream.
874 r.index = &Index{}
875
876 // Read current position.
877 pos, err := rs.Seek(0, io.SeekCurrent)
878 if err != nil {
879 return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()}
880 }
881 err = r.index.LoadStream(rs)
882 if err != nil {
883 if err == ErrUnsupported {
884 // If we don't require random seeking, reset input and return.
885 if !random {
886 _, err = rs.Seek(pos, io.SeekStart)
887 if err != nil {
888 return nil, ErrCantSeek{Reason: "resetting stream returned: " + err.Error()}
889 }
890 r.index = nil
891 return &ReadSeeker{Reader: r}, nil
892 }
893 return nil, ErrCantSeek{Reason: "input stream does not contain an index"}
894 }
895 return nil, ErrCantSeek{Reason: "reading index returned: " + err.Error()}
896 }
897
898 // reset position.
899 _, err = rs.Seek(pos, io.SeekStart)
900 if err != nil {
901 return nil, ErrCantSeek{Reason: "seeking input returned: " + err.Error()}
902 }
903 return &ReadSeeker{Reader: r}, nil
904}
905
906// Seek allows seeking in compressed data.
907func (r *ReadSeeker) Seek(offset int64, whence int) (int64, error) {
908 if r.err != nil {
909 if !errors.Is(r.err, io.EOF) {
910 return 0, r.err
911 }
912 // Reset on EOF
913 r.err = nil
914 }
915
916 // Calculate absolute offset.
917 absOffset := offset
918
919 switch whence {
920 case io.SeekStart:
921 case io.SeekCurrent:
922 absOffset = r.blockStart + int64(r.i) + offset
923 case io.SeekEnd:
924 if r.index == nil {
925 return 0, ErrUnsupported
926 }
927 absOffset = r.index.TotalUncompressed + offset
928 default:
929 r.err = ErrUnsupported
930 return 0, r.err
931 }
932
933 if absOffset < 0 {
934 return 0, errors.New("seek before start of file")
935 }
936
937 if !r.readHeader {
938 // Make sure we read the header.
939 _, r.err = r.Read([]byte{})
940 if r.err != nil {
941 return 0, r.err
942 }
943 }
944
945 // If we are inside current block no need to seek.
946 // This includes no offset changes.
947 if absOffset >= r.blockStart && absOffset < r.blockStart+int64(r.j) {
948 r.i = int(absOffset - r.blockStart)
949 return r.blockStart + int64(r.i), nil
950 }
951
952 rs, ok := r.r.(io.ReadSeeker)
953 if r.index == nil || !ok {
954 currOffset := r.blockStart + int64(r.i)
955 if absOffset >= currOffset {
956 err := r.Skip(absOffset - currOffset)
957 return r.blockStart + int64(r.i), err
958 }
959 return 0, ErrUnsupported
960 }
961
962 // We can seek and we have an index.
963 c, u, err := r.index.Find(absOffset)
964 if err != nil {
965 return r.blockStart + int64(r.i), err
966 }
967
968 // Seek to next block
969 _, err = rs.Seek(c, io.SeekStart)
970 if err != nil {
971 return 0, err
972 }
973
974 r.i = r.j // Remove rest of current block.
975 r.blockStart = u - int64(r.j) // Adjust current block start for accounting.
976 if u < absOffset {
977 // Forward inside block
978 return absOffset, r.Skip(absOffset - u)
979 }
980 if u > absOffset {
981 return 0, fmt.Errorf("s2 seek: (internal error) u (%d) > absOffset (%d)", u, absOffset)
982 }
983 return absOffset, nil
984}
985
986// ReadAt reads len(p) bytes into p starting at offset off in the
987// underlying input source. It returns the number of bytes
988// read (0 <= n <= len(p)) and any error encountered.
989//
990// When ReadAt returns n < len(p), it returns a non-nil error
991// explaining why more bytes were not returned. In this respect,
992// ReadAt is stricter than Read.
993//
994// Even if ReadAt returns n < len(p), it may use all of p as scratch
995// space during the call. If some data is available but not len(p) bytes,
996// ReadAt blocks until either all the data is available or an error occurs.
997// In this respect ReadAt is different from Read.
998//
999// If the n = len(p) bytes returned by ReadAt are at the end of the
1000// input source, ReadAt may return either err == EOF or err == nil.
1001//
1002// If ReadAt is reading from an input source with a seek offset,
1003// ReadAt should not affect nor be affected by the underlying
1004// seek offset.
1005//
1006// Clients of ReadAt can execute parallel ReadAt calls on the
1007// same input source. This is however not recommended.
1008func (r *ReadSeeker) ReadAt(p []byte, offset int64) (int, error) {
1009 r.readAtMu.Lock()
1010 defer r.readAtMu.Unlock()
1011 _, err := r.Seek(offset, io.SeekStart)
1012 if err != nil {
1013 return 0, err
1014 }
1015 n := 0
1016 for n < len(p) {
1017 n2, err := r.Read(p[n:])
1018 if err != nil {
1019 // This will include io.EOF
1020 return n + n2, err
1021 }
1022 n += n2
1023 }
1024 return n, nil
1025}
1026
1027// ReadByte satisfies the io.ByteReader interface.
1028func (r *Reader) ReadByte() (byte, error) {
1029 if r.err != nil {
1030 return 0, r.err
1031 }
1032 if r.i < r.j {
1033 c := r.decoded[r.i]
1034 r.i++
1035 return c, nil
1036 }
1037 var tmp [1]byte
1038 for i := 0; i < 10; i++ {
1039 n, err := r.Read(tmp[:])
1040 if err != nil {
1041 return 0, err
1042 }
1043 if n == 1 {
1044 return tmp[0], nil
1045 }
1046 }
1047 return 0, io.ErrNoProgress
1048}
1049
1050// SkippableCB will register a callback for chunks with the specified ID.
1051// ID must be a Reserved skippable chunks ID, 0x80-0xfe (inclusive).
1052// For each chunk with the ID, the callback is called with the content.
1053// Any returned non-nil error will abort decompression.
1054// Only one callback per ID is supported, latest sent will be used.
1055// Sending a nil function will disable previous callbacks.
1056func (r *Reader) SkippableCB(id uint8, fn func(r io.Reader) error) error {
1057 if id < 0x80 || id > chunkTypePadding {
1058 return fmt.Errorf("ReaderSkippableCB: Invalid id provided, must be 0x80-0xfe (inclusive)")
1059 }
1060 r.skippableCB[id] = fn
1061 return nil
1062}
diff --git a/vendor/github.com/klauspost/compress/s2/s2.go b/vendor/github.com/klauspost/compress/s2/s2.go
new file mode 100644
index 0000000..dae3f73
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/s2.go
@@ -0,0 +1,143 @@
1// Copyright 2011 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019 Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// Package s2 implements the S2 compression format.
7//
8// S2 is an extension of Snappy. Similar to Snappy S2 is aimed for high throughput,
9// which is why it features concurrent compression for bigger payloads.
10//
11// Decoding is compatible with Snappy compressed content,
12// but content compressed with S2 cannot be decompressed by Snappy.
13//
14// For more information on Snappy/S2 differences see README in: https://github.com/klauspost/compress/tree/master/s2
15//
16// There are actually two S2 formats: block and stream. They are related,
17// but different: trying to decompress block-compressed data as a S2 stream
18// will fail, and vice versa. The block format is the Decode and Encode
19// functions and the stream format is the Reader and Writer types.
20//
21// A "better" compression option is available. This will trade some compression
22// speed for a better compression ratio.
23//
24// The block format, the more common case, is used when the complete size (the
25// number of bytes) of the original data is known upfront, at the time
26// compression starts. The stream format, also known as the framing format, is
27// for when that isn't always true.
28//
29// Blocks do not offer much data protection, so it is up to you to
30// add data validation of decompressed blocks.
31//
32// Streams perform CRC validation of the decompressed data.
33// Stream compression will also be performed on multiple CPU cores concurrently
34// significantly improving throughput.
35package s2
36
37import (
38 "bytes"
39 "hash/crc32"
40)
41
42/*
43Each encoded block begins with the varint-encoded length of the decoded data,
44followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
45first byte of each chunk is broken into its 2 least and 6 most significant bits
46called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
47Zero means a literal tag. All other values mean a copy tag.
48
49For literal tags:
50 - If m < 60, the next 1 + m bytes are literal bytes.
51 - Otherwise, let n be the little-endian unsigned integer denoted by the next
52 m - 59 bytes. The next 1 + n bytes after that are literal bytes.
53
54For copy tags, length bytes are copied from offset bytes ago, in the style of
55Lempel-Ziv compression algorithms. In particular:
56 - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
57 The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
58 of the offset. The next byte is bits 0-7 of the offset.
59 - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
60 The length is 1 + m. The offset is the little-endian unsigned integer
61 denoted by the next 2 bytes.
62 - For l == 3, the offset ranges in [0, 1<<32) and the length in
63 [1, 65). The length is 1 + m. The offset is the little-endian unsigned
64 integer denoted by the next 4 bytes.
65*/
// Chunk tags. The tag is the two least significant bits of the first
// byte of every chunk in an encoded block.
const (
	tagLiteral = iota // literal bytes follow
	tagCopy1          // copy; one more byte completes the offset
	tagCopy2          // copy; offset is the next 2 bytes
	tagCopy4          // copy; offset is the next 4 bytes
)
72
const (
	// checksumSize is the number of checksum bytes stored in front of
	// the body of a data chunk.
	checksumSize = 4

	// chunkHeaderSize is the number of bytes in a chunk header
	// (one type byte followed by a 3 byte length).
	chunkHeaderSize = 4

	// obufHeaderLen is the space reserved at the start of an output
	// buffer for the chunk header and the checksum.
	obufHeaderLen = checksumSize + chunkHeaderSize

	// Stream identifier bodies for S2 and Snappy streams.
	magicBody       = "S2sTwO"
	magicBodySnappy = "sNaPpY"

	// Complete stream identifier chunks, header included.
	magicChunk       = "\xff\x06\x00\x00" + magicBody
	magicChunkSnappy = "\xff\x06\x00\x00" + magicBodySnappy

	// maxBlockSize is the maximum size of the input to encodeBlock.
	//
	// For the framing format (Writer type instead of Encode function),
	// this is the maximum uncompressed size of a block.
	maxBlockSize = 4 << 20

	// minBlockSize is the minimum size of block setting when creating a writer.
	minBlockSize = 4 << 10

	// defaultBlockSize is the block size used when none is configured.
	defaultBlockSize = 1 << 20

	// maxSnappyBlockSize is the maximum snappy block size.
	maxSnappyBlockSize = 1 << 16

	// skippableFrameHeader is the size of the header of a skippable frame.
	skippableFrameHeader = 4

	// maxChunkSize is the largest length that fits in the 3 byte
	// chunk length field.
	maxChunkSize = 1<<24 - 1 // 16777215
)
101
// Chunk types, stored in the first byte of every chunk of a stream.
const (
	// chunkTypeCompressedData marks a chunk holding a compressed block.
	chunkTypeCompressedData = 0x00

	// chunkTypeUncompressedData marks a chunk holding a block stored as is.
	chunkTypeUncompressedData = 0x01

	// ChunkTypeIndex is the chunk type used for a stream index.
	ChunkTypeIndex = 0x99

	// chunkTypePadding marks a padding chunk. It is also the highest
	// ID accepted for skippable blocks.
	chunkTypePadding = 0xfe

	// chunkTypeStreamIdentifier marks the stream identifier chunk.
	chunkTypeStreamIdentifier = 0xff
)
109
// crcTable is the CRC-32 table for the Castagnoli polynomial.
var crcTable = crc32.MakeTable(crc32.Castagnoli)

// crc returns the masked CRC-32C of b, as specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// The raw checksum is rotated right by 15 bits and a constant is added.
func crc(b []byte) uint32 {
	sum := crc32.Checksum(b, crcTable)
	rotated := sum>>15 | sum<<17
	return rotated + 0xa282ead8
}
118
// literalExtraSize returns the number of bytes needed, besides the
// literal bytes themselves, to encode n literals.
// n should be >= 0 and <= math.MaxUint32.
//
// Zero literals need nothing, fewer than 60 need only the tag byte,
// and larger counts need the tag byte plus 1 to 4 length bytes.
func literalExtraSize(n int64) int64 {
	if n == 0 {
		return 0
	}
	if n < 60 {
		return 1
	}
	// Tag byte plus one length byte, growing by one byte each time n
	// no longer fits, up to four length bytes.
	size := int64(2)
	for limit := int64(1 << 8); size < 5 && n >= limit; limit <<= 8 {
		size++
	}
	return size
}
138
// byter is implemented by values that can expose their content as a
// byte slice.
type byter interface {
	Bytes() []byte
}

// Compile-time check that *bytes.Buffer implements byter.
var _ byter = (*bytes.Buffer)(nil)
diff --git a/vendor/github.com/klauspost/compress/s2/writer.go b/vendor/github.com/klauspost/compress/s2/writer.go
new file mode 100644
index 0000000..089cd36
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2/writer.go
@@ -0,0 +1,1020 @@
1// Copyright 2011 The Snappy-Go Authors. All rights reserved.
2// Copyright (c) 2019+ Klaus Post. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6package s2
7
8import (
9 "crypto/rand"
10 "encoding/binary"
11 "errors"
12 "fmt"
13 "io"
14 "runtime"
15 "sync"
16)
17
// Compression levels of a Writer.
const (
	levelUncompressed = 1 // store blocks without compressing
	levelFast         = 2 // default level
	levelBetter       = 3
	levelBest         = 4
)
24
25// NewWriter returns a new Writer that compresses to w, using the
26// framing format described at
27// https://github.com/google/snappy/blob/master/framing_format.txt
28//
29// Users must call Close to guarantee all data has been forwarded to
30// the underlying io.Writer and that resources are released.
31// They may also call Flush zero or more times before calling Close.
32func NewWriter(w io.Writer, opts ...WriterOption) *Writer {
33 w2 := Writer{
34 blockSize: defaultBlockSize,
35 concurrency: runtime.GOMAXPROCS(0),
36 randSrc: rand.Reader,
37 level: levelFast,
38 }
39 for _, opt := range opts {
40 if err := opt(&w2); err != nil {
41 w2.errState = err
42 return &w2
43 }
44 }
45 w2.obufLen = obufHeaderLen + MaxEncodedLen(w2.blockSize)
46 w2.paramsOK = true
47 w2.ibuf = make([]byte, 0, w2.blockSize)
48 w2.buffers.New = func() interface{} {
49 return make([]byte, w2.obufLen)
50 }
51 w2.Reset(w)
52 return &w2
53}
54
55// Writer is an io.Writer that can write Snappy-compressed bytes.
56type Writer struct {
57 errMu sync.Mutex
58 errState error
59
60 // ibuf is a buffer for the incoming (uncompressed) bytes.
61 ibuf []byte
62
63 blockSize int
64 obufLen int
65 concurrency int
66 written int64
67 uncompWritten int64 // Bytes sent to compression
68 output chan chan result
69 buffers sync.Pool
70 pad int
71
72 writer io.Writer
73 randSrc io.Reader
74 writerWg sync.WaitGroup
75 index Index
76 customEnc func(dst, src []byte) int
77
78 // wroteStreamHeader is whether we have written the stream header.
79 wroteStreamHeader bool
80 paramsOK bool
81 snappy bool
82 flushOnWrite bool
83 appendIndex bool
84 level uint8
85}
86
// result is a piece of output queued for writing.
type result struct {
	// b holds the bytes to write.
	b []byte

	// startOffset is the uncompressed start offset.
	startOffset int64
}
92
93// err returns the previously set error.
94// If no error has been set it is set to err if not nil.
95func (w *Writer) err(err error) error {
96 w.errMu.Lock()
97 errSet := w.errState
98 if errSet == nil && err != nil {
99 w.errState = err
100 errSet = err
101 }
102 w.errMu.Unlock()
103 return errSet
104}
105
106// Reset discards the writer's state and switches the Snappy writer to write to w.
107// This permits reusing a Writer rather than allocating a new one.
108func (w *Writer) Reset(writer io.Writer) {
109 if !w.paramsOK {
110 return
111 }
112 // Close previous writer, if any.
113 if w.output != nil {
114 close(w.output)
115 w.writerWg.Wait()
116 w.output = nil
117 }
118 w.errState = nil
119 w.ibuf = w.ibuf[:0]
120 w.wroteStreamHeader = false
121 w.written = 0
122 w.writer = writer
123 w.uncompWritten = 0
124 w.index.reset(w.blockSize)
125
126 // If we didn't get a writer, stop here.
127 if writer == nil {
128 return
129 }
130 // If no concurrency requested, don't spin up writer goroutine.
131 if w.concurrency == 1 {
132 return
133 }
134
135 toWrite := make(chan chan result, w.concurrency)
136 w.output = toWrite
137 w.writerWg.Add(1)
138
139 // Start a writer goroutine that will write all output in order.
140 go func() {
141 defer w.writerWg.Done()
142
143 // Get a queued write.
144 for write := range toWrite {
145 // Wait for the data to be available.
146 input := <-write
147 in := input.b
148 if len(in) > 0 {
149 if w.err(nil) == nil {
150 // Don't expose data from previous buffers.
151 toWrite := in[:len(in):len(in)]
152 // Write to output.
153 n, err := writer.Write(toWrite)
154 if err == nil && n != len(toWrite) {
155 err = io.ErrShortBuffer
156 }
157 _ = w.err(err)
158 w.err(w.index.add(w.written, input.startOffset))
159 w.written += int64(n)
160 }
161 }
162 if cap(in) >= w.obufLen {
163 w.buffers.Put(in)
164 }
165 // close the incoming write request.
166 // This can be used for synchronizing flushes.
167 close(write)
168 }
169 }()
170}
171
172// Write satisfies the io.Writer interface.
173func (w *Writer) Write(p []byte) (nRet int, errRet error) {
174 if err := w.err(nil); err != nil {
175 return 0, err
176 }
177 if w.flushOnWrite {
178 return w.write(p)
179 }
180 // If we exceed the input buffer size, start writing
181 for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err(nil) == nil {
182 var n int
183 if len(w.ibuf) == 0 {
184 // Large write, empty buffer.
185 // Write directly from p to avoid copy.
186 n, _ = w.write(p)
187 } else {
188 n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
189 w.ibuf = w.ibuf[:len(w.ibuf)+n]
190 w.write(w.ibuf)
191 w.ibuf = w.ibuf[:0]
192 }
193 nRet += n
194 p = p[n:]
195 }
196 if err := w.err(nil); err != nil {
197 return nRet, err
198 }
199 // p should always be able to fit into w.ibuf now.
200 n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
201 w.ibuf = w.ibuf[:len(w.ibuf)+n]
202 nRet += n
203 return nRet, nil
204}
205
206// ReadFrom implements the io.ReaderFrom interface.
207// Using this is typically more efficient since it avoids a memory copy.
208// ReadFrom reads data from r until EOF or error.
209// The return value n is the number of bytes read.
210// Any error except io.EOF encountered during the read is also returned.
211func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) {
212 if err := w.err(nil); err != nil {
213 return 0, err
214 }
215 if len(w.ibuf) > 0 {
216 err := w.Flush()
217 if err != nil {
218 return 0, err
219 }
220 }
221 if br, ok := r.(byter); ok {
222 buf := br.Bytes()
223 if err := w.EncodeBuffer(buf); err != nil {
224 return 0, err
225 }
226 return int64(len(buf)), w.Flush()
227 }
228 for {
229 inbuf := w.buffers.Get().([]byte)[:w.blockSize+obufHeaderLen]
230 n2, err := io.ReadFull(r, inbuf[obufHeaderLen:])
231 if err != nil {
232 if err == io.ErrUnexpectedEOF {
233 err = io.EOF
234 }
235 if err != io.EOF {
236 return n, w.err(err)
237 }
238 }
239 if n2 == 0 {
240 break
241 }
242 n += int64(n2)
243 err2 := w.writeFull(inbuf[:n2+obufHeaderLen])
244 if w.err(err2) != nil {
245 break
246 }
247
248 if err != nil {
249 // We got EOF and wrote everything
250 break
251 }
252 }
253
254 return n, w.err(nil)
255}
256
257// AddSkippableBlock will add a skippable block to the stream.
258// The ID must be 0x80-0xfe (inclusive).
259// Length of the skippable block must be <= 16777215 bytes.
260func (w *Writer) AddSkippableBlock(id uint8, data []byte) (err error) {
261 if err := w.err(nil); err != nil {
262 return err
263 }
264 if len(data) == 0 {
265 return nil
266 }
267 if id < 0x80 || id > chunkTypePadding {
268 return fmt.Errorf("invalid skippable block id %x", id)
269 }
270 if len(data) > maxChunkSize {
271 return fmt.Errorf("skippable block excessed maximum size")
272 }
273 var header [4]byte
274 chunkLen := 4 + len(data)
275 header[0] = id
276 header[1] = uint8(chunkLen >> 0)
277 header[2] = uint8(chunkLen >> 8)
278 header[3] = uint8(chunkLen >> 16)
279 if w.concurrency == 1 {
280 write := func(b []byte) error {
281 n, err := w.writer.Write(b)
282 if err = w.err(err); err != nil {
283 return err
284 }
285 if n != len(data) {
286 return w.err(io.ErrShortWrite)
287 }
288 w.written += int64(n)
289 return w.err(nil)
290 }
291 if !w.wroteStreamHeader {
292 w.wroteStreamHeader = true
293 if w.snappy {
294 if err := write([]byte(magicChunkSnappy)); err != nil {
295 return err
296 }
297 } else {
298 if err := write([]byte(magicChunk)); err != nil {
299 return err
300 }
301 }
302 }
303 if err := write(header[:]); err != nil {
304 return err
305 }
306 if err := write(data); err != nil {
307 return err
308 }
309 }
310
311 // Create output...
312 if !w.wroteStreamHeader {
313 w.wroteStreamHeader = true
314 hWriter := make(chan result)
315 w.output <- hWriter
316 if w.snappy {
317 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
318 } else {
319 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
320 }
321 }
322
323 // Copy input.
324 inbuf := w.buffers.Get().([]byte)[:4]
325 copy(inbuf, header[:])
326 inbuf = append(inbuf, data...)
327
328 output := make(chan result, 1)
329 // Queue output.
330 w.output <- output
331 output <- result{startOffset: w.uncompWritten, b: inbuf}
332
333 return nil
334}
335
336// EncodeBuffer will add a buffer to the stream.
337// This is the fastest way to encode a stream,
338// but the input buffer cannot be written to by the caller
339// until Flush or Close has been called when concurrency != 1.
340//
341// If you cannot control that, use the regular Write function.
342//
343// Note that input is not buffered.
344// This means that each write will result in discrete blocks being created.
345// For buffered writes, use the regular Write function.
346func (w *Writer) EncodeBuffer(buf []byte) (err error) {
347 if err := w.err(nil); err != nil {
348 return err
349 }
350
351 if w.flushOnWrite {
352 _, err := w.write(buf)
353 return err
354 }
355 // Flush queued data first.
356 if len(w.ibuf) > 0 {
357 err := w.Flush()
358 if err != nil {
359 return err
360 }
361 }
362 if w.concurrency == 1 {
363 _, err := w.writeSync(buf)
364 return err
365 }
366
367 // Spawn goroutine and write block to output channel.
368 if !w.wroteStreamHeader {
369 w.wroteStreamHeader = true
370 hWriter := make(chan result)
371 w.output <- hWriter
372 if w.snappy {
373 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
374 } else {
375 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
376 }
377 }
378
379 for len(buf) > 0 {
380 // Cut input.
381 uncompressed := buf
382 if len(uncompressed) > w.blockSize {
383 uncompressed = uncompressed[:w.blockSize]
384 }
385 buf = buf[len(uncompressed):]
386 // Get an output buffer.
387 obuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen]
388 output := make(chan result)
389 // Queue output now, so we keep order.
390 w.output <- output
391 res := result{
392 startOffset: w.uncompWritten,
393 }
394 w.uncompWritten += int64(len(uncompressed))
395 go func() {
396 checksum := crc(uncompressed)
397
398 // Set to uncompressed.
399 chunkType := uint8(chunkTypeUncompressedData)
400 chunkLen := 4 + len(uncompressed)
401
402 // Attempt compressing.
403 n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
404 n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
405
406 // Check if we should use this, or store as uncompressed instead.
407 if n2 > 0 {
408 chunkType = uint8(chunkTypeCompressedData)
409 chunkLen = 4 + n + n2
410 obuf = obuf[:obufHeaderLen+n+n2]
411 } else {
412 // copy uncompressed
413 copy(obuf[obufHeaderLen:], uncompressed)
414 }
415
416 // Fill in the per-chunk header that comes before the body.
417 obuf[0] = chunkType
418 obuf[1] = uint8(chunkLen >> 0)
419 obuf[2] = uint8(chunkLen >> 8)
420 obuf[3] = uint8(chunkLen >> 16)
421 obuf[4] = uint8(checksum >> 0)
422 obuf[5] = uint8(checksum >> 8)
423 obuf[6] = uint8(checksum >> 16)
424 obuf[7] = uint8(checksum >> 24)
425
426 // Queue final output.
427 res.b = obuf
428 output <- res
429 }()
430 }
431 return nil
432}
433
434func (w *Writer) encodeBlock(obuf, uncompressed []byte) int {
435 if w.customEnc != nil {
436 if ret := w.customEnc(obuf, uncompressed); ret >= 0 {
437 return ret
438 }
439 }
440 if w.snappy {
441 switch w.level {
442 case levelFast:
443 return encodeBlockSnappy(obuf, uncompressed)
444 case levelBetter:
445 return encodeBlockBetterSnappy(obuf, uncompressed)
446 case levelBest:
447 return encodeBlockBestSnappy(obuf, uncompressed)
448 }
449 return 0
450 }
451 switch w.level {
452 case levelFast:
453 return encodeBlock(obuf, uncompressed)
454 case levelBetter:
455 return encodeBlockBetter(obuf, uncompressed)
456 case levelBest:
457 return encodeBlockBest(obuf, uncompressed, nil)
458 }
459 return 0
460}
461
462func (w *Writer) write(p []byte) (nRet int, errRet error) {
463 if err := w.err(nil); err != nil {
464 return 0, err
465 }
466 if w.concurrency == 1 {
467 return w.writeSync(p)
468 }
469
470 // Spawn goroutine and write block to output channel.
471 for len(p) > 0 {
472 if !w.wroteStreamHeader {
473 w.wroteStreamHeader = true
474 hWriter := make(chan result)
475 w.output <- hWriter
476 if w.snappy {
477 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
478 } else {
479 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
480 }
481 }
482
483 var uncompressed []byte
484 if len(p) > w.blockSize {
485 uncompressed, p = p[:w.blockSize], p[w.blockSize:]
486 } else {
487 uncompressed, p = p, nil
488 }
489
490 // Copy input.
491 // If the block is incompressible, this is used for the result.
492 inbuf := w.buffers.Get().([]byte)[:len(uncompressed)+obufHeaderLen]
493 obuf := w.buffers.Get().([]byte)[:w.obufLen]
494 copy(inbuf[obufHeaderLen:], uncompressed)
495 uncompressed = inbuf[obufHeaderLen:]
496
497 output := make(chan result)
498 // Queue output now, so we keep order.
499 w.output <- output
500 res := result{
501 startOffset: w.uncompWritten,
502 }
503 w.uncompWritten += int64(len(uncompressed))
504
505 go func() {
506 checksum := crc(uncompressed)
507
508 // Set to uncompressed.
509 chunkType := uint8(chunkTypeUncompressedData)
510 chunkLen := 4 + len(uncompressed)
511
512 // Attempt compressing.
513 n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
514 n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
515
516 // Check if we should use this, or store as uncompressed instead.
517 if n2 > 0 {
518 chunkType = uint8(chunkTypeCompressedData)
519 chunkLen = 4 + n + n2
520 obuf = obuf[:obufHeaderLen+n+n2]
521 } else {
522 // Use input as output.
523 obuf, inbuf = inbuf, obuf
524 }
525
526 // Fill in the per-chunk header that comes before the body.
527 obuf[0] = chunkType
528 obuf[1] = uint8(chunkLen >> 0)
529 obuf[2] = uint8(chunkLen >> 8)
530 obuf[3] = uint8(chunkLen >> 16)
531 obuf[4] = uint8(checksum >> 0)
532 obuf[5] = uint8(checksum >> 8)
533 obuf[6] = uint8(checksum >> 16)
534 obuf[7] = uint8(checksum >> 24)
535
536 // Queue final output.
537 res.b = obuf
538 output <- res
539
540 // Put unused buffer back in pool.
541 w.buffers.Put(inbuf)
542 }()
543 nRet += len(uncompressed)
544 }
545 return nRet, nil
546}
547
548// writeFull is a special version of write that will always write the full buffer.
549// Data to be compressed should start at offset obufHeaderLen and fill the remainder of the buffer.
550// The data will be written as a single block.
551// The caller is not allowed to use inbuf after this function has been called.
552func (w *Writer) writeFull(inbuf []byte) (errRet error) {
553 if err := w.err(nil); err != nil {
554 return err
555 }
556
557 if w.concurrency == 1 {
558 _, err := w.writeSync(inbuf[obufHeaderLen:])
559 return err
560 }
561
562 // Spawn goroutine and write block to output channel.
563 if !w.wroteStreamHeader {
564 w.wroteStreamHeader = true
565 hWriter := make(chan result)
566 w.output <- hWriter
567 if w.snappy {
568 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
569 } else {
570 hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
571 }
572 }
573
574 // Get an output buffer.
575 obuf := w.buffers.Get().([]byte)[:w.obufLen]
576 uncompressed := inbuf[obufHeaderLen:]
577
578 output := make(chan result)
579 // Queue output now, so we keep order.
580 w.output <- output
581 res := result{
582 startOffset: w.uncompWritten,
583 }
584 w.uncompWritten += int64(len(uncompressed))
585
586 go func() {
587 checksum := crc(uncompressed)
588
589 // Set to uncompressed.
590 chunkType := uint8(chunkTypeUncompressedData)
591 chunkLen := 4 + len(uncompressed)
592
593 // Attempt compressing.
594 n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
595 n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
596
597 // Check if we should use this, or store as uncompressed instead.
598 if n2 > 0 {
599 chunkType = uint8(chunkTypeCompressedData)
600 chunkLen = 4 + n + n2
601 obuf = obuf[:obufHeaderLen+n+n2]
602 } else {
603 // Use input as output.
604 obuf, inbuf = inbuf, obuf
605 }
606
607 // Fill in the per-chunk header that comes before the body.
608 obuf[0] = chunkType
609 obuf[1] = uint8(chunkLen >> 0)
610 obuf[2] = uint8(chunkLen >> 8)
611 obuf[3] = uint8(chunkLen >> 16)
612 obuf[4] = uint8(checksum >> 0)
613 obuf[5] = uint8(checksum >> 8)
614 obuf[6] = uint8(checksum >> 16)
615 obuf[7] = uint8(checksum >> 24)
616
617 // Queue final output.
618 res.b = obuf
619 output <- res
620
621 // Put unused buffer back in pool.
622 w.buffers.Put(inbuf)
623 }()
624 return nil
625}
626
627func (w *Writer) writeSync(p []byte) (nRet int, errRet error) {
628 if err := w.err(nil); err != nil {
629 return 0, err
630 }
631 if !w.wroteStreamHeader {
632 w.wroteStreamHeader = true
633 var n int
634 var err error
635 if w.snappy {
636 n, err = w.writer.Write([]byte(magicChunkSnappy))
637 } else {
638 n, err = w.writer.Write([]byte(magicChunk))
639 }
640 if err != nil {
641 return 0, w.err(err)
642 }
643 if n != len(magicChunk) {
644 return 0, w.err(io.ErrShortWrite)
645 }
646 w.written += int64(n)
647 }
648
649 for len(p) > 0 {
650 var uncompressed []byte
651 if len(p) > w.blockSize {
652 uncompressed, p = p[:w.blockSize], p[w.blockSize:]
653 } else {
654 uncompressed, p = p, nil
655 }
656
657 obuf := w.buffers.Get().([]byte)[:w.obufLen]
658 checksum := crc(uncompressed)
659
660 // Set to uncompressed.
661 chunkType := uint8(chunkTypeUncompressedData)
662 chunkLen := 4 + len(uncompressed)
663
664 // Attempt compressing.
665 n := binary.PutUvarint(obuf[obufHeaderLen:], uint64(len(uncompressed)))
666 n2 := w.encodeBlock(obuf[obufHeaderLen+n:], uncompressed)
667
668 if n2 > 0 {
669 chunkType = uint8(chunkTypeCompressedData)
670 chunkLen = 4 + n + n2
671 obuf = obuf[:obufHeaderLen+n+n2]
672 } else {
673 obuf = obuf[:8]
674 }
675
676 // Fill in the per-chunk header that comes before the body.
677 obuf[0] = chunkType
678 obuf[1] = uint8(chunkLen >> 0)
679 obuf[2] = uint8(chunkLen >> 8)
680 obuf[3] = uint8(chunkLen >> 16)
681 obuf[4] = uint8(checksum >> 0)
682 obuf[5] = uint8(checksum >> 8)
683 obuf[6] = uint8(checksum >> 16)
684 obuf[7] = uint8(checksum >> 24)
685
686 n, err := w.writer.Write(obuf)
687 if err != nil {
688 return 0, w.err(err)
689 }
690 if n != len(obuf) {
691 return 0, w.err(io.ErrShortWrite)
692 }
693 w.err(w.index.add(w.written, w.uncompWritten))
694 w.written += int64(n)
695 w.uncompWritten += int64(len(uncompressed))
696
697 if chunkType == chunkTypeUncompressedData {
698 // Write uncompressed data.
699 n, err := w.writer.Write(uncompressed)
700 if err != nil {
701 return 0, w.err(err)
702 }
703 if n != len(uncompressed) {
704 return 0, w.err(io.ErrShortWrite)
705 }
706 w.written += int64(n)
707 }
708 w.buffers.Put(obuf)
709 // Queue final output.
710 nRet += len(uncompressed)
711 }
712 return nRet, nil
713}
714
715// Flush flushes the Writer to its underlying io.Writer.
716// This does not apply padding.
717func (w *Writer) Flush() error {
718 if err := w.err(nil); err != nil {
719 return err
720 }
721
722 // Queue any data still in input buffer.
723 if len(w.ibuf) != 0 {
724 if !w.wroteStreamHeader {
725 _, err := w.writeSync(w.ibuf)
726 w.ibuf = w.ibuf[:0]
727 return w.err(err)
728 } else {
729 _, err := w.write(w.ibuf)
730 w.ibuf = w.ibuf[:0]
731 err = w.err(err)
732 if err != nil {
733 return err
734 }
735 }
736 }
737 if w.output == nil {
738 return w.err(nil)
739 }
740
741 // Send empty buffer
742 res := make(chan result)
743 w.output <- res
744 // Block until this has been picked up.
745 res <- result{b: nil, startOffset: w.uncompWritten}
746 // When it is closed, we have flushed.
747 <-res
748 return w.err(nil)
749}
750
751// Close calls Flush and then closes the Writer.
752// Calling Close multiple times is ok,
753// but calling CloseIndex after this will make it not return the index.
754func (w *Writer) Close() error {
755 _, err := w.closeIndex(w.appendIndex)
756 return err
757}
758
759// CloseIndex calls Close and returns an index on first call.
760// This is not required if you are only adding index to a stream.
761func (w *Writer) CloseIndex() ([]byte, error) {
762 return w.closeIndex(true)
763}
764
765func (w *Writer) closeIndex(idx bool) ([]byte, error) {
766 err := w.Flush()
767 if w.output != nil {
768 close(w.output)
769 w.writerWg.Wait()
770 w.output = nil
771 }
772
773 var index []byte
774 if w.err(err) == nil && w.writer != nil {
775 // Create index.
776 if idx {
777 compSize := int64(-1)
778 if w.pad <= 1 {
779 compSize = w.written
780 }
781 index = w.index.appendTo(w.ibuf[:0], w.uncompWritten, compSize)
782 // Count as written for padding.
783 if w.appendIndex {
784 w.written += int64(len(index))
785 }
786 }
787
788 if w.pad > 1 {
789 tmp := w.ibuf[:0]
790 if len(index) > 0 {
791 // Allocate another buffer.
792 tmp = w.buffers.Get().([]byte)[:0]
793 defer w.buffers.Put(tmp)
794 }
795 add := calcSkippableFrame(w.written, int64(w.pad))
796 frame, err := skippableFrame(tmp, add, w.randSrc)
797 if err = w.err(err); err != nil {
798 return nil, err
799 }
800 n, err2 := w.writer.Write(frame)
801 if err2 == nil && n != len(frame) {
802 err2 = io.ErrShortWrite
803 }
804 _ = w.err(err2)
805 }
806 if len(index) > 0 && w.appendIndex {
807 n, err2 := w.writer.Write(index)
808 if err2 == nil && n != len(index) {
809 err2 = io.ErrShortWrite
810 }
811 _ = w.err(err2)
812 }
813 }
814 err = w.err(errClosed)
815 if err == errClosed {
816 return index, nil
817 }
818 return nil, err
819}
820
821// calcSkippableFrame will return a total size to be added for written
822// to be divisible by multiple.
823// The value will always be > skippableFrameHeader.
824// The function will panic if written < 0 or wantMultiple <= 0.
825func calcSkippableFrame(written, wantMultiple int64) int {
826 if wantMultiple <= 0 {
827 panic("wantMultiple <= 0")
828 }
829 if written < 0 {
830 panic("written < 0")
831 }
832 leftOver := written % wantMultiple
833 if leftOver == 0 {
834 return 0
835 }
836 toAdd := wantMultiple - leftOver
837 for toAdd < skippableFrameHeader {
838 toAdd += wantMultiple
839 }
840 return int(toAdd)
841}
842
843// skippableFrame will add a skippable frame with a total size of bytes.
844// total should be >= skippableFrameHeader and < maxBlockSize + skippableFrameHeader
845func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) {
846 if total == 0 {
847 return dst, nil
848 }
849 if total < skippableFrameHeader {
850 return dst, fmt.Errorf("s2: requested skippable frame (%d) < 4", total)
851 }
852 if int64(total) >= maxBlockSize+skippableFrameHeader {
853 return dst, fmt.Errorf("s2: requested skippable frame (%d) >= max 1<<24", total)
854 }
855 // Chunk type 0xfe "Section 4.4 Padding (chunk type 0xfe)"
856 dst = append(dst, chunkTypePadding)
857 f := uint32(total - skippableFrameHeader)
858 // Add chunk length.
859 dst = append(dst, uint8(f), uint8(f>>8), uint8(f>>16))
860 // Add data
861 start := len(dst)
862 dst = append(dst, make([]byte, f)...)
863 _, err := io.ReadFull(r, dst[start:])
864 return dst, err
865}
866
867var errClosed = errors.New("s2: Writer is closed")
868
869// WriterOption is an option for creating a encoder.
870type WriterOption func(*Writer) error
871
872// WriterConcurrency will set the concurrency,
873// meaning the maximum number of decoders to run concurrently.
874// The value supplied must be at least 1.
875// By default this will be set to GOMAXPROCS.
876func WriterConcurrency(n int) WriterOption {
877 return func(w *Writer) error {
878 if n <= 0 {
879 return errors.New("concurrency must be at least 1")
880 }
881 w.concurrency = n
882 return nil
883 }
884}
885
886// WriterAddIndex will append an index to the end of a stream
887// when it is closed.
888func WriterAddIndex() WriterOption {
889 return func(w *Writer) error {
890 w.appendIndex = true
891 return nil
892 }
893}
894
895// WriterBetterCompression will enable better compression.
896// EncodeBetter compresses better than Encode but typically with a
897// 10-40% speed decrease on both compression and decompression.
898func WriterBetterCompression() WriterOption {
899 return func(w *Writer) error {
900 w.level = levelBetter
901 return nil
902 }
903}
904
905// WriterBestCompression will enable better compression.
906// EncodeBetter compresses better than Encode but typically with a
907// big speed decrease on compression.
908func WriterBestCompression() WriterOption {
909 return func(w *Writer) error {
910 w.level = levelBest
911 return nil
912 }
913}
914
915// WriterUncompressed will bypass compression.
916// The stream will be written as uncompressed blocks only.
917// If concurrency is > 1 CRC and output will still be done async.
918func WriterUncompressed() WriterOption {
919 return func(w *Writer) error {
920 w.level = levelUncompressed
921 return nil
922 }
923}
924
925// WriterBlockSize allows to override the default block size.
926// Blocks will be this size or smaller.
927// Minimum size is 4KB and and maximum size is 4MB.
928//
929// Bigger blocks may give bigger throughput on systems with many cores,
930// and will increase compression slightly, but it will limit the possible
931// concurrency for smaller payloads for both encoding and decoding.
932// Default block size is 1MB.
933//
934// When writing Snappy compatible output using WriterSnappyCompat,
935// the maximum block size is 64KB.
936func WriterBlockSize(n int) WriterOption {
937 return func(w *Writer) error {
938 if w.snappy && n > maxSnappyBlockSize || n < minBlockSize {
939 return errors.New("s2: block size too large. Must be <= 64K and >=4KB on for snappy compatible output")
940 }
941 if n > maxBlockSize || n < minBlockSize {
942 return errors.New("s2: block size too large. Must be <= 4MB and >=4KB")
943 }
944 w.blockSize = n
945 return nil
946 }
947}
948
949// WriterPadding will add padding to all output so the size will be a multiple of n.
950// This can be used to obfuscate the exact output size or make blocks of a certain size.
951// The contents will be a skippable frame, so it will be invisible by the decoder.
952// n must be > 0 and <= 4MB.
953// The padded area will be filled with data from crypto/rand.Reader.
954// The padding will be applied whenever Close is called on the writer.
955func WriterPadding(n int) WriterOption {
956 return func(w *Writer) error {
957 if n <= 0 {
958 return fmt.Errorf("s2: padding must be at least 1")
959 }
960 // No need to waste our time.
961 if n == 1 {
962 w.pad = 0
963 }
964 if n > maxBlockSize {
965 return fmt.Errorf("s2: padding must less than 4MB")
966 }
967 w.pad = n
968 return nil
969 }
970}
971
972// WriterPaddingSrc will get random data for padding from the supplied source.
973// By default crypto/rand is used.
974func WriterPaddingSrc(reader io.Reader) WriterOption {
975 return func(w *Writer) error {
976 w.randSrc = reader
977 return nil
978 }
979}
980
981// WriterSnappyCompat will write snappy compatible output.
982// The output can be decompressed using either snappy or s2.
983// If block size is more than 64KB it is set to that.
984func WriterSnappyCompat() WriterOption {
985 return func(w *Writer) error {
986 w.snappy = true
987 if w.blockSize > 64<<10 {
988 // We choose 8 bytes less than 64K, since that will make literal emits slightly more effective.
989 // And allows us to skip some size checks.
990 w.blockSize = (64 << 10) - 8
991 }
992 return nil
993 }
994}
995
996// WriterFlushOnWrite will compress blocks on each call to the Write function.
997//
998// This is quite inefficient as blocks size will depend on the write size.
999//
1000// Use WriterConcurrency(1) to also make sure that output is flushed.
1001// When Write calls return, otherwise they will be written when compression is done.
1002func WriterFlushOnWrite() WriterOption {
1003 return func(w *Writer) error {
1004 w.flushOnWrite = true
1005 return nil
1006 }
1007}
1008
1009// WriterCustomEncoder allows to override the encoder for blocks on the stream.
1010// The function must compress 'src' into 'dst' and return the bytes used in dst as an integer.
1011// Block size (initial varint) should not be added by the encoder.
1012// Returning value 0 indicates the block could not be compressed.
1013// Returning a negative value indicates that compression should be attempted.
1014// The function should expect to be called concurrently.
1015func WriterCustomEncoder(fn func(dst, src []byte) int) WriterOption {
1016 return func(w *Writer) error {
1017 w.customEnc = fn
1018 return nil
1019 }
1020}
diff --git a/vendor/github.com/klauspost/cpuid/v2/.gitignore b/vendor/github.com/klauspost/cpuid/v2/.gitignore
new file mode 100644
index 0000000..daf913b
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/.gitignore
@@ -0,0 +1,24 @@
1# Compiled Object files, Static and Dynamic libs (Shared Objects)
2*.o
3*.a
4*.so
5
6# Folders
7_obj
8_test
9
10# Architecture specific extensions/prefixes
11*.[568vq]
12[568vq].out
13
14*.cgo1.go
15*.cgo2.c
16_cgo_defun.c
17_cgo_gotypes.go
18_cgo_export.*
19
20_testmain.go
21
22*.exe
23*.test
24*.prof
diff --git a/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml b/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml
new file mode 100644
index 0000000..944cc00
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/.goreleaser.yml
@@ -0,0 +1,74 @@
1# This is an example goreleaser.yaml file with some sane defaults.
2# Make sure to check the documentation at http://goreleaser.com
3
4builds:
5 -
6 id: "cpuid"
7 binary: cpuid
8 main: ./cmd/cpuid/main.go
9 env:
10 - CGO_ENABLED=0
11 flags:
12 - -ldflags=-s -w
13 goos:
14 - aix
15 - linux
16 - freebsd
17 - netbsd
18 - windows
19 - darwin
20 goarch:
21 - 386
22 - amd64
23 - arm64
24 goarm:
25 - 7
26
27archives:
28 -
29 id: cpuid
30 name_template: "cpuid-{{ .Os }}_{{ .Arch }}_{{ .Version }}"
31 replacements:
32 aix: AIX
33 darwin: OSX
34 linux: Linux
35 windows: Windows
36 386: i386
37 amd64: x86_64
38 freebsd: FreeBSD
39 netbsd: NetBSD
40 format_overrides:
41 - goos: windows
42 format: zip
43 files:
44 - LICENSE
45checksum:
46 name_template: 'checksums.txt'
47snapshot:
48 name_template: "{{ .Tag }}-next"
49changelog:
50 sort: asc
51 filters:
52 exclude:
53 - '^doc:'
54 - '^docs:'
55 - '^test:'
56 - '^tests:'
57 - '^Update\sREADME.md'
58
59nfpms:
60 -
61 file_name_template: "cpuid_package_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
62 vendor: Klaus Post
63 homepage: https://github.com/klauspost/cpuid
64 maintainer: Klaus Post <[email protected]>
65 description: CPUID Tool
66 license: BSD 3-Clause
67 formats:
68 - deb
69 - rpm
70 replacements:
71 darwin: Darwin
72 linux: Linux
73 freebsd: FreeBSD
74 amd64: x86_64
diff --git a/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt b/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt
new file mode 100644
index 0000000..452d28e
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/CONTRIBUTING.txt
@@ -0,0 +1,35 @@
1Developer Certificate of Origin
2Version 1.1
3
4Copyright (C) 2015- Klaus Post & Contributors.
5Email: [email protected]
6
7Everyone is permitted to copy and distribute verbatim copies of this
8license document, but changing it is not allowed.
9
10
11Developer's Certificate of Origin 1.1
12
13By making a contribution to this project, I certify that:
14
15(a) The contribution was created in whole or in part by me and I
16 have the right to submit it under the open source license
17 indicated in the file; or
18
19(b) The contribution is based upon previous work that, to the best
20 of my knowledge, is covered under an appropriate open source
21 license and I have the right under that license to submit that
22 work with modifications, whether created in whole or in part
23 by me, under the same open source license (unless I am
24 permitted to submit under a different license), as indicated
25 in the file; or
26
27(c) The contribution was provided directly to me by some other
28 person who certified (a), (b) or (c) and I have not modified
29 it.
30
31(d) I understand and agree that this project and the contribution
32 are public and that a record of the contribution (including all
33 personal information I submit with it, including my sign-off) is
34 maintained indefinitely and may be redistributed consistent with
35 this project or the open source license(s) involved.
diff --git a/vendor/github.com/klauspost/cpuid/v2/LICENSE b/vendor/github.com/klauspost/cpuid/v2/LICENSE
new file mode 100644
index 0000000..5cec7ee
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/LICENSE
@@ -0,0 +1,22 @@
1The MIT License (MIT)
2
3Copyright (c) 2015 Klaus Post
4
5Permission is hereby granted, free of charge, to any person obtaining a copy
6of this software and associated documentation files (the "Software"), to deal
7in the Software without restriction, including without limitation the rights
8to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9copies of the Software, and to permit persons to whom the Software is
10furnished to do so, subject to the following conditions:
11
12The above copyright notice and this permission notice shall be included in all
13copies or substantial portions of the Software.
14
15THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21SOFTWARE.
22
diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md
new file mode 100644
index 0000000..30f8d29
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/README.md
@@ -0,0 +1,497 @@
1# cpuid
2Package cpuid provides information about the CPU running the current program.
3
4CPU features are detected on startup, and kept for fast access through the life of the application.
5Currently x86 / x64 (AMD64/i386) and ARM (ARM64) are supported, and no external C (cgo) code is used, which should make the library very easy to use.
6
7You can access the CPU information by accessing the shared CPU variable of the cpuid library.
8
9Package home: https://github.com/klauspost/cpuid
10
11[![PkgGoDev](https://pkg.go.dev/badge/github.com/klauspost/cpuid)](https://pkg.go.dev/github.com/klauspost/cpuid/v2)
12[![Go](https://github.com/klauspost/cpuid/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/cpuid/actions/workflows/go.yml)
13
14## installing
15
16`go get -u github.com/klauspost/cpuid/v2` using modules.
17Drop `v2` for others.
18
19Installing binary:
20
21`go install github.com/klauspost/cpuid/v2/cmd/cpuid@latest`
22
23Or download binaries from release page: https://github.com/klauspost/cpuid/releases
24
25### Homebrew
26
27For macOS/Linux users, you can install via [brew](https://brew.sh/)
28
29```sh
30$ brew install cpuid
31```
32
33## example
34
35```Go
36package main
37
38import (
39 "fmt"
40 "strings"
41
42 . "github.com/klauspost/cpuid/v2"
43)
44
45func main() {
46 // Print basic CPU information:
47 fmt.Println("Name:", CPU.BrandName)
48 fmt.Println("PhysicalCores:", CPU.PhysicalCores)
49 fmt.Println("ThreadsPerCore:", CPU.ThreadsPerCore)
50 fmt.Println("LogicalCores:", CPU.LogicalCores)
51 fmt.Println("Family", CPU.Family, "Model:", CPU.Model, "Vendor ID:", CPU.VendorID)
52 fmt.Println("Features:", strings.Join(CPU.FeatureSet(), ","))
53 fmt.Println("Cacheline bytes:", CPU.CacheLine)
54 fmt.Println("L1 Data Cache:", CPU.Cache.L1D, "bytes")
55 fmt.Println("L1 Instruction Cache:", CPU.Cache.L1I, "bytes")
56 fmt.Println("L2 Cache:", CPU.Cache.L2, "bytes")
57 fmt.Println("L3 Cache:", CPU.Cache.L3, "bytes")
58 fmt.Println("Frequency", CPU.Hz, "hz")
59
60 // Test if we have these specific features:
61 if CPU.Supports(SSE, SSE2) {
62 fmt.Println("We have Streaming SIMD 2 Extensions")
63 }
64}
65```
66
67Sample output:
68```
69>go run main.go
70Name: AMD Ryzen 9 3950X 16-Core Processor
71PhysicalCores: 16
72ThreadsPerCore: 2
73LogicalCores: 32
74Family 23 Model: 113 Vendor ID: AMD
75Features: ADX,AESNI,AVX,AVX2,BMI1,BMI2,CLMUL,CMOV,CX16,F16C,FMA3,HTT,HYPERVISOR,LZCNT,MMX,MMXEXT,NX,POPCNT,RDRAND,RDSEED,RDTSCP,SHA,SSE,SSE2,SSE3,SSE4,SSE42,SSE4A,SSSE3
76Cacheline bytes: 64
77L1 Data Cache: 32768 bytes
78L1 Instruction Cache: 32768 bytes
79L2 Cache: 524288 bytes
80L3 Cache: 16777216 bytes
81Frequency 0 hz
82We have Streaming SIMD 2 Extensions
83```
84
85# usage
86
87The `cpuid.CPU` provides access to CPU features. Use `cpuid.CPU.Supports()` to check for CPU features.
88A faster `cpuid.CPU.Has()` is provided which will usually be inlined by the gc compiler.
89
90To test a larger number of features, they can be combined using `f := CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SYSCALL, SSE, SSE2)`, etc.
91This can be used with `cpuid.CPU.HasAll(f)` to quickly test if all features are supported.
92
93Note that for some cpu/os combinations some features will not be detected.
94`amd64` has rather good support and should work reliably on all platforms.
95
96Note that hypervisors may not pass all CPU features through to the guest OS,
97so even if your host supports a feature it may not be visible on guests.
98
99## arm64 feature detection
100
101Not all operating systems provide ARM features directly
102and there is no safe way to do so for the rest.
103
104Currently `arm64/linux` and `arm64/freebsd` should be quite reliable.
105`arm64/darwin` adds features expected from the M1 processor, but a lot remains undetected.
106
107A `DetectARM()` can be used if you are able to control your deployment,
108it will detect CPU features, but may crash if the OS doesn't intercept the calls.
109A `-cpu.arm` flag for detecting unsafe ARM features can be added. See below.
110
111Note that currently only features are detected on ARM,
112no additional information is currently available.
113
114## flags
115
116It is possible to add flags that affect cpu detection.
117
118For this the `Flags()` command is provided.
119
120This must be called *before* `flag.Parse()`, AND `Detect()` must be called after the flags have been parsed.
121
122This means that any detection used in `init()` functions will not contain these flags.
123
124Example:
125
126```Go
127package main
128
129import (
130 "flag"
131 "fmt"
132 "strings"
133
134 "github.com/klauspost/cpuid/v2"
135)
136
137func main() {
138 cpuid.Flags()
139 flag.Parse()
140 cpuid.Detect()
141
142 // Test if we have these specific features:
143 if cpuid.CPU.Supports(cpuid.SSE, cpuid.SSE2) {
144 fmt.Println("We have Streaming SIMD 2 Extensions")
145 }
146}
147```
148
149## commandline
150
151Download as binary from: https://github.com/klauspost/cpuid/releases
152
153Install from source:
154
155`go install github.com/klauspost/cpuid/v2/cmd/cpuid@latest`
156
157### Example
158
159```
160λ cpuid
161Name: AMD Ryzen 9 3950X 16-Core Processor
162Vendor String: AuthenticAMD
163Vendor ID: AMD
164PhysicalCores: 16
165Threads Per Core: 2
166Logical Cores: 32
167CPU Family 23 Model: 113
168Features: ADX,AESNI,AVX,AVX2,BMI1,BMI2,CLMUL,CLZERO,CMOV,CMPXCHG8,CPBOOST,CX16,F16C,FMA3,FXSR,FXSROPT,HTT,HYPERVISOR,LAHF,LZCNT,MCAOVERFLOW,MMX,MMXEXT,MOVBE,NX,OSXSAVE,POPCNT,RDRAND,RDSEED,RDTSCP,SCE,SHA,SSE,SSE2,SSE3,SSE4,SSE42,SSE4A,SSSE3,SUCCOR,X87,XSAVE
169Microarchitecture level: 3
170Cacheline bytes: 64
171L1 Instruction Cache: 32768 bytes
172L1 Data Cache: 32768 bytes
173L2 Cache: 524288 bytes
174L3 Cache: 16777216 bytes
175
176```
177### JSON Output:
178
179```
180λ cpuid --json
181{
182 "BrandName": "AMD Ryzen 9 3950X 16-Core Processor",
183 "VendorID": 2,
184 "VendorString": "AuthenticAMD",
185 "PhysicalCores": 16,
186 "ThreadsPerCore": 2,
187 "LogicalCores": 32,
188 "Family": 23,
189 "Model": 113,
190 "CacheLine": 64,
191 "Hz": 0,
192 "BoostFreq": 0,
193 "Cache": {
194 "L1I": 32768,
195 "L1D": 32768,
196 "L2": 524288,
197 "L3": 16777216
198 },
199 "SGX": {
200 "Available": false,
201 "LaunchControl": false,
202 "SGX1Supported": false,
203 "SGX2Supported": false,
204 "MaxEnclaveSizeNot64": 0,
205 "MaxEnclaveSize64": 0,
206 "EPCSections": null
207 },
208 "Features": [
209 "ADX",
210 "AESNI",
211 "AVX",
212 "AVX2",
213 "BMI1",
214 "BMI2",
215 "CLMUL",
216 "CLZERO",
217 "CMOV",
218 "CMPXCHG8",
219 "CPBOOST",
220 "CX16",
221 "F16C",
222 "FMA3",
223 "FXSR",
224 "FXSROPT",
225 "HTT",
226 "HYPERVISOR",
227 "LAHF",
228 "LZCNT",
229 "MCAOVERFLOW",
230 "MMX",
231 "MMXEXT",
232 "MOVBE",
233 "NX",
234 "OSXSAVE",
235 "POPCNT",
236 "RDRAND",
237 "RDSEED",
238 "RDTSCP",
239 "SCE",
240 "SHA",
241 "SSE",
242 "SSE2",
243 "SSE3",
244 "SSE4",
245 "SSE42",
246 "SSE4A",
247 "SSSE3",
248 "SUCCOR",
249 "X87",
250 "XSAVE"
251 ],
252 "X64Level": 3
253}
254```
255
256### Check CPU microarch level
257
258```
259λ cpuid --check-level=3
2602022/03/18 17:04:40 AMD Ryzen 9 3950X 16-Core Processor
2612022/03/18 17:04:40 Microarchitecture level 3 is supported. Max level is 3.
262Exit Code 0
263
264λ cpuid --check-level=4
2652022/03/18 17:06:18 AMD Ryzen 9 3950X 16-Core Processor
2662022/03/18 17:06:18 Microarchitecture level 4 not supported. Max level is 3.
267Exit Code 1
268```
269
270
271## Available flags
272
273### x86 & amd64
274
275| Feature Flag | Description |
276|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
277| ADX | Intel ADX (Multi-Precision Add-Carry Instruction Extensions) |
278| AESNI | Advanced Encryption Standard New Instructions |
279| AMD3DNOW | AMD 3DNOW |
280| AMD3DNOWEXT | AMD 3DNowExt |
281| AMXBF16 | Tile computational operations on BFLOAT16 numbers |
282| AMXINT8 | Tile computational operations on 8-bit integers |
283| AMXFP16 | Tile computational operations on FP16 numbers |
284| AMXTILE | Tile architecture |
285| APX_F | Intel APX |
286| AVX | AVX functions |
287| AVX10 | If set the Intel AVX10 Converged Vector ISA is supported |
288| AVX10_128 | If set indicates that AVX10 128-bit vector support is present |
289| AVX10_256 | If set indicates that AVX10 256-bit vector support is present |
290| AVX10_512 | If set indicates that AVX10 512-bit vector support is present |
291| AVX2 | AVX2 functions |
292| AVX512BF16 | AVX-512 BFLOAT16 Instructions |
293| AVX512BITALG | AVX-512 Bit Algorithms |
294| AVX512BW | AVX-512 Byte and Word Instructions |
295| AVX512CD | AVX-512 Conflict Detection Instructions |
296| AVX512DQ | AVX-512 Doubleword and Quadword Instructions |
297| AVX512ER | AVX-512 Exponential and Reciprocal Instructions |
298| AVX512F | AVX-512 Foundation |
299| AVX512FP16 | AVX-512 FP16 Instructions |
300| AVX512IFMA | AVX-512 Integer Fused Multiply-Add Instructions |
301| AVX512PF | AVX-512 Prefetch Instructions |
302| AVX512VBMI | AVX-512 Vector Bit Manipulation Instructions |
303| AVX512VBMI2 | AVX-512 Vector Bit Manipulation Instructions, Version 2 |
304| AVX512VL | AVX-512 Vector Length Extensions |
305| AVX512VNNI | AVX-512 Vector Neural Network Instructions |
306| AVX512VP2INTERSECT | AVX-512 Intersect for D/Q |
307| AVX512VPOPCNTDQ | AVX-512 Vector Population Count Doubleword and Quadword |
308| AVXIFMA | AVX-IFMA instructions |
309| AVXNECONVERT | AVX-NE-CONVERT instructions |
310| AVXSLOW | Indicates the CPU performs 2 128 bit operations instead of one |
311| AVXVNNI | AVX (VEX encoded) VNNI neural network instructions |
312| AVXVNNIINT8 | AVX-VNNI-INT8 instructions |
313| BHI_CTRL | Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 |
314| BMI1 | Bit Manipulation Instruction Set 1 |
315| BMI2 | Bit Manipulation Instruction Set 2 |
316| CETIBT | Intel CET Indirect Branch Tracking |
317| CETSS | Intel CET Shadow Stack |
318| CLDEMOTE | Cache Line Demote |
319| CLMUL | Carry-less Multiplication |
320| CLZERO | CLZERO instruction supported |
321| CMOV | i686 CMOV |
322| CMPCCXADD | CMPCCXADD instructions |
323| CMPSB_SCADBS_SHORT | Fast short CMPSB and SCASB |
324| CMPXCHG8 | CMPXCHG8 instruction |
325| CPBOOST | Core Performance Boost |
326| CPPC | AMD: Collaborative Processor Performance Control |
327| CX16 | CMPXCHG16B Instruction |
328| EFER_LMSLE_UNS | AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ |
329| ENQCMD | Enqueue Command |
330| ERMS | Enhanced REP MOVSB/STOSB |
331| F16C | Half-precision floating-point conversion |
332| FLUSH_L1D | Flush L1D cache |
333| FMA3 | Intel FMA 3. Does not imply AVX. |
334| FMA4 | Bulldozer FMA4 functions |
335| FP128 | AMD: When set, the internal FP/SIMD execution datapath is 128-bits wide |
336| FP256 | AMD: When set, the internal FP/SIMD execution datapath is 256-bits wide |
337| FSRM | Fast Short Rep Mov |
338| FXSR | FXSAVE, FXRESTOR instructions, CR4 bit 9 |
339| FXSROPT | FXSAVE/FXRSTOR optimizations |
340| GFNI | Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage. |
341| HLE | Hardware Lock Elision |
342| HRESET | If set CPU supports history reset and the IA32_HRESET_ENABLE MSR |
343| HTT | Hyperthreading (enabled) |
344| HWA | Hardware assert supported. Indicates support for MSRC001_10 |
345| HYBRID_CPU | This part has CPUs of more than one type. |
346| HYPERVISOR | This bit has been reserved by Intel & AMD for use by hypervisors |
347| IA32_ARCH_CAP | IA32_ARCH_CAPABILITIES MSR (Intel) |
348| IA32_CORE_CAP | IA32_CORE_CAPABILITIES MSR |
349| IBPB | Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) |
350| IBRS | AMD: Indirect Branch Restricted Speculation |
351| IBRS_PREFERRED | AMD: IBRS is preferred over software solution |
352| IBRS_PROVIDES_SMP | AMD: IBRS provides Same Mode Protection |
353| IBS | Instruction Based Sampling (AMD) |
354| IBSBRNTRGT | Instruction Based Sampling Feature (AMD) |
355| IBSFETCHSAM | Instruction Based Sampling Feature (AMD) |
356| IBSFFV | Instruction Based Sampling Feature (AMD) |
357| IBSOPCNT | Instruction Based Sampling Feature (AMD) |
358| IBSOPCNTEXT | Instruction Based Sampling Feature (AMD) |
359| IBSOPSAM | Instruction Based Sampling Feature (AMD) |
360| IBSRDWROPCNT | Instruction Based Sampling Feature (AMD) |
361| IBSRIPINVALIDCHK | Instruction Based Sampling Feature (AMD) |
362| IBS_FETCH_CTLX | AMD: IBS fetch control extended MSR supported |
363| IBS_OPDATA4 | AMD: IBS op data 4 MSR supported |
364| IBS_OPFUSE | AMD: Indicates support for IbsOpFuse |
365| IBS_PREVENTHOST | Disallowing IBS use by the host supported |
366| IBS_ZEN4 | Fetch and Op IBS support IBS extensions added with Zen4 |
367| IDPRED_CTRL | IPRED_DIS |
368| INT_WBINVD | WBINVD/WBNOINVD are interruptible. |
369| INVLPGB | INVLPGB and TLBSYNC instructions supported |
370| KEYLOCKER | Key locker |
371| KEYLOCKERW | Key locker wide |
372| LAHF | LAHF/SAHF in long mode |
373| LAM | If set, CPU supports Linear Address Masking |
374| LBRVIRT | LBR virtualization |
375| LZCNT | LZCNT instruction |
376| MCAOVERFLOW | MCA overflow recovery support. |
377| MCDT_NO | Processor does not exhibit MXCSR Configuration Dependent Timing behavior and does not need to mitigate it. |
378| MCOMMIT | MCOMMIT instruction supported |
379| MD_CLEAR | VERW clears CPU buffers |
380| MMX | standard MMX |
381| MMXEXT | SSE integer functions or AMD MMX ext |
382| MOVBE | MOVBE instruction (big-endian) |
383| MOVDIR64B | Move 64 Bytes as Direct Store |
384| MOVDIRI | Move Doubleword as Direct Store |
385| MOVSB_ZL | Fast Zero-Length MOVSB |
386| MPX | Intel MPX (Memory Protection Extensions) |
387| MOVU | MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD |
388| MSRIRC | Instruction Retired Counter MSR available |
389| MSRLIST | Read/Write List of Model Specific Registers |
390| MSR_PAGEFLUSH | Page Flush MSR available |
391| NRIPS | Indicates support for NRIP save on VMEXIT |
392| NX | NX (No-Execute) bit |
393| OSXSAVE | XSAVE enabled by OS |
394| PCONFIG | PCONFIG for Intel Multi-Key Total Memory Encryption |
395| POPCNT | POPCNT instruction |
396| PPIN | AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled |
397| PREFETCHI | PREFETCHIT0/1 instructions |
398| PSFD | Predictive Store Forward Disable |
399| RDPRU | RDPRU instruction supported |
400| RDRAND | RDRAND instruction is available |
401| RDSEED | RDSEED instruction is available |
402| RDTSCP | RDTSCP Instruction |
403| RRSBA_CTRL | Restricted RSB Alternate |
404| RTM | Restricted Transactional Memory |
405| RTM_ALWAYS_ABORT | Indicates that the loaded microcode is forcing RTM abort. |
406| SERIALIZE | Serialize Instruction Execution |
407| SEV | AMD Secure Encrypted Virtualization supported |
408| SEV_64BIT | AMD SEV guest execution only allowed from a 64-bit host |
409| SEV_ALTERNATIVE | AMD SEV Alternate Injection supported |
410| SEV_DEBUGSWAP | Full debug state swap supported for SEV-ES guests |
411| SEV_ES | AMD SEV Encrypted State supported |
412| SEV_RESTRICTED | AMD SEV Restricted Injection supported |
413| SEV_SNP | AMD SEV Secure Nested Paging supported |
414| SGX | Software Guard Extensions |
415| SGXLC | Software Guard Extensions Launch Control |
416| SHA | Intel SHA Extensions |
417| SME | AMD Secure Memory Encryption supported |
418| SME_COHERENT | AMD Hardware cache coherency across encryption domains enforced |
419| SPEC_CTRL_SSBD | Speculative Store Bypass Disable |
420| SRBDS_CTRL | SRBDS mitigation MSR available |
421| SSE | SSE functions |
422| SSE2 | P4 SSE functions |
423| SSE3 | Prescott SSE3 functions |
424| SSE4 | Penryn SSE4.1 functions |
425| SSE42 | Nehalem SSE4.2 functions |
426| SSE4A | AMD Barcelona microarchitecture SSE4a instructions |
427| SSSE3 | Conroe SSSE3 functions |
428| STIBP | Single Thread Indirect Branch Predictors |
429| STIBP_ALWAYSON | AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On |
430| STOSB_SHORT | Fast short STOSB |
431| SUCCOR | Software uncorrectable error containment and recovery capability. |
432| SVM | AMD Secure Virtual Machine |
433| SVMDA | Indicates support for the SVM decode assists. |
434| SVMFBASID | SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control |
435| SVML | AMD SVM lock. Indicates support for SVM-Lock. |
436| SVMNP | AMD SVM nested paging |
437| SVMPF | SVM pause intercept filter. Indicates support for the pause intercept filter |
438| SVMPFT | SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold |
439| SYSCALL | System-Call Extension (SCE): SYSCALL and SYSRET instructions. |
440| SYSEE | SYSENTER and SYSEXIT instructions |
441| TBM | AMD Trailing Bit Manipulation |
442| TDX_GUEST | Intel Trust Domain Extensions Guest |
443| TLB_FLUSH_NESTED | AMD: Flushing includes all the nested translations for guest translations |
444| TME | Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE. |
445| TOPEXT | TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX. |
446| TSCRATEMSR | MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104 |
447| TSXLDTRK | Intel TSX Suspend Load Address Tracking |
448| VAES | Vector AES. AVX(512) versions require additional checks. |
449| VMCBCLEAN | VMCB clean bits. Indicates support for VMCB clean bits. |
450| VMPL | AMD VM Permission Levels supported |
451| VMSA_REGPROT | AMD VMSA Register Protection supported |
452| VMX | Virtual Machine Extensions |
453| VPCLMULQDQ | Carry-Less Multiplication Quadword. Requires AVX for 3 register versions. |
454| VTE | AMD Virtual Transparent Encryption supported |
455| WAITPKG | TPAUSE, UMONITOR, UMWAIT |
456| WBNOINVD | Write Back and Do Not Invalidate Cache |
457| WRMSRNS | Non-Serializing Write to Model Specific Register |
458| X87 | FPU |
459| XGETBV1 | Supports XGETBV with ECX = 1 |
460| XOP | Bulldozer XOP functions |
461| XSAVE | XSAVE, XRESTOR, XSETBV, XGETBV |
462| XSAVEC | Supports XSAVEC and the compacted form of XRSTOR. |
463| XSAVEOPT | XSAVEOPT available |
464| XSAVES | Supports XSAVES/XRSTORS and IA32_XSS |
465
466# ARM features:
467
468| Feature Flag | Description |
469|--------------|------------------------------------------------------------------|
470| AESARM | AES instructions |
471| ARMCPUID | Some CPU ID registers readable at user-level |
472| ASIMD | Advanced SIMD |
473| ASIMDDP | SIMD Dot Product |
474| ASIMDHP | Advanced SIMD half-precision floating point |
475| ASIMDRDM | Rounding Double Multiply Accumulate/Subtract (SQRDMLAH/SQRDMLSH) |
476| ATOMICS | Large System Extensions (LSE) |
477| CRC32 | CRC32/CRC32C instructions |
478| DCPOP | Data cache clean to Point of Persistence (DC CVAP) |
479| EVTSTRM | Generic timer |
480| FCMA | Floating point complex number addition and multiplication |
481| FP | Single-precision and double-precision floating point |
482| FPHP | Half-precision floating point |
483| GPA | Generic Pointer Authentication |
484| JSCVT | Javascript-style double->int convert (FJCVTZS) |
485| LRCPC | Weaker release consistency (LDAPR, etc) |
486| PMULL | Polynomial Multiply instructions (PMULL/PMULL2) |
487| SHA1 | SHA-1 instructions (SHA1C, etc) |
488| SHA2 | SHA-2 instructions (SHA256H, etc) |
489| SHA3 | SHA-3 instructions (EOR3, RAXI, XAR, BCAX) |
490| SHA512 | SHA512 instructions |
491| SM3 | SM3 instructions |
492| SM4 | SM4 instructions |
493| SVE | Scalable Vector Extension |
494
495# license
496
497This code is published under an MIT license. See LICENSE file for more information.
diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go
new file mode 100644
index 0000000..15b7603
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go
@@ -0,0 +1,1473 @@
1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3// Package cpuid provides information about the CPU running the current program.
4//
5// CPU features are detected on startup, and kept for fast access through the life of the application.
6// Currently x86 / x64 (AMD64) as well as arm64 is supported.
7//
8// You can access the CPU information by accessing the shared CPU variable of the cpuid library.
9//
10// Package home: https://github.com/klauspost/cpuid
11package cpuid
12
13import (
14 "flag"
15 "fmt"
16 "math"
17 "math/bits"
18 "os"
19 "runtime"
20 "strings"
21)
22
23// AMD reference: https://www.amd.com/system/files/TechDocs/25481.pdf
24// and Processor Programming Reference (PPR)
25
26// Vendor is a representation of a CPU vendor.
27type Vendor int
28
29const (
30 VendorUnknown Vendor = iota
31 Intel
32 AMD
33 VIA
34 Transmeta
35 NSC
36 KVM // Kernel-based Virtual Machine
37 MSVM // Microsoft Hyper-V or Windows Virtual PC
38 VMware
39 XenHVM
40 Bhyve
41 Hygon
42 SiS
43 RDC
44
45 Ampere
46 ARM
47 Broadcom
48 Cavium
49 DEC
50 Fujitsu
51 Infineon
52 Motorola
53 NVIDIA
54 AMCC
55 Qualcomm
56 Marvell
57
58 lastVendor
59)
60
61//go:generate stringer -type=FeatureID,Vendor
62
63// FeatureID is the ID of a specific cpu feature.
64type FeatureID int
65
66const (
67 // Keep index -1 as unknown
68 UNKNOWN = -1
69
70 // Add features
71 ADX FeatureID = iota // Intel ADX (Multi-Precision Add-Carry Instruction Extensions)
72 AESNI // Advanced Encryption Standard New Instructions
73 AMD3DNOW // AMD 3DNOW
74 AMD3DNOWEXT // AMD 3DNowExt
75 AMXBF16 // Tile computational operations on BFLOAT16 numbers
76 AMXFP16 // Tile computational operations on FP16 numbers
77 AMXINT8 // Tile computational operations on 8-bit integers
78 AMXTILE // Tile architecture
79 APX_F // Intel APX
80 AVX // AVX functions
81 AVX10 // If set the Intel AVX10 Converged Vector ISA is supported
82 AVX10_128 // If set indicates that AVX10 128-bit vector support is present
83 AVX10_256 // If set indicates that AVX10 256-bit vector support is present
84 AVX10_512 // If set indicates that AVX10 512-bit vector support is present
85 AVX2 // AVX2 functions
86 AVX512BF16 // AVX-512 BFLOAT16 Instructions
87 AVX512BITALG // AVX-512 Bit Algorithms
88 AVX512BW // AVX-512 Byte and Word Instructions
89 AVX512CD // AVX-512 Conflict Detection Instructions
90 AVX512DQ // AVX-512 Doubleword and Quadword Instructions
91 AVX512ER // AVX-512 Exponential and Reciprocal Instructions
92 AVX512F // AVX-512 Foundation
93 AVX512FP16 // AVX-512 FP16 Instructions
94 AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions
95 AVX512PF // AVX-512 Prefetch Instructions
96 AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions
97 AVX512VBMI2 // AVX-512 Vector Bit Manipulation Instructions, Version 2
98 AVX512VL // AVX-512 Vector Length Extensions
99 AVX512VNNI // AVX-512 Vector Neural Network Instructions
100 AVX512VP2INTERSECT // AVX-512 Intersect for D/Q
101 AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword
102 AVXIFMA // AVX-IFMA instructions
103 AVXNECONVERT // AVX-NE-CONVERT instructions
104 AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one
105 AVXVNNI // AVX (VEX encoded) VNNI neural network instructions
106 AVXVNNIINT8 // AVX-VNNI-INT8 instructions
107 BHI_CTRL // Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598
108 BMI1 // Bit Manipulation Instruction Set 1
109 BMI2 // Bit Manipulation Instruction Set 2
110 CETIBT // Intel CET Indirect Branch Tracking
111 CETSS // Intel CET Shadow Stack
112 CLDEMOTE // Cache Line Demote
113 CLMUL // Carry-less Multiplication
114 CLZERO // CLZERO instruction supported
115 CMOV // i686 CMOV
116 CMPCCXADD // CMPCCXADD instructions
117 CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB
118 CMPXCHG8 // CMPXCHG8 instruction
119 CPBOOST // Core Performance Boost
120 CPPC // AMD: Collaborative Processor Performance Control
121 CX16 // CMPXCHG16B Instruction
122 EFER_LMSLE_UNS // AMD: =Core::X86::Msr::EFER[LMSLE] is not supported, and MBZ
123 ENQCMD // Enqueue Command
124 ERMS // Enhanced REP MOVSB/STOSB
125 F16C // Half-precision floating-point conversion
126 FLUSH_L1D // Flush L1D cache
127 FMA3 // Intel FMA 3. Does not imply AVX.
128 FMA4 // Bulldozer FMA4 functions
129 FP128 // AMD: When set, the internal FP/SIMD execution datapath is no more than 128-bits wide
130 FP256 // AMD: When set, the internal FP/SIMD execution datapath is no more than 256-bits wide
131 FSRM // Fast Short Rep Mov
132 FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9
133 FXSROPT // FXSAVE/FXRSTOR optimizations
134 GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage.
135 HLE // Hardware Lock Elision
136 HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR
137 HTT // Hyperthreading (enabled)
138 HWA // Hardware assert supported. Indicates support for MSRC001_10
139 HYBRID_CPU // This part has CPUs of more than one type.
140 HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors
141 IA32_ARCH_CAP // IA32_ARCH_CAPABILITIES MSR (Intel)
142 IA32_CORE_CAP // IA32_CORE_CAPABILITIES MSR
143 IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB)
144 IBRS // AMD: Indirect Branch Restricted Speculation
145 IBRS_PREFERRED // AMD: IBRS is preferred over software solution
146 IBRS_PROVIDES_SMP // AMD: IBRS provides Same Mode Protection
147 IBS // Instruction Based Sampling (AMD)
148 IBSBRNTRGT // Instruction Based Sampling Feature (AMD)
149 IBSFETCHSAM // Instruction Based Sampling Feature (AMD)
150 IBSFFV // Instruction Based Sampling Feature (AMD)
151 IBSOPCNT // Instruction Based Sampling Feature (AMD)
152 IBSOPCNTEXT // Instruction Based Sampling Feature (AMD)
153 IBSOPSAM // Instruction Based Sampling Feature (AMD)
154 IBSRDWROPCNT // Instruction Based Sampling Feature (AMD)
155 IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD)
156 IBS_FETCH_CTLX // AMD: IBS fetch control extended MSR supported
157 IBS_OPDATA4 // AMD: IBS op data 4 MSR supported
158 IBS_OPFUSE // AMD: Indicates support for IbsOpFuse
159 IBS_PREVENTHOST // Disallowing IBS use by the host supported
160 IBS_ZEN4 // AMD: Fetch and Op IBS support IBS extensions added with Zen4
161 IDPRED_CTRL // IPRED_DIS
162 INT_WBINVD // WBINVD/WBNOINVD are interruptible.
163 INVLPGB // INVLPGB and TLBSYNC instructions supported
164 KEYLOCKER // Key locker
165 KEYLOCKERW // Key locker wide
166 LAHF // LAHF/SAHF in long mode
167 LAM // If set, CPU supports Linear Address Masking
168 LBRVIRT // LBR virtualization
169 LZCNT // LZCNT instruction
170 MCAOVERFLOW // MCA overflow recovery support.
171 MCDT_NO // Processor do not exhibit MXCSR Configuration Dependent Timing behavior and do not need to mitigate it.
172 MCOMMIT // MCOMMIT instruction supported
173 MD_CLEAR // VERW clears CPU buffers
174 MMX // standard MMX
175 MMXEXT // SSE integer functions or AMD MMX ext
176 MOVBE // MOVBE instruction (big-endian)
177 MOVDIR64B // Move 64 Bytes as Direct Store
178 MOVDIRI // Move Doubleword as Direct Store
179 MOVSB_ZL // Fast Zero-Length MOVSB
180 MOVU // AMD: MOVU SSE instructions are more efficient and should be preferred to SSE MOVL/MOVH. MOVUPS is more efficient than MOVLPS/MOVHPS. MOVUPD is more efficient than MOVLPD/MOVHPD
181 MPX // Intel MPX (Memory Protection Extensions)
182 MSRIRC // Instruction Retired Counter MSR available
183 MSRLIST // Read/Write List of Model Specific Registers
184 MSR_PAGEFLUSH // Page Flush MSR available
185 NRIPS // Indicates support for NRIP save on VMEXIT
186 NX // NX (No-Execute) bit
187 OSXSAVE // XSAVE enabled by OS
188 PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption
189 POPCNT // POPCNT instruction
190 PPIN // AMD: Protected Processor Inventory Number support. Indicates that Protected Processor Inventory Number (PPIN) capability can be enabled
191 PREFETCHI // PREFETCHIT0/1 instructions
192 PSFD // Predictive Store Forward Disable
193 RDPRU // RDPRU instruction supported
194 RDRAND // RDRAND instruction is available
195 RDSEED // RDSEED instruction is available
196 RDTSCP // RDTSCP Instruction
197 RRSBA_CTRL // Restricted RSB Alternate
198 RTM // Restricted Transactional Memory
199 RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort.
200 SERIALIZE // Serialize Instruction Execution
201 SEV // AMD Secure Encrypted Virtualization supported
202 SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host
203 SEV_ALTERNATIVE // AMD SEV Alternate Injection supported
204 SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests
205 SEV_ES // AMD SEV Encrypted State supported
206 SEV_RESTRICTED // AMD SEV Restricted Injection supported
207 SEV_SNP // AMD SEV Secure Nested Paging supported
208 SGX // Software Guard Extensions
209 SGXLC // Software Guard Extensions Launch Control
210 SHA // Intel SHA Extensions
211 SME // AMD Secure Memory Encryption supported
212 SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced
213 SPEC_CTRL_SSBD // Speculative Store Bypass Disable
214 SRBDS_CTRL // SRBDS mitigation MSR available
215 SSE // SSE functions
216 SSE2 // P4 SSE functions
217 SSE3 // Prescott SSE3 functions
218 SSE4 // Penryn SSE4.1 functions
219 SSE42 // Nehalem SSE4.2 functions
220 SSE4A // AMD Barcelona microarchitecture SSE4a instructions
221 SSSE3 // Conroe SSSE3 functions
222 STIBP // Single Thread Indirect Branch Predictors
223 STIBP_ALWAYSON // AMD: Single Thread Indirect Branch Prediction Mode has Enhanced Performance and may be left Always On
224 STOSB_SHORT // Fast short STOSB
225 SUCCOR // Software uncorrectable error containment and recovery capability.
226 SVM // AMD Secure Virtual Machine
227 SVMDA // Indicates support for the SVM decode assists.
228 SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control
229 SVML // AMD SVM lock. Indicates support for SVM-Lock.
230 SVMNP // AMD SVM nested paging
231 SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter
232 SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold
233 SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions.
234 SYSEE // SYSENTER and SYSEXIT instructions
235 TBM // AMD Trailing Bit Manipulation
236 TDX_GUEST // Intel Trust Domain Extensions Guest
237 TLB_FLUSH_NESTED // AMD: Flushing includes all the nested translations for guest translations
238 TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE.
239 TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX.
240 TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104
241 TSXLDTRK // Intel TSX Suspend Load Address Tracking
242 VAES // Vector AES. AVX(512) versions requires additional checks.
243 VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits.
244 VMPL // AMD VM Permission Levels supported
245 VMSA_REGPROT // AMD VMSA Register Protection supported
246 VMX // Virtual Machine Extensions
247 VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions.
248 VTE // AMD Virtual Transparent Encryption supported
249 WAITPKG // TPAUSE, UMONITOR, UMWAIT
250 WBNOINVD // Write Back and Do Not Invalidate Cache
251 WRMSRNS // Non-Serializing Write to Model Specific Register
252 X87 // FPU
253 XGETBV1 // Supports XGETBV with ECX = 1
254 XOP // Bulldozer XOP functions
255 XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV
256 XSAVEC // Supports XSAVEC and the compacted form of XRSTOR.
257 XSAVEOPT // XSAVEOPT available
258 XSAVES // Supports XSAVES/XRSTORS and IA32_XSS
259
260 // ARM features:
261 AESARM // AES instructions
262 ARMCPUID // Some CPU ID registers readable at user-level
263 ASIMD // Advanced SIMD
264 ASIMDDP // SIMD Dot Product
265 ASIMDHP // Advanced SIMD half-precision floating point
266 ASIMDRDM // Rounding Double Multiply Accumulate/Subtract (SQRDMLAH/SQRDMLSH)
267 ATOMICS // Large System Extensions (LSE)
268 CRC32 // CRC32/CRC32C instructions
269 DCPOP // Data cache clean to Point of Persistence (DC CVAP)
270 EVTSTRM // Generic timer
271 FCMA // Floating point complex number addition and multiplication
272 FP // Single-precision and double-precision floating point
273 FPHP // Half-precision floating point
274 GPA // Generic Pointer Authentication
275 JSCVT // Javascript-style double->int convert (FJCVTZS)
276 LRCPC // Weaker release consistency (LDAPR, etc)
277 PMULL // Polynomial Multiply instructions (PMULL/PMULL2)
278 SHA1 // SHA-1 instructions (SHA1C, etc)
279 SHA2 // SHA-2 instructions (SHA256H, etc)
280 SHA3 // SHA-3 instructions (EOR3, RAXI, XAR, BCAX)
281 SHA512 // SHA512 instructions
282 SM3 // SM3 instructions
283 SM4 // SM4 instructions
284 SVE // Scalable Vector Extension
285 // Keep it last. It automatically defines the size of []flagSet
286 lastID
287
288 firstID FeatureID = UNKNOWN + 1
289)
290
291// CPUInfo contains information about the detected system CPU.
// The exported fields are plain data; the feature flags themselves are kept
// in the unexported featureSet and queried through the methods on CPUInfo.
292type CPUInfo struct {
293 BrandName string // Brand name reported by the CPU
294 VendorID Vendor // Comparable CPU vendor ID
295 VendorString string // Raw vendor string.
296 featureSet flagSet // Features of the CPU
297 PhysicalCores int // Number of physical processor cores in your CPU. Will be 0 if undetectable.
298 ThreadsPerCore int // Number of threads per physical core. Will be 1 if undetectable.
299 LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable.
300 Family int // CPU family number
301 Model int // CPU model number
302 Stepping int // CPU stepping info
303 CacheLine int // Cache line size in bytes. Will be 0 if undetectable.
304 Hz int64 // Clock speed, if known, 0 otherwise. Will attempt to contain base clock speed.
305 BoostFreq int64 // Max clock speed, if known, 0 otherwise
306 Cache struct {
307 L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected
308 L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected
309 L2 int // L2 Cache (per core or shared). Will be -1 if undetected
310 L3 int // L3 Cache (per core, per ccx or shared). Will be -1 if undetected
311 }
312 SGX SGXSupport // Software Guard Extensions details; SGXSupport is declared outside this section
313 AVX10Level uint8 // NOTE(review): presumably the AVX10 version reported by the CPU - confirm where it is filled in
314 maxFunc uint32 // NOTE(review): presumably the highest supported standard CPUID function (LogicalCPU requires it to be >= 1) - confirm
315 maxExFunc uint32 // NOTE(review): presumably the highest supported extended CPUID function - confirm
316}
317
318var cpuid func(op uint32) (eax, ebx, ecx, edx uint32)
319var cpuidex func(op, op2 uint32) (eax, ebx, ecx, edx uint32)
320var xgetbv func(index uint32) (eax, edx uint32)
321var rdtscpAsm func() (eax, ebx, ecx, edx uint32)
322var darwinHasAVX512 = func() bool { return false }
323
324// CPU contains information about the CPU as detected on startup,
325// or when Detect last was called.
326//
327// Use this as the primary entry point to your data.
328var CPU CPUInfo
329
// init runs at package load: it calls initCPU (defined outside this section)
// and then Detect, so the exported CPU variable is filled in before any
// importing package can read it. Flags registered through Flags are not yet
// parsed at this point, so this first detection ignores them.
330func init() {
331 initCPU()
332 Detect()
333}
334
335// Detect will re-detect current CPU info.
336// This will replace the content of the exported CPU variable.
337//
338// Unless you expect the CPU to change while you are running your program
339// you should not need to call this function.
340// If you call this, you must ensure that no other goroutine is accessing the
341// exported CPU variable.
342func Detect() {
343 // Set defaults
344 CPU.ThreadsPerCore = 1
345 CPU.Cache.L1I = -1
346 CPU.Cache.L1D = -1
347 CPU.Cache.L2 = -1
348 CPU.Cache.L3 = -1
349 safe := true
350 if detectArmFlag != nil {
351 safe = !*detectArmFlag
352 }
353 addInfo(&CPU, safe)
354 if displayFeats != nil && *displayFeats {
355 fmt.Println("cpu features:", strings.Join(CPU.FeatureSet(), ","))
356 // Exit with non-zero so tests will print value.
357 os.Exit(1)
358 }
359 if disableFlag != nil {
360 s := strings.Split(*disableFlag, ",")
361 for _, feat := range s {
362 feat := ParseFeature(strings.TrimSpace(feat))
363 if feat != UNKNOWN {
364 CPU.featureSet.unset(feat)
365 }
366 }
367 }
368}
369
370// DetectARM will detect ARM64 features.
371// This is NOT done automatically since it can potentially crash
372// if the OS does not handle the command.
373// If in the future this can be done safely this function may not
374// do anything.
// It re-runs addInfo on the exported CPU variable with the safe argument set
// to false, whereas Detect passes true unless the cpu.arm flag is set.
375func DetectARM() {
376 addInfo(&CPU, false)
377}
378
// Command line flag values. All three stay nil until Flags has been called;
// Detect treats a nil pointer as "flag not registered".
var (
	detectArmFlag *bool   // -cpu.arm
	displayFeats  *bool   // -cpu.features
	disableFlag   *string // -cpu.disable
)

// Flags registers the cpu.* command line flags on the default flag set.
//
// Call it *before* flag.Parse, and call Detect again after the flags have
// been parsed so that they take effect. Detection that already ran from
// init() functions is not influenced by these flags.
func Flags() {
	detectArmFlag = flag.Bool("cpu.arm", false, "allow ARM features to be detected; can potentially crash")
	displayFeats = flag.Bool("cpu.features", false, "lists cpu features and exits")
	disableFlag = flag.String("cpu.disable", "", "disable cpu features; comma separated list")
}
393
394// Supports returns whether the CPU supports all of the requested features.
395func (c CPUInfo) Supports(ids ...FeatureID) bool {
396 for _, id := range ids {
397 if !c.featureSet.inSet(id) {
398 return false
399 }
400 }
401 return true
402}
403
404// Has allows for checking a single feature.
405// Should be inlined by the compiler.
// It reports whether id is set in c's feature set. Unlike Supports it takes
// exactly one ID and uses a pointer receiver.
406func (c *CPUInfo) Has(id FeatureID) bool {
407 return c.featureSet.inSet(id)
408}
409
410// AnyOf returns whether the CPU supports one or more of the requested features.
411func (c CPUInfo) AnyOf(ids ...FeatureID) bool {
412 for _, id := range ids {
413 if c.featureSet.inSet(id) {
414 return true
415 }
416 }
417 return false
418}
419
420// Features contains several features combined for a fast check using
421// CPUInfo.HasAll
422type Features *flagSet
423
424// CombineFeatures allows to combine several features for a close to constant time lookup.
425func CombineFeatures(ids ...FeatureID) Features {
426 var v flagSet
427 for _, id := range ids {
428 v.set(id)
429 }
430 return &v
431}
432
// HasAll reports whether every feature contained in f is set in c's feature
// set. f is typically built once with CombineFeatures and reused.
433func (c *CPUInfo) HasAll(f Features) bool {
434 return c.featureSet.hasSetP(f)
435}
436
437// https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels
438var oneOfLevel = CombineFeatures(SYSEE, SYSCALL)
439var level1Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2)
440var level2Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3)
441var level3Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE)
442var level4Features = CombineFeatures(CMOV, CMPXCHG8, X87, FXSR, MMX, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE, AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL)
443
444// X64Level returns the microarchitecture level detected on the CPU.
445// If features are lacking or non x64 mode, 0 is returned.
446// See https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels
447func (c CPUInfo) X64Level() int {
448 if !c.featureSet.hasOneOf(oneOfLevel) {
449 return 0
450 }
451 if c.featureSet.hasSetP(level4Features) {
452 return 4
453 }
454 if c.featureSet.hasSetP(level3Features) {
455 return 3
456 }
457 if c.featureSet.hasSetP(level2Features) {
458 return 2
459 }
460 if c.featureSet.hasSetP(level1Features) {
461 return 1
462 }
463 return 0
464}
465
466// Disable will disable one or several features.
467func (c *CPUInfo) Disable(ids ...FeatureID) bool {
468 for _, id := range ids {
469 c.featureSet.unset(id)
470 }
471 return true
472}
473
474// Enable will disable one or several features even if they were undetected.
475// This is of course not recommended for obvious reasons.
476func (c *CPUInfo) Enable(ids ...FeatureID) bool {
477 for _, id := range ids {
478 c.featureSet.set(id)
479 }
480 return true
481}
482
483// IsVendor returns true if the detected vendor ID of c equals v.
484func (c CPUInfo) IsVendor(v Vendor) bool {
485 return c.VendorID == v
486}
487
488// FeatureSet returns all available features as strings.
489func (c CPUInfo) FeatureSet() []string {
490 s := make([]string, 0, c.featureSet.nEnabled())
491 s = append(s, c.featureSet.Strings()...)
492 return s
493}
494
495// RTCounter returns the 64-bit time-stamp counter
496// Uses the RDTSCP instruction. The value 0 is returned
497// if the CPU does not support the instruction.
498func (c CPUInfo) RTCounter() uint64 {
499 if !c.Supports(RDTSCP) {
500 return 0
501 }
502 a, _, _, d := rdtscpAsm()
503 return uint64(a) | (uint64(d) << 32)
504}
505
506// Ia32TscAux returns the IA32_TSC_AUX part of the RDTSCP.
507// This variable is OS dependent, but on Linux contains information
508// about the current cpu/core the code is running on.
509// If the RDTSCP instruction isn't supported on the CPU, the value 0 is returned.
510func (c CPUInfo) Ia32TscAux() uint32 {
511 if !c.Supports(RDTSCP) {
512 return 0
513 }
514 _, _, ecx, _ := rdtscpAsm()
515 return ecx
516}
517
518// LogicalCPU will return the Logical CPU the code is currently executing on.
519// This is likely to change when the OS re-schedules the running thread
520// to another CPU.
521// If the current core cannot be detected, -1 will be returned.
522func (c CPUInfo) LogicalCPU() int {
523 if c.maxFunc < 1 {
524 return -1
525 }
526 _, ebx, _, _ := cpuid(1)
527 return int(ebx >> 24)
528}
529
530// frequencies tries to compute the clock speed of the CPU. If leaf 15 is
531// supported, use it, otherwise parse the brand string. Yes, really.
532func (c *CPUInfo) frequencies() {
533 c.Hz, c.BoostFreq = 0, 0
534 mfi := maxFunctionID()
535 if mfi >= 0x15 {
536 eax, ebx, ecx, _ := cpuid(0x15)
537 if eax != 0 && ebx != 0 && ecx != 0 {
538 c.Hz = (int64(ecx) * int64(ebx)) / int64(eax)
539 }
540 }
541 if mfi >= 0x16 {
542 a, b, _, _ := cpuid(0x16)
543 // Base...
544 if a&0xffff > 0 {
545 c.Hz = int64(a&0xffff) * 1_000_000
546 }
547 // Boost...
548 if b&0xffff > 0 {
549 c.BoostFreq = int64(b&0xffff) * 1_000_000
550 }
551 }
552 if c.Hz > 0 {
553 return
554 }
555
556 // computeHz determines the official rated speed of a CPU from its brand
557 // string. This insanity is *actually the official documented way to do
558 // this according to Intel*, prior to leaf 0x15 existing. The official
559 // documentation only shows this working for exactly `x.xx` or `xxxx`
560 // cases, e.g., `2.50GHz` or `1300MHz`; this parser will accept other
561 // sizes.
562 model := c.BrandName
563 hz := strings.LastIndex(model, "Hz")
564 if hz < 3 {
565 return
566 }
567 var multiplier int64
568 switch model[hz-1] {
569 case 'M':
570 multiplier = 1000 * 1000
571 case 'G':
572 multiplier = 1000 * 1000 * 1000
573 case 'T':
574 multiplier = 1000 * 1000 * 1000 * 1000
575 }
576 if multiplier == 0 {
577 return
578 }
579 freq := int64(0)
580 divisor := int64(0)
581 decimalShift := int64(1)
582 var i int
583 for i = hz - 2; i >= 0 && model[i] != ' '; i-- {
584 if model[i] >= '0' && model[i] <= '9' {
585 freq += int64(model[i]-'0') * decimalShift
586 decimalShift *= 10
587 } else if model[i] == '.' {
588 if divisor != 0 {
589 return
590 }
591 divisor = decimalShift
592 } else {
593 return
594 }
595 }
596 // we didn't find a space
597 if i < 0 {
598 return
599 }
600 if divisor != 0 {
601 c.Hz = (freq * multiplier) / divisor
602 return
603 }
604 c.Hz = freq * multiplier
605}
606
607// VM Will return true if the cpu id indicates we are in
608// a virtual machine.
609func (c CPUInfo) VM() bool {
610 return CPU.featureSet.inSet(HYPERVISOR)
611}
612
613// flags contains detected cpu features and characteristics
614type flags uint64
615
616// log2(bits_in_uint64)
617const flagBitsLog2 = 6
618const flagBits = 1 << flagBitsLog2
619const flagMask = flagBits - 1
620
621// flagSet contains detected cpu features and characteristics in an array of flags
622type flagSet [(lastID + flagMask) / flagBits]flags
623
624func (s *flagSet) inSet(feat FeatureID) bool {
625 return s[feat>>flagBitsLog2]&(1<<(feat&flagMask)) != 0
626}
627
628func (s *flagSet) set(feat FeatureID) {
629 s[feat>>flagBitsLog2] |= 1 << (feat & flagMask)
630}
631
632// setIf will set a feature if boolean is true.
633func (s *flagSet) setIf(cond bool, features ...FeatureID) {
634 if cond {
635 for _, offset := range features {
636 s[offset>>flagBitsLog2] |= 1 << (offset & flagMask)
637 }
638 }
639}
640
641func (s *flagSet) unset(offset FeatureID) {
642 bit := flags(1 << (offset & flagMask))
643 s[offset>>flagBitsLog2] = s[offset>>flagBitsLog2] & ^bit
644}
645
646// or with another flagset.
647func (s *flagSet) or(other flagSet) {
648 for i, v := range other[:] {
649 s[i] |= v
650 }
651}
652
653// hasSet returns whether all features are present.
654func (s *flagSet) hasSet(other flagSet) bool {
655 for i, v := range other[:] {
656 if s[i]&v != v {
657 return false
658 }
659 }
660 return true
661}
662
663// hasSet returns whether all features are present.
664func (s *flagSet) hasSetP(other *flagSet) bool {
665 for i, v := range other[:] {
666 if s[i]&v != v {
667 return false
668 }
669 }
670 return true
671}
672
673// hasOneOf returns whether one or more features are present.
674func (s *flagSet) hasOneOf(other *flagSet) bool {
675 for i, v := range other[:] {
676 if s[i]&v != 0 {
677 return true
678 }
679 }
680 return false
681}
682
683// nEnabled will return the number of enabled flags.
684func (s *flagSet) nEnabled() (n int) {
685 for _, v := range s[:] {
686 n += bits.OnesCount64(uint64(v))
687 }
688 return n
689}
690
691func flagSetWith(feat ...FeatureID) flagSet {
692 var res flagSet
693 for _, f := range feat {
694 res.set(f)
695 }
696 return res
697}
698
699// ParseFeature will parse the string and return the ID of the matching feature.
700// Will return UNKNOWN if not found.
701func ParseFeature(s string) FeatureID {
702 s = strings.ToUpper(s)
703 for i := firstID; i < lastID; i++ {
704 if i.String() == s {
705 return i
706 }
707 }
708 return UNKNOWN
709}
710
711// Strings returns an array of the detected features for FlagsSet.
712func (s flagSet) Strings() []string {
713 if len(s) == 0 {
714 return []string{""}
715 }
716 r := make([]string, 0)
717 for i := firstID; i < lastID; i++ {
718 if s.inSet(i) {
719 r = append(r, i.String())
720 }
721 }
722 return r
723}
724
725func maxExtendedFunction() uint32 {
726 eax, _, _, _ := cpuid(0x80000000)
727 return eax
728}
729
730func maxFunctionID() uint32 {
731 a, _, _, _ := cpuid(0)
732 return a
733}
734
735func brandName() string {
736 if maxExtendedFunction() >= 0x80000004 {
737 v := make([]uint32, 0, 48)
738 for i := uint32(0); i < 3; i++ {
739 a, b, c, d := cpuid(0x80000002 + i)
740 v = append(v, a, b, c, d)
741 }
742 return strings.Trim(string(valAsString(v...)), " ")
743 }
744 return "unknown"
745}
746
747func threadsPerCore() int {
748 mfi := maxFunctionID()
749 vend, _ := vendorID()
750
751 if mfi < 0x4 || (vend != Intel && vend != AMD) {
752 return 1
753 }
754
755 if mfi < 0xb {
756 if vend != Intel {
757 return 1
758 }
759 _, b, _, d := cpuid(1)
760 if (d & (1 << 28)) != 0 {
761 // v will contain logical core count
762 v := (b >> 16) & 255
763 if v > 1 {
764 a4, _, _, _ := cpuid(4)
765 // physical cores
766 v2 := (a4 >> 26) + 1
767 if v2 > 0 {
768 return int(v) / int(v2)
769 }
770 }
771 }
772 return 1
773 }
774 _, b, _, _ := cpuidex(0xb, 0)
775 if b&0xffff == 0 {
776 if vend == AMD {
777 // Workaround for AMD returning 0, assume 2 if >= Zen 2
778 // It will be more correct than not.
779 fam, _, _ := familyModel()
780 _, _, _, d := cpuid(1)
781 if (d&(1<<28)) != 0 && fam >= 23 {
782 return 2
783 }
784 }
785 return 1
786 }
787 return int(b & 0xffff)
788}
789
790func logicalCores() int {
791 mfi := maxFunctionID()
792 v, _ := vendorID()
793 switch v {
794 case Intel:
795 // Use this on old Intel processors
796 if mfi < 0xb {
797 if mfi < 1 {
798 return 0
799 }
800 // CPUID.1:EBX[23:16] represents the maximum number of addressable IDs (initial APIC ID)
801 // that can be assigned to logical processors in a physical package.
802 // The value may not be the same as the number of logical processors that are present in the hardware of a physical package.
803 _, ebx, _, _ := cpuid(1)
804 logical := (ebx >> 16) & 0xff
805 return int(logical)
806 }
807 _, b, _, _ := cpuidex(0xb, 1)
808 return int(b & 0xffff)
809 case AMD, Hygon:
810 _, b, _, _ := cpuid(1)
811 return int((b >> 16) & 0xff)
812 default:
813 return 0
814 }
815}
816
817func familyModel() (family, model, stepping int) {
818 if maxFunctionID() < 0x1 {
819 return 0, 0, 0
820 }
821 eax, _, _, _ := cpuid(1)
822 // If BaseFamily[3:0] is less than Fh then ExtendedFamily[7:0] is reserved and Family is equal to BaseFamily[3:0].
823 family = int((eax >> 8) & 0xf)
824 extFam := family == 0x6 // Intel is 0x6, needs extended model.
825 if family == 0xf {
826 // Add ExtFamily
827 family += int((eax >> 20) & 0xff)
828 extFam = true
829 }
830 // If BaseFamily[3:0] is less than 0Fh then ExtendedModel[3:0] is reserved and Model is equal to BaseModel[3:0].
831 model = int((eax >> 4) & 0xf)
832 if extFam {
833 // Add ExtModel
834 model += int((eax >> 12) & 0xf0)
835 }
836 stepping = int(eax & 0xf)
837 return family, model, stepping
838}
839
840func physicalCores() int {
841 v, _ := vendorID()
842 switch v {
843 case Intel:
844 return logicalCores() / threadsPerCore()
845 case AMD, Hygon:
846 lc := logicalCores()
847 tpc := threadsPerCore()
848 if lc > 0 && tpc > 0 {
849 return lc / tpc
850 }
851
852 // The following is inaccurate on AMD EPYC 7742 64-Core Processor
853 if maxExtendedFunction() >= 0x80000008 {
854 _, _, c, _ := cpuid(0x80000008)
855 if c&0xff > 0 {
856 return int(c&0xff) + 1
857 }
858 }
859 }
860 return 0
861}
862
863// Excerpt from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
864var vendorMapping = map[string]Vendor{
865 "AMDisbetter!": AMD,
866 "AuthenticAMD": AMD,
867 "CentaurHauls": VIA,
868 "GenuineIntel": Intel,
869 "TransmetaCPU": Transmeta,
870 "GenuineTMx86": Transmeta,
871 "Geode by NSC": NSC,
872 "VIA VIA VIA ": VIA,
873 "KVMKVMKVMKVM": KVM,
874 "Microsoft Hv": MSVM,
875 "VMwareVMware": VMware,
876 "XenVMMXenVMM": XenHVM,
877 "bhyve bhyve ": Bhyve,
878 "HygonGenuine": Hygon,
879 "Vortex86 SoC": SiS,
880 "SiS SiS SiS ": SiS,
881 "RiseRiseRise": SiS,
882 "Genuine RDC": RDC,
883}
884
885func vendorID() (Vendor, string) {
886 _, b, c, d := cpuid(0)
887 v := string(valAsString(b, d, c))
888 vend, ok := vendorMapping[v]
889 if !ok {
890 return VendorUnknown, v
891 }
892 return vend, v
893}
894
895func cacheLine() int {
896 if maxFunctionID() < 0x1 {
897 return 0
898 }
899
900 _, ebx, _, _ := cpuid(1)
901 cache := (ebx & 0xff00) >> 5 // cflush size
902 if cache == 0 && maxExtendedFunction() >= 0x80000006 {
903 _, _, ecx, _ := cpuid(0x80000006)
904 cache = ecx & 0xff // cacheline size
905 }
906 // TODO: Read from Cache and TLB Information
907 return int(cache)
908}
909
910func (c *CPUInfo) cacheSize() {
911 c.Cache.L1D = -1
912 c.Cache.L1I = -1
913 c.Cache.L2 = -1
914 c.Cache.L3 = -1
915 vendor, _ := vendorID()
916 switch vendor {
917 case Intel:
918 if maxFunctionID() < 4 {
919 return
920 }
921 c.Cache.L1I, c.Cache.L1D, c.Cache.L2, c.Cache.L3 = 0, 0, 0, 0
922 for i := uint32(0); ; i++ {
923 eax, ebx, ecx, _ := cpuidex(4, i)
924 cacheType := eax & 15
925 if cacheType == 0 {
926 break
927 }
928 cacheLevel := (eax >> 5) & 7
929 coherency := int(ebx&0xfff) + 1
930 partitions := int((ebx>>12)&0x3ff) + 1
931 associativity := int((ebx>>22)&0x3ff) + 1
932 sets := int(ecx) + 1
933 size := associativity * partitions * coherency * sets
934 switch cacheLevel {
935 case 1:
936 if cacheType == 1 {
937 // 1 = Data Cache
938 c.Cache.L1D = size
939 } else if cacheType == 2 {
940 // 2 = Instruction Cache
941 c.Cache.L1I = size
942 } else {
943 if c.Cache.L1D < 0 {
944 c.Cache.L1I = size
945 }
946 if c.Cache.L1I < 0 {
947 c.Cache.L1I = size
948 }
949 }
950 case 2:
951 c.Cache.L2 = size
952 case 3:
953 c.Cache.L3 = size
954 }
955 }
956 case AMD, Hygon:
957 // Untested.
958 if maxExtendedFunction() < 0x80000005 {
959 return
960 }
961 _, _, ecx, edx := cpuid(0x80000005)
962 c.Cache.L1D = int(((ecx >> 24) & 0xFF) * 1024)
963 c.Cache.L1I = int(((edx >> 24) & 0xFF) * 1024)
964
965 if maxExtendedFunction() < 0x80000006 {
966 return
967 }
968 _, _, ecx, _ = cpuid(0x80000006)
969 c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
970
971 // CPUID Fn8000_001D_EAX_x[N:0] Cache Properties
972 if maxExtendedFunction() < 0x8000001D || !c.Has(TOPEXT) {
973 return
974 }
975
976 // Xen Hypervisor is buggy and returns the same entry no matter ECX value.
977 // Hack: When we encounter the same entry 100 times we break.
978 nSame := 0
979 var last uint32
980 for i := uint32(0); i < math.MaxUint32; i++ {
981 eax, ebx, ecx, _ := cpuidex(0x8000001D, i)
982
983 level := (eax >> 5) & 7
984 cacheNumSets := ecx + 1
985 cacheLineSize := 1 + (ebx & 2047)
986 cachePhysPartitions := 1 + ((ebx >> 12) & 511)
987 cacheNumWays := 1 + ((ebx >> 22) & 511)
988
989 typ := eax & 15
990 size := int(cacheNumSets * cacheLineSize * cachePhysPartitions * cacheNumWays)
991 if typ == 0 {
992 return
993 }
994
995 // Check for the same value repeated.
996 comb := eax ^ ebx ^ ecx
997 if comb == last {
998 nSame++
999 if nSame == 100 {
1000 return
1001 }
1002 }
1003 last = comb
1004
1005 switch level {
1006 case 1:
1007 switch typ {
1008 case 1:
1009 // Data cache
1010 c.Cache.L1D = size
1011 case 2:
1012 // Inst cache
1013 c.Cache.L1I = size
1014 default:
1015 if c.Cache.L1D < 0 {
1016 c.Cache.L1I = size
1017 }
1018 if c.Cache.L1I < 0 {
1019 c.Cache.L1I = size
1020 }
1021 }
1022 case 2:
1023 c.Cache.L2 = size
1024 case 3:
1025 c.Cache.L3 = size
1026 }
1027 }
1028 }
1029}
1030
type (
	// SGXEPCSection describes a single EPC section as reported by a
	// sub-leaf of CPUID leaf 0x12.
	SGXEPCSection struct {
		BaseAddress uint64
		EPCSize     uint64
	}

	// SGXSupport collects the SGX properties detected by hasSGX.
	SGXSupport struct {
		Available           bool
		LaunchControl       bool
		SGX1Supported       bool
		SGX2Supported       bool
		MaxEnclaveSizeNot64 int64
		MaxEnclaveSize64    int64
		EPCSections         []SGXEPCSection
	}
)
1045
1046func hasSGX(available, lc bool) (rval SGXSupport) {
1047 rval.Available = available
1048
1049 if !available {
1050 return
1051 }
1052
1053 rval.LaunchControl = lc
1054
1055 a, _, _, d := cpuidex(0x12, 0)
1056 rval.SGX1Supported = a&0x01 != 0
1057 rval.SGX2Supported = a&0x02 != 0
1058 rval.MaxEnclaveSizeNot64 = 1 << (d & 0xFF) // pow 2
1059 rval.MaxEnclaveSize64 = 1 << ((d >> 8) & 0xFF) // pow 2
1060 rval.EPCSections = make([]SGXEPCSection, 0)
1061
1062 for subleaf := uint32(2); subleaf < 2+8; subleaf++ {
1063 eax, ebx, ecx, edx := cpuidex(0x12, subleaf)
1064 leafType := eax & 0xf
1065
1066 if leafType == 0 {
1067 // Invalid subleaf, stop iterating
1068 break
1069 } else if leafType == 1 {
1070 // EPC Section subleaf
1071 baseAddress := uint64(eax&0xfffff000) + (uint64(ebx&0x000fffff) << 32)
1072 size := uint64(ecx&0xfffff000) + (uint64(edx&0x000fffff) << 32)
1073
1074 section := SGXEPCSection{BaseAddress: baseAddress, EPCSize: size}
1075 rval.EPCSections = append(rval.EPCSections, section)
1076 }
1077 }
1078
1079 return
1080}
1081
1082func support() flagSet {
1083 var fs flagSet
1084 mfi := maxFunctionID()
1085 vend, _ := vendorID()
1086 if mfi < 0x1 {
1087 return fs
1088 }
1089 family, model, _ := familyModel()
1090
1091 _, _, c, d := cpuid(1)
1092 fs.setIf((d&(1<<0)) != 0, X87)
1093 fs.setIf((d&(1<<8)) != 0, CMPXCHG8)
1094 fs.setIf((d&(1<<11)) != 0, SYSEE)
1095 fs.setIf((d&(1<<15)) != 0, CMOV)
1096 fs.setIf((d&(1<<23)) != 0, MMX)
1097 fs.setIf((d&(1<<24)) != 0, FXSR)
1098 fs.setIf((d&(1<<25)) != 0, FXSROPT)
1099 fs.setIf((d&(1<<25)) != 0, SSE)
1100 fs.setIf((d&(1<<26)) != 0, SSE2)
1101 fs.setIf((c&1) != 0, SSE3)
1102 fs.setIf((c&(1<<5)) != 0, VMX)
1103 fs.setIf((c&(1<<9)) != 0, SSSE3)
1104 fs.setIf((c&(1<<19)) != 0, SSE4)
1105 fs.setIf((c&(1<<20)) != 0, SSE42)
1106 fs.setIf((c&(1<<25)) != 0, AESNI)
1107 fs.setIf((c&(1<<1)) != 0, CLMUL)
1108 fs.setIf(c&(1<<22) != 0, MOVBE)
1109 fs.setIf(c&(1<<23) != 0, POPCNT)
1110 fs.setIf(c&(1<<30) != 0, RDRAND)
1111
1112 // This bit has been reserved by Intel & AMD for use by hypervisors,
1113 // and indicates the presence of a hypervisor.
1114 fs.setIf(c&(1<<31) != 0, HYPERVISOR)
1115 fs.setIf(c&(1<<29) != 0, F16C)
1116 fs.setIf(c&(1<<13) != 0, CX16)
1117
1118 if vend == Intel && (d&(1<<28)) != 0 && mfi >= 4 {
1119 fs.setIf(threadsPerCore() > 1, HTT)
1120 }
1121 if vend == AMD && (d&(1<<28)) != 0 && mfi >= 4 {
1122 fs.setIf(threadsPerCore() > 1, HTT)
1123 }
1124 fs.setIf(c&1<<26 != 0, XSAVE)
1125 fs.setIf(c&1<<27 != 0, OSXSAVE)
1126 // Check XGETBV/XSAVE (26), OXSAVE (27) and AVX (28) bits
1127 const avxCheck = 1<<26 | 1<<27 | 1<<28
1128 if c&avxCheck == avxCheck {
1129 // Check for OS support
1130 eax, _ := xgetbv(0)
1131 if (eax & 0x6) == 0x6 {
1132 fs.set(AVX)
1133 switch vend {
1134 case Intel:
1135 // Older than Haswell.
1136 fs.setIf(family == 6 && model < 60, AVXSLOW)
1137 case AMD:
1138 // Older than Zen 2
1139 fs.setIf(family < 23 || (family == 23 && model < 49), AVXSLOW)
1140 }
1141 }
1142 }
1143 // FMA3 can be used with SSE registers, so no OS support is strictly needed.
1144 // fma3 and OSXSAVE needed.
1145 const fma3Check = 1<<12 | 1<<27
1146 fs.setIf(c&fma3Check == fma3Check, FMA3)
1147
1148 // Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
1149 if mfi >= 7 {
1150 _, ebx, ecx, edx := cpuidex(7, 0)
1151 if fs.inSet(AVX) && (ebx&0x00000020) != 0 {
1152 fs.set(AVX2)
1153 }
1154 // CPUID.(EAX=7, ECX=0).EBX
1155 if (ebx & 0x00000008) != 0 {
1156 fs.set(BMI1)
1157 fs.setIf((ebx&0x00000100) != 0, BMI2)
1158 }
1159 fs.setIf(ebx&(1<<2) != 0, SGX)
1160 fs.setIf(ebx&(1<<4) != 0, HLE)
1161 fs.setIf(ebx&(1<<9) != 0, ERMS)
1162 fs.setIf(ebx&(1<<11) != 0, RTM)
1163 fs.setIf(ebx&(1<<14) != 0, MPX)
1164 fs.setIf(ebx&(1<<18) != 0, RDSEED)
1165 fs.setIf(ebx&(1<<19) != 0, ADX)
1166 fs.setIf(ebx&(1<<29) != 0, SHA)
1167
1168 // CPUID.(EAX=7, ECX=0).ECX
1169 fs.setIf(ecx&(1<<5) != 0, WAITPKG)
1170 fs.setIf(ecx&(1<<7) != 0, CETSS)
1171 fs.setIf(ecx&(1<<8) != 0, GFNI)
1172 fs.setIf(ecx&(1<<9) != 0, VAES)
1173 fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ)
1174 fs.setIf(ecx&(1<<13) != 0, TME)
1175 fs.setIf(ecx&(1<<25) != 0, CLDEMOTE)
1176 fs.setIf(ecx&(1<<23) != 0, KEYLOCKER)
1177 fs.setIf(ecx&(1<<27) != 0, MOVDIRI)
1178 fs.setIf(ecx&(1<<28) != 0, MOVDIR64B)
1179 fs.setIf(ecx&(1<<29) != 0, ENQCMD)
1180 fs.setIf(ecx&(1<<30) != 0, SGXLC)
1181
1182 // CPUID.(EAX=7, ECX=0).EDX
1183 fs.setIf(edx&(1<<4) != 0, FSRM)
1184 fs.setIf(edx&(1<<9) != 0, SRBDS_CTRL)
1185 fs.setIf(edx&(1<<10) != 0, MD_CLEAR)
1186 fs.setIf(edx&(1<<11) != 0, RTM_ALWAYS_ABORT)
1187 fs.setIf(edx&(1<<14) != 0, SERIALIZE)
1188 fs.setIf(edx&(1<<15) != 0, HYBRID_CPU)
1189 fs.setIf(edx&(1<<16) != 0, TSXLDTRK)
1190 fs.setIf(edx&(1<<18) != 0, PCONFIG)
1191 fs.setIf(edx&(1<<20) != 0, CETIBT)
1192 fs.setIf(edx&(1<<26) != 0, IBPB)
1193 fs.setIf(edx&(1<<27) != 0, STIBP)
1194 fs.setIf(edx&(1<<28) != 0, FLUSH_L1D)
1195 fs.setIf(edx&(1<<29) != 0, IA32_ARCH_CAP)
1196 fs.setIf(edx&(1<<30) != 0, IA32_CORE_CAP)
1197 fs.setIf(edx&(1<<31) != 0, SPEC_CTRL_SSBD)
1198
1199 // CPUID.(EAX=7, ECX=1).EAX
1200 eax1, _, _, edx1 := cpuidex(7, 1)
1201 fs.setIf(fs.inSet(AVX) && eax1&(1<<4) != 0, AVXVNNI)
1202 fs.setIf(eax1&(1<<7) != 0, CMPCCXADD)
1203 fs.setIf(eax1&(1<<10) != 0, MOVSB_ZL)
1204 fs.setIf(eax1&(1<<11) != 0, STOSB_SHORT)
1205 fs.setIf(eax1&(1<<12) != 0, CMPSB_SCADBS_SHORT)
1206 fs.setIf(eax1&(1<<22) != 0, HRESET)
1207 fs.setIf(eax1&(1<<23) != 0, AVXIFMA)
1208 fs.setIf(eax1&(1<<26) != 0, LAM)
1209
1210 // CPUID.(EAX=7, ECX=1).EDX
1211 fs.setIf(edx1&(1<<4) != 0, AVXVNNIINT8)
1212 fs.setIf(edx1&(1<<5) != 0, AVXNECONVERT)
1213 fs.setIf(edx1&(1<<14) != 0, PREFETCHI)
1214 fs.setIf(edx1&(1<<19) != 0, AVX10)
1215 fs.setIf(edx1&(1<<21) != 0, APX_F)
1216
1217 // Only detect AVX-512 features if XGETBV is supported
1218 if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) {
1219 // Check for OS support
1220 eax, _ := xgetbv(0)
1221
1222 // Verify that XCR0[7:5] = ‘111b’ (OPMASK state, upper 256-bit of ZMM0-ZMM15 and
1223 // ZMM16-ZMM31 state are enabled by OS)
1224 /// and that XCR0[2:1] = ‘11b’ (XMM state and YMM state are enabled by OS).
1225 hasAVX512 := (eax>>5)&7 == 7 && (eax>>1)&3 == 3
1226 if runtime.GOOS == "darwin" {
1227 hasAVX512 = fs.inSet(AVX) && darwinHasAVX512()
1228 }
1229 if hasAVX512 {
1230 fs.setIf(ebx&(1<<16) != 0, AVX512F)
1231 fs.setIf(ebx&(1<<17) != 0, AVX512DQ)
1232 fs.setIf(ebx&(1<<21) != 0, AVX512IFMA)
1233 fs.setIf(ebx&(1<<26) != 0, AVX512PF)
1234 fs.setIf(ebx&(1<<27) != 0, AVX512ER)
1235 fs.setIf(ebx&(1<<28) != 0, AVX512CD)
1236 fs.setIf(ebx&(1<<30) != 0, AVX512BW)
1237 fs.setIf(ebx&(1<<31) != 0, AVX512VL)
1238 // ecx
1239 fs.setIf(ecx&(1<<1) != 0, AVX512VBMI)
1240 fs.setIf(ecx&(1<<6) != 0, AVX512VBMI2)
1241 fs.setIf(ecx&(1<<11) != 0, AVX512VNNI)
1242 fs.setIf(ecx&(1<<12) != 0, AVX512BITALG)
1243 fs.setIf(ecx&(1<<14) != 0, AVX512VPOPCNTDQ)
1244 // edx
1245 fs.setIf(edx&(1<<8) != 0, AVX512VP2INTERSECT)
1246 fs.setIf(edx&(1<<22) != 0, AMXBF16)
1247 fs.setIf(edx&(1<<23) != 0, AVX512FP16)
1248 fs.setIf(edx&(1<<24) != 0, AMXTILE)
1249 fs.setIf(edx&(1<<25) != 0, AMXINT8)
1250 // eax1 = CPUID.(EAX=7, ECX=1).EAX
1251 fs.setIf(eax1&(1<<5) != 0, AVX512BF16)
1252 fs.setIf(eax1&(1<<19) != 0, WRMSRNS)
1253 fs.setIf(eax1&(1<<21) != 0, AMXFP16)
1254 fs.setIf(eax1&(1<<27) != 0, MSRLIST)
1255 }
1256 }
1257
1258 // CPUID.(EAX=7, ECX=2)
1259 _, _, _, edx = cpuidex(7, 2)
1260 fs.setIf(edx&(1<<0) != 0, PSFD)
1261 fs.setIf(edx&(1<<1) != 0, IDPRED_CTRL)
1262 fs.setIf(edx&(1<<2) != 0, RRSBA_CTRL)
1263 fs.setIf(edx&(1<<4) != 0, BHI_CTRL)
1264 fs.setIf(edx&(1<<5) != 0, MCDT_NO)
1265
1266 // Add keylocker features.
1267 if fs.inSet(KEYLOCKER) && mfi >= 0x19 {
1268 _, ebx, _, _ := cpuidex(0x19, 0)
1269 fs.setIf(ebx&5 == 5, KEYLOCKERW) // Bit 0 and 2 (1+4)
1270 }
1271
1272 // Add AVX10 features.
1273 if fs.inSet(AVX10) && mfi >= 0x24 {
1274 _, ebx, _, _ := cpuidex(0x24, 0)
1275 fs.setIf(ebx&(1<<16) != 0, AVX10_128)
1276 fs.setIf(ebx&(1<<17) != 0, AVX10_256)
1277 fs.setIf(ebx&(1<<18) != 0, AVX10_512)
1278 }
1279 }
1280
1281 // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX = 1)
1282 // EAX
1283 // Bit 00: XSAVEOPT is available.
1284 // Bit 01: Supports XSAVEC and the compacted form of XRSTOR if set.
1285 // Bit 02: Supports XGETBV with ECX = 1 if set.
1286 // Bit 03: Supports XSAVES/XRSTORS and IA32_XSS if set.
1287 // Bits 31 - 04: Reserved.
1288 // EBX
1289 // Bits 31 - 00: The size in bytes of the XSAVE area containing all states enabled by XCRO | IA32_XSS.
1290 // ECX
1291 // Bits 31 - 00: Reports the supported bits of the lower 32 bits of the IA32_XSS MSR. IA32_XSS[n] can be set to 1 only if ECX[n] is 1.
1292 // EDX?
1293 // Bits 07 - 00: Used for XCR0. Bit 08: PT state. Bit 09: Used for XCR0. Bits 12 - 10: Reserved. Bit 13: HWP state. Bits 31 - 14: Reserved.
1294 if mfi >= 0xd {
1295 if fs.inSet(XSAVE) {
1296 eax, _, _, _ := cpuidex(0xd, 1)
1297 fs.setIf(eax&(1<<0) != 0, XSAVEOPT)
1298 fs.setIf(eax&(1<<1) != 0, XSAVEC)
1299 fs.setIf(eax&(1<<2) != 0, XGETBV1)
1300 fs.setIf(eax&(1<<3) != 0, XSAVES)
1301 }
1302 }
1303 if maxExtendedFunction() >= 0x80000001 {
1304 _, _, c, d := cpuid(0x80000001)
1305 if (c & (1 << 5)) != 0 {
1306 fs.set(LZCNT)
1307 fs.set(POPCNT)
1308 }
1309 // ECX
1310 fs.setIf((c&(1<<0)) != 0, LAHF)
1311 fs.setIf((c&(1<<2)) != 0, SVM)
1312 fs.setIf((c&(1<<6)) != 0, SSE4A)
1313 fs.setIf((c&(1<<10)) != 0, IBS)
1314 fs.setIf((c&(1<<22)) != 0, TOPEXT)
1315
1316 // EDX
1317 fs.setIf(d&(1<<11) != 0, SYSCALL)
1318 fs.setIf(d&(1<<20) != 0, NX)
1319 fs.setIf(d&(1<<22) != 0, MMXEXT)
1320 fs.setIf(d&(1<<23) != 0, MMX)
1321 fs.setIf(d&(1<<24) != 0, FXSR)
1322 fs.setIf(d&(1<<25) != 0, FXSROPT)
1323 fs.setIf(d&(1<<27) != 0, RDTSCP)
1324 fs.setIf(d&(1<<30) != 0, AMD3DNOWEXT)
1325 fs.setIf(d&(1<<31) != 0, AMD3DNOW)
1326
1327 /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
1328 * used unless the OS has AVX support. */
1329 if fs.inSet(AVX) {
1330 fs.setIf((c&(1<<11)) != 0, XOP)
1331 fs.setIf((c&(1<<16)) != 0, FMA4)
1332 }
1333
1334 }
1335 if maxExtendedFunction() >= 0x80000007 {
1336 _, b, _, d := cpuid(0x80000007)
1337 fs.setIf((b&(1<<0)) != 0, MCAOVERFLOW)
1338 fs.setIf((b&(1<<1)) != 0, SUCCOR)
1339 fs.setIf((b&(1<<2)) != 0, HWA)
1340 fs.setIf((d&(1<<9)) != 0, CPBOOST)
1341 }
1342
1343 if maxExtendedFunction() >= 0x80000008 {
1344 _, b, _, _ := cpuid(0x80000008)
1345 fs.setIf(b&(1<<28) != 0, PSFD)
1346 fs.setIf(b&(1<<27) != 0, CPPC)
1347 fs.setIf(b&(1<<24) != 0, SPEC_CTRL_SSBD)
1348 fs.setIf(b&(1<<23) != 0, PPIN)
1349 fs.setIf(b&(1<<21) != 0, TLB_FLUSH_NESTED)
1350 fs.setIf(b&(1<<20) != 0, EFER_LMSLE_UNS)
1351 fs.setIf(b&(1<<19) != 0, IBRS_PROVIDES_SMP)
1352 fs.setIf(b&(1<<18) != 0, IBRS_PREFERRED)
1353 fs.setIf(b&(1<<17) != 0, STIBP_ALWAYSON)
1354 fs.setIf(b&(1<<15) != 0, STIBP)
1355 fs.setIf(b&(1<<14) != 0, IBRS)
1356 fs.setIf((b&(1<<13)) != 0, INT_WBINVD)
1357 fs.setIf(b&(1<<12) != 0, IBPB)
1358 fs.setIf((b&(1<<9)) != 0, WBNOINVD)
1359 fs.setIf((b&(1<<8)) != 0, MCOMMIT)
1360 fs.setIf((b&(1<<4)) != 0, RDPRU)
1361 fs.setIf((b&(1<<3)) != 0, INVLPGB)
1362 fs.setIf((b&(1<<1)) != 0, MSRIRC)
1363 fs.setIf((b&(1<<0)) != 0, CLZERO)
1364 }
1365
1366 if fs.inSet(SVM) && maxExtendedFunction() >= 0x8000000A {
1367 _, _, _, edx := cpuid(0x8000000A)
1368 fs.setIf((edx>>0)&1 == 1, SVMNP)
1369 fs.setIf((edx>>1)&1 == 1, LBRVIRT)
1370 fs.setIf((edx>>2)&1 == 1, SVML)
1371 fs.setIf((edx>>3)&1 == 1, NRIPS)
1372 fs.setIf((edx>>4)&1 == 1, TSCRATEMSR)
1373 fs.setIf((edx>>5)&1 == 1, VMCBCLEAN)
1374 fs.setIf((edx>>6)&1 == 1, SVMFBASID)
1375 fs.setIf((edx>>7)&1 == 1, SVMDA)
1376 fs.setIf((edx>>10)&1 == 1, SVMPF)
1377 fs.setIf((edx>>12)&1 == 1, SVMPFT)
1378 }
1379
1380 if maxExtendedFunction() >= 0x8000001a {
1381 eax, _, _, _ := cpuid(0x8000001a)
1382 fs.setIf((eax>>0)&1 == 1, FP128)
1383 fs.setIf((eax>>1)&1 == 1, MOVU)
1384 fs.setIf((eax>>2)&1 == 1, FP256)
1385 }
1386
1387 if maxExtendedFunction() >= 0x8000001b && fs.inSet(IBS) {
1388 eax, _, _, _ := cpuid(0x8000001b)
1389 fs.setIf((eax>>0)&1 == 1, IBSFFV)
1390 fs.setIf((eax>>1)&1 == 1, IBSFETCHSAM)
1391 fs.setIf((eax>>2)&1 == 1, IBSOPSAM)
1392 fs.setIf((eax>>3)&1 == 1, IBSRDWROPCNT)
1393 fs.setIf((eax>>4)&1 == 1, IBSOPCNT)
1394 fs.setIf((eax>>5)&1 == 1, IBSBRNTRGT)
1395 fs.setIf((eax>>6)&1 == 1, IBSOPCNTEXT)
1396 fs.setIf((eax>>7)&1 == 1, IBSRIPINVALIDCHK)
1397 fs.setIf((eax>>8)&1 == 1, IBS_OPFUSE)
1398 fs.setIf((eax>>9)&1 == 1, IBS_FETCH_CTLX)
1399 fs.setIf((eax>>10)&1 == 1, IBS_OPDATA4) // Doc says "Fixed,0. IBS op data 4 MSR supported", but assuming they mean 1.
1400 fs.setIf((eax>>11)&1 == 1, IBS_ZEN4)
1401 }
1402
1403 if maxExtendedFunction() >= 0x8000001f && vend == AMD {
1404 a, _, _, _ := cpuid(0x8000001f)
1405 fs.setIf((a>>0)&1 == 1, SME)
1406 fs.setIf((a>>1)&1 == 1, SEV)
1407 fs.setIf((a>>2)&1 == 1, MSR_PAGEFLUSH)
1408 fs.setIf((a>>3)&1 == 1, SEV_ES)
1409 fs.setIf((a>>4)&1 == 1, SEV_SNP)
1410 fs.setIf((a>>5)&1 == 1, VMPL)
1411 fs.setIf((a>>10)&1 == 1, SME_COHERENT)
1412 fs.setIf((a>>11)&1 == 1, SEV_64BIT)
1413 fs.setIf((a>>12)&1 == 1, SEV_RESTRICTED)
1414 fs.setIf((a>>13)&1 == 1, SEV_ALTERNATIVE)
1415 fs.setIf((a>>14)&1 == 1, SEV_DEBUGSWAP)
1416 fs.setIf((a>>15)&1 == 1, IBS_PREVENTHOST)
1417 fs.setIf((a>>16)&1 == 1, VTE)
1418 fs.setIf((a>>24)&1 == 1, VMSA_REGPROT)
1419 }
1420
1421 if mfi >= 0x20 {
1422 // Microsoft has decided to purposefully hide the information
1423 // of the guest TEE when VMs are being created using Hyper-V.
1424 //
1425 // This leads us to check for the Hyper-V cpuid features
1426 // (0x4000000C), and then for the `ebx` value set.
1427 //
1428 // For Intel TDX, `ebx` is set as `0xbe3`, being 3 the part
1429 // we're mostly interested about,according to:
1430 // https://github.com/torvalds/linux/blob/d2f51b3516dade79269ff45eae2a7668ae711b25/arch/x86/include/asm/hyperv-tlfs.h#L169-L174
1431 _, ebx, _, _ := cpuid(0x4000000C)
1432 fs.setIf(ebx == 0xbe3, TDX_GUEST)
1433 }
1434
1435 if mfi >= 0x21 {
1436 // Intel Trusted Domain Extensions Guests have their own cpuid leaf (0x21).
1437 _, ebx, ecx, edx := cpuid(0x21)
1438 identity := string(valAsString(ebx, edx, ecx))
1439 fs.setIf(identity == "IntelTDX ", TDX_GUEST)
1440 }
1441
1442 return fs
1443}
1444
1445func (c *CPUInfo) supportAVX10() uint8 {
1446 if c.maxFunc >= 0x24 && c.featureSet.inSet(AVX10) {
1447 _, ebx, _, _ := cpuidex(0x24, 0)
1448 return uint8(ebx)
1449 }
1450 return 0
1451}
1452
// valAsString unpacks each value into four bytes, least significant byte
// first, and concatenates them. The result is cut off just before the first
// zero byte encountered, if there is one.
func valAsString(values ...uint32) []byte {
	out := make([]byte, 0, 4*len(values))
	for _, word := range values {
		for shift := uint(0); shift < 32; shift += 8 {
			b := byte(word >> shift)
			if b == 0 {
				// NUL terminates the string.
				return out
			}
			out = append(out, b)
		}
	}
	return out
}
diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s
new file mode 100644
index 0000000..8587c3a
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_386.s
@@ -0,0 +1,47 @@
1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3//+build 386,!gccgo,!noasm,!appengine
4
5// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
// Runs CPUID with AX = op and CX cleared, then copies AX, BX, CX and DX into
// the four results.
6TEXT ·asmCpuid(SB), 7, $0
7 XORL CX, CX // CX is zeroed before CPUID; asmCpuidex loads op2 into it instead
8 MOVL op+0(FP), AX
9 CPUID
10 MOVL AX, eax+4(FP)
11 MOVL BX, ebx+8(FP)
12 MOVL CX, ecx+12(FP)
13 MOVL DX, edx+16(FP)
14 RET
15
16// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
// Runs CPUID with AX = op and CX = op2, then copies AX, BX, CX and DX into
// the four results.
17TEXT ·asmCpuidex(SB), 7, $0
18 MOVL op+0(FP), AX
19 MOVL op2+4(FP), CX
20 CPUID
21 MOVL AX, eax+8(FP)
22 MOVL BX, ebx+12(FP)
23 MOVL CX, ecx+16(FP)
24 MOVL DX, edx+20(FP)
25 RET
26
27// func asmXgetbv(index uint32) (eax, edx uint32)
// Runs XGETBV with CX = index and returns AX and DX.
28TEXT ·asmXgetbv(SB), 7, $0
29 MOVL index+0(FP), CX
30 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
31 MOVL AX, eax+4(FP)
32 MOVL DX, edx+8(FP)
33 RET
34
35// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
// Runs RDTSCP and returns the contents of AX, BX, CX and DX afterwards.
// NOTE(review): BX is never loaded in this routine, so ebx is whatever the
// register held on entry - confirm callers ignore it.
36TEXT ·asmRdtscpAsm(SB), 7, $0
37 BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
38 MOVL AX, eax+0(FP)
39 MOVL BX, ebx+4(FP)
40 MOVL CX, ecx+8(FP)
41 MOVL DX, edx+12(FP)
42 RET
43
44// func asmDarwinHasAVX512() bool
45TEXT ·asmDarwinHasAVX512(SB), 7, $0
46 MOVL $0, eax+0(FP)
47 RET
diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s
new file mode 100644
index 0000000..bc11f89
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_amd64.s
@@ -0,0 +1,72 @@
1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3//+build amd64,!gccgo,!noasm,!appengine
4
5// func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)
// Runs CPUID with AX = op and CX cleared, then copies AX, BX, CX and DX into
// the four results.
6TEXT ·asmCpuid(SB), 7, $0
7 XORQ CX, CX // CX is zeroed before CPUID; asmCpuidex loads op2 into it instead
8 MOVL op+0(FP), AX
9 CPUID
10 MOVL AX, eax+8(FP)
11 MOVL BX, ebx+12(FP)
12 MOVL CX, ecx+16(FP)
13 MOVL DX, edx+20(FP)
14 RET
15
16// func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
// Runs CPUID with AX = op and CX = op2, then copies AX, BX, CX and DX into
// the four results.
17TEXT ·asmCpuidex(SB), 7, $0
18 MOVL op+0(FP), AX
19 MOVL op2+4(FP), CX
20 CPUID
21 MOVL AX, eax+8(FP)
22 MOVL BX, ebx+12(FP)
23 MOVL CX, ecx+16(FP)
24 MOVL DX, edx+20(FP)
25 RET
26
27// func asmXgetbv(index uint32) (eax, edx uint32)
// Runs XGETBV with CX = index and returns AX and DX.
28TEXT ·asmXgetbv(SB), 7, $0
29 MOVL index+0(FP), CX
30 BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
31 MOVL AX, eax+8(FP)
32 MOVL DX, edx+12(FP)
33 RET
34
35// func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)
// Runs RDTSCP and returns the contents of AX, BX, CX and DX afterwards.
// NOTE(review): BX is never loaded in this routine, so ebx is whatever the
// register held on entry - confirm callers ignore it.
36TEXT ·asmRdtscpAsm(SB), 7, $0
37 BYTE $0x0F; BYTE $0x01; BYTE $0xF9 // RDTSCP
38 MOVL AX, eax+0(FP)
39 MOVL BX, ebx+4(FP)
40 MOVL CX, ecx+8(FP)
41 MOVL DX, edx+12(FP)
42 RET
43
44// From https://go-review.googlesource.com/c/sys/+/285572/
45// func asmDarwinHasAVX512() bool
// Returns false unless assembled with both GOOS_darwin and GOARCH_amd64
// defined. In that case it reads the commpage version and capability words at
// the fixed addresses defined below and returns true only when the version is
// at least 13 and the hasAVX512F capability bit is set.
46TEXT ·asmDarwinHasAVX512(SB), 7, $0-1
47 MOVB $0, ret+0(FP) // default to false
48
49#ifdef GOOS_darwin // return if not darwin
50#ifdef GOARCH_amd64 // return if not amd64
51// These values from:
52// https://github.com/apple/darwin-xnu/blob/xnu-4570.1.46/osfmk/i386/cpu_capabilities.h
53#define commpage64_base_address 0x00007fffffe00000
54#define commpage64_cpu_capabilities64 (commpage64_base_address+0x010)
55#define commpage64_version (commpage64_base_address+0x01E)
56#define hasAVX512F 0x0000004000000000
57 MOVQ $commpage64_version, BX
58 MOVW (BX), AX
59 CMPW AX, $13 // versions < 13 do not support AVX512
60 JL no_avx512
61 MOVQ $commpage64_cpu_capabilities64, BX
62 MOVQ (BX), AX
63 MOVQ $hasAVX512F, CX
64 ANDQ CX, AX // leaves AX non-zero only if the hasAVX512F bit is set
65 JZ no_avx512
66 MOVB $1, ret+0(FP)
67
68no_avx512:
69#endif
70#endif
71 RET
72
diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s b/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s
new file mode 100644
index 0000000..b31d6ae
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/cpuid_arm64.s
@@ -0,0 +1,26 @@
1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3//+build arm64,!gccgo,!noasm,!appengine
4
5// See https://www.kernel.org/doc/Documentation/arm64/cpu-feature-registers.txt
6
7// func getMidr() (midr uint64)
// Reads MIDR_EL1 into R0 and returns it.
8TEXT ·getMidr(SB), 7, $0
9 WORD $0xd5380000 // mrs x0, midr_el1 /* Main ID Register */
10 MOVD R0, midr+0(FP)
11 RET
12
13// func getProcFeatures() (procFeatures uint64)
// Reads ID_AA64PFR0_EL1 into R0 and returns it.
14TEXT ·getProcFeatures(SB), 7, $0
15 WORD $0xd5380400 // mrs x0, id_aa64pfr0_el1 /* Processor Feature Register 0 */
16 MOVD R0, procFeatures+0(FP)
17 RET
18
19// func getInstAttributes() (instAttrReg0, instAttrReg1 uint64)
// Reads ID_AA64ISAR0_EL1 into R0 and ID_AA64ISAR1_EL1 into R1 and returns
// them in that order.
20TEXT ·getInstAttributes(SB), 7, $0
21 WORD $0xd5380600 // mrs x0, id_aa64isar0_el1 /* Instruction Set Attribute Register 0 */
22 WORD $0xd5380621 // mrs x1, id_aa64isar1_el1 /* Instruction Set Attribute Register 1 */
23 MOVD R0, instAttrReg0+0(FP)
24 MOVD R1, instAttrReg1+8(FP)
25 RET
26
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go
new file mode 100644
index 0000000..9a53504
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go
@@ -0,0 +1,247 @@
1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3//go:build arm64 && !gccgo && !noasm && !appengine
4// +build arm64,!gccgo,!noasm,!appengine
5
6package cpuid
7
8import "runtime"
9
// The three functions below have no Go body; they are implemented in
// cpuid_arm64.s.

// getMidr returns the value of MIDR_EL1 (Main ID Register).
10func getMidr() (midr uint64)
// getProcFeatures returns the value of ID_AA64PFR0_EL1 (Processor Feature Register 0).
11func getProcFeatures() (procFeatures uint64)
// getInstAttributes returns the values of ID_AA64ISAR0_EL1 and ID_AA64ISAR1_EL1
// (Instruction Set Attribute Registers 0 and 1).
12func getInstAttributes() (instAttrReg0, instAttrReg1 uint64)
13
14func initCPU() {
15 cpuid = func(uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
16 cpuidex = func(x, y uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
17 xgetbv = func(uint32) (a, b uint32) { return 0, 0 }
18 rdtscpAsm = func() (a, b, c, d uint32) { return 0, 0, 0, 0 }
19}
20
21func addInfo(c *CPUInfo, safe bool) {
22 // Seems to be safe to assume on ARM64
23 c.CacheLine = 64
24 detectOS(c)
25
26 // ARM64 disabled since it may crash if interrupt is not intercepted by OS.
27 if safe && !c.Supports(ARMCPUID) && runtime.GOOS != "freebsd" {
28 return
29 }
30 midr := getMidr()
31
32 // MIDR_EL1 - Main ID Register
33 // https://developer.arm.com/docs/ddi0595/h/aarch64-system-registers/midr_el1
34 // x--------------------------------------------------x
35 // | Name | bits | visible |
36 // |--------------------------------------------------|
37 // | Implementer | [31-24] | y |
38 // |--------------------------------------------------|
39 // | Variant | [23-20] | y |
40 // |--------------------------------------------------|
41 // | Architecture | [19-16] | y |
42 // |--------------------------------------------------|
43 // | PartNum | [15-4] | y |
44 // |--------------------------------------------------|
45 // | Revision | [3-0] | y |
46 // x--------------------------------------------------x
47
48 switch (midr >> 24) & 0xff {
49 case 0xC0:
50 c.VendorString = "Ampere Computing"
51 c.VendorID = Ampere
52 case 0x41:
53 c.VendorString = "Arm Limited"
54 c.VendorID = ARM
55 case 0x42:
56 c.VendorString = "Broadcom Corporation"
57 c.VendorID = Broadcom
58 case 0x43:
59 c.VendorString = "Cavium Inc"
60 c.VendorID = Cavium
61 case 0x44:
62 c.VendorString = "Digital Equipment Corporation"
63 c.VendorID = DEC
64 case 0x46:
65 c.VendorString = "Fujitsu Ltd"
66 c.VendorID = Fujitsu
67 case 0x49:
68 c.VendorString = "Infineon Technologies AG"
69 c.VendorID = Infineon
70 case 0x4D:
71 c.VendorString = "Motorola or Freescale Semiconductor Inc"
72 c.VendorID = Motorola
73 case 0x4E:
74 c.VendorString = "NVIDIA Corporation"
75 c.VendorID = NVIDIA
76 case 0x50:
77 c.VendorString = "Applied Micro Circuits Corporation"
78 c.VendorID = AMCC
79 case 0x51:
80 c.VendorString = "Qualcomm Inc"
81 c.VendorID = Qualcomm
82 case 0x56:
83 c.VendorString = "Marvell International Ltd"
84 c.VendorID = Marvell
85 case 0x69:
86 c.VendorString = "Intel Corporation"
87 c.VendorID = Intel
88 }
89
90 // Lower 4 bits: Architecture
91 // Architecture Meaning
92 // 0b0001 Armv4.
93 // 0b0010 Armv4T.
94 // 0b0011 Armv5 (obsolete).
95 // 0b0100 Armv5T.
96 // 0b0101 Armv5TE.
97 // 0b0110 Armv5TEJ.
98 // 0b0111 Armv6.
99 // 0b1111 Architectural features are individually identified in the ID_* registers, see 'ID registers'.
100 // Upper 4 bit: Variant
101 // An IMPLEMENTATION DEFINED variant number.
102 // Typically, this field is used to distinguish between different product variants, or major revisions of a product.
103 c.Family = int(midr>>16) & 0xff
104
105 // PartNum, bits [15:4]
106 // An IMPLEMENTATION DEFINED primary part number for the device.
107 // On processors implemented by Arm, if the top four bits of the primary
108 // part number are 0x0 or 0x7, the variant and architecture are encoded differently.
109 // Revision, bits [3:0]
110 // An IMPLEMENTATION DEFINED revision number for the device.
111 c.Model = int(midr) & 0xffff
112
113 procFeatures := getProcFeatures()
114
115 // ID_AA64PFR0_EL1 - Processor Feature Register 0
116 // x--------------------------------------------------x
117 // | Name | bits | visible |
118 // |--------------------------------------------------|
119 // | DIT | [51-48] | y |
120 // |--------------------------------------------------|
121 // | SVE | [35-32] | y |
122 // |--------------------------------------------------|
123 // | GIC | [27-24] | n |
124 // |--------------------------------------------------|
125 // | AdvSIMD | [23-20] | y |
126 // |--------------------------------------------------|
127 // | FP | [19-16] | y |
128 // |--------------------------------------------------|
129 // | EL3 | [15-12] | n |
130 // |--------------------------------------------------|
131 // | EL2 | [11-8] | n |
132 // |--------------------------------------------------|
133 // | EL1 | [7-4] | n |
134 // |--------------------------------------------------|
135 // | EL0 | [3-0] | n |
136 // x--------------------------------------------------x
137
138 var f flagSet
139 // if procFeatures&(0xf<<48) != 0 {
140 // fmt.Println("DIT")
141 // }
142 f.setIf(procFeatures&(0xf<<32) != 0, SVE)
143 if procFeatures&(0xf<<20) != 15<<20 {
144 f.set(ASIMD)
145 // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64pfr0_el1
146 // 0b0001 --> As for 0b0000, and also includes support for half-precision floating-point arithmetic.
147 f.setIf(procFeatures&(0xf<<20) == 1<<20, FPHP, ASIMDHP)
148 }
149 f.setIf(procFeatures&(0xf<<16) != 0, FP)
150
151 instAttrReg0, instAttrReg1 := getInstAttributes()
152
153 // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1
154 //
155 // ID_AA64ISAR0_EL1 - Instruction Set Attribute Register 0
156 // x--------------------------------------------------x
157 // | Name | bits | visible |
158 // |--------------------------------------------------|
159 // | TS | [55-52] | y |
160 // |--------------------------------------------------|
161 // | FHM | [51-48] | y |
162 // |--------------------------------------------------|
163 // | DP | [47-44] | y |
164 // |--------------------------------------------------|
165 // | SM4 | [43-40] | y |
166 // |--------------------------------------------------|
167 // | SM3 | [39-36] | y |
168 // |--------------------------------------------------|
169 // | SHA3 | [35-32] | y |
170 // |--------------------------------------------------|
171 // | RDM | [31-28] | y |
172 // |--------------------------------------------------|
173 // | ATOMICS | [23-20] | y |
174 // |--------------------------------------------------|
175 // | CRC32 | [19-16] | y |
176 // |--------------------------------------------------|
177 // | SHA2 | [15-12] | y |
178 // |--------------------------------------------------|
179 // | SHA1 | [11-8] | y |
180 // |--------------------------------------------------|
181 // | AES | [7-4] | y |
182 // x--------------------------------------------------x
183
184 // if instAttrReg0&(0xf<<52) != 0 {
185 // fmt.Println("TS")
186 // }
187 // if instAttrReg0&(0xf<<48) != 0 {
188 // fmt.Println("FHM")
189 // }
190 f.setIf(instAttrReg0&(0xf<<44) != 0, ASIMDDP)
191 f.setIf(instAttrReg0&(0xf<<40) != 0, SM4)
192 f.setIf(instAttrReg0&(0xf<<36) != 0, SM3)
193 f.setIf(instAttrReg0&(0xf<<32) != 0, SHA3)
194 f.setIf(instAttrReg0&(0xf<<28) != 0, ASIMDRDM)
195 f.setIf(instAttrReg0&(0xf<<20) != 0, ATOMICS)
196 f.setIf(instAttrReg0&(0xf<<16) != 0, CRC32)
197 f.setIf(instAttrReg0&(0xf<<12) != 0, SHA2)
198 // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1
199 // 0b0010 --> As 0b0001, plus SHA512H, SHA512H2, SHA512SU0, and SHA512SU1 instructions implemented.
200 f.setIf(instAttrReg0&(0xf<<12) == 2<<12, SHA512)
201 f.setIf(instAttrReg0&(0xf<<8) != 0, SHA1)
202 f.setIf(instAttrReg0&(0xf<<4) != 0, AESARM)
203 // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar0_el1
204 // 0b0010 --> As for 0b0001, plus PMULL/PMULL2 instructions operating on 64-bit data quantities.
205 f.setIf(instAttrReg0&(0xf<<4) == 2<<4, PMULL)
206
207 // https://developer.arm.com/docs/ddi0595/b/aarch64-system-registers/id_aa64isar1_el1
208 //
209 // ID_AA64ISAR1_EL1 - Instruction set attribute register 1
210 // x--------------------------------------------------x
211 // | Name | bits | visible |
212 // |--------------------------------------------------|
213 // | GPI | [31-28] | y |
214 // |--------------------------------------------------|
215 // | GPA | [27-24] | y |
216 // |--------------------------------------------------|
217 // | LRCPC | [23-20] | y |
218 // |--------------------------------------------------|
219 // | FCMA | [19-16] | y |
220 // |--------------------------------------------------|
221 // | JSCVT | [15-12] | y |
222 // |--------------------------------------------------|
223 // | API | [11-8] | y |
224 // |--------------------------------------------------|
225 // | APA | [7-4] | y |
226 // |--------------------------------------------------|
227 // | DPB | [3-0] | y |
228 // x--------------------------------------------------x
229
230 // if instAttrReg1&(0xf<<28) != 0 {
231 // fmt.Println("GPI")
232 // }
233 f.setIf(instAttrReg1&(0xf<<28) != 24, GPA)
234 f.setIf(instAttrReg1&(0xf<<20) != 0, LRCPC)
235 f.setIf(instAttrReg1&(0xf<<16) != 0, FCMA)
236 f.setIf(instAttrReg1&(0xf<<12) != 0, JSCVT)
237 // if instAttrReg1&(0xf<<8) != 0 {
238 // fmt.Println("API")
239 // }
240 // if instAttrReg1&(0xf<<4) != 0 {
241 // fmt.Println("APA")
242 // }
243 f.setIf(instAttrReg1&(0xf<<0) != 0, DCPOP)
244
245 // Store
246 c.featureSet.or(f)
247}
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_ref.go b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go
new file mode 100644
index 0000000..9636c2b
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go
@@ -0,0 +1,15 @@
1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3//go:build (!amd64 && !386 && !arm64) || gccgo || noasm || appengine
4// +build !amd64,!386,!arm64 gccgo noasm appengine
5
6package cpuid
7
8func initCPU() {
9 cpuid = func(uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
10 cpuidex = func(x, y uint32) (a, b, c, d uint32) { return 0, 0, 0, 0 }
11 xgetbv = func(uint32) (a, b uint32) { return 0, 0 }
12 rdtscpAsm = func() (a, b, c, d uint32) { return 0, 0, 0, 0 }
13}
14
// addInfo is a no-op in this build configuration: the body is empty, so
// neither info nor safe is used and info is left exactly as it was passed in.
func addInfo(info *CPUInfo, safe bool) {}
diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
new file mode 100644
index 0000000..c7dfa12
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go
@@ -0,0 +1,37 @@
1// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
2
3//go:build (386 && !gccgo && !noasm && !appengine) || (amd64 && !gccgo && !noasm && !appengine)
4// +build 386,!gccgo,!noasm,!appengine amd64,!gccgo,!noasm,!appengine
5
6package cpuid
7
// The declarations below have no Go bodies, so their implementations must be
// supplied outside this file (presumably by the package's assembly sources —
// confirm against the accompanying .s files).

// asmCpuid takes one 32-bit operand and returns four 32-bit results named
// after the eax, ebx, ecx and edx registers.
func asmCpuid(op uint32) (eax, ebx, ecx, edx uint32)

// asmCpuidex is the two-operand variant of asmCpuid.
func asmCpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)

// asmXgetbv takes an index and returns two 32-bit results named eax and edx.
func asmXgetbv(index uint32) (eax, edx uint32)

// asmRdtscpAsm takes no arguments and returns four 32-bit results.
func asmRdtscpAsm() (eax, ebx, ecx, edx uint32)

// asmDarwinHasAVX512 reports a boolean; judging by the name it answers
// whether AVX-512 is usable on Darwin (NOTE(review): confirm in the
// implementation).
func asmDarwinHasAVX512() bool
13
14func initCPU() {
15 cpuid = asmCpuid
16 cpuidex = asmCpuidex
17 xgetbv = asmXgetbv
18 rdtscpAsm = asmRdtscpAsm
19 darwinHasAVX512 = asmDarwinHasAVX512
20}
21
// addInfo fills the fields of c by calling the package's detection helpers
// one after another. The safe parameter is not referenced in this body.
//
// NOTE(review): the helpers are defined elsewhere; some of them may depend on
// fields assigned earlier in this sequence, so keep the statement order as is
// unless that has been checked.
func addInfo(c *CPUInfo, safe bool) {
	c.maxFunc = maxFunctionID()
	c.maxExFunc = maxExtendedFunction()
	c.BrandName = brandName()
	c.CacheLine = cacheLine()
	c.Family, c.Model, c.Stepping = familyModel()
	c.featureSet = support()
	// hasSGX receives whether SGX and SGXLC are present in the feature set
	// that was just assigned above.
	c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC))
	c.ThreadsPerCore = threadsPerCore()
	c.LogicalCores = logicalCores()
	c.PhysicalCores = physicalCores()
	c.VendorID, c.VendorString = vendorID()
	c.AVX10Level = c.supportAVX10()
	// The two calls below are methods on c and return nothing here; they
	// presumably populate cache and frequency fields in place.
	c.cacheSize()
	c.frequencies()
}
diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
new file mode 100644
index 0000000..43bd05f
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go
@@ -0,0 +1,279 @@
1// Code generated by "stringer -type=FeatureID,Vendor"; DO NOT EDIT.
2
3package cpuid
4
5import "strconv"
6
// _ is a compile-time guard for the FeatureID constants. The array x has
// length 1, so every constant index expression below is only valid when it
// evaluates to 0, i.e. when the named constant still equals the number
// subtracted from it.
func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[ADX-1]
	_ = x[AESNI-2]
	_ = x[AMD3DNOW-3]
	_ = x[AMD3DNOWEXT-4]
	_ = x[AMXBF16-5]
	_ = x[AMXFP16-6]
	_ = x[AMXINT8-7]
	_ = x[AMXTILE-8]
	_ = x[APX_F-9]
	_ = x[AVX-10]
	_ = x[AVX10-11]
	_ = x[AVX10_128-12]
	_ = x[AVX10_256-13]
	_ = x[AVX10_512-14]
	_ = x[AVX2-15]
	_ = x[AVX512BF16-16]
	_ = x[AVX512BITALG-17]
	_ = x[AVX512BW-18]
	_ = x[AVX512CD-19]
	_ = x[AVX512DQ-20]
	_ = x[AVX512ER-21]
	_ = x[AVX512F-22]
	_ = x[AVX512FP16-23]
	_ = x[AVX512IFMA-24]
	_ = x[AVX512PF-25]
	_ = x[AVX512VBMI-26]
	_ = x[AVX512VBMI2-27]
	_ = x[AVX512VL-28]
	_ = x[AVX512VNNI-29]
	_ = x[AVX512VP2INTERSECT-30]
	_ = x[AVX512VPOPCNTDQ-31]
	_ = x[AVXIFMA-32]
	_ = x[AVXNECONVERT-33]
	_ = x[AVXSLOW-34]
	_ = x[AVXVNNI-35]
	_ = x[AVXVNNIINT8-36]
	_ = x[BHI_CTRL-37]
	_ = x[BMI1-38]
	_ = x[BMI2-39]
	_ = x[CETIBT-40]
	_ = x[CETSS-41]
	_ = x[CLDEMOTE-42]
	_ = x[CLMUL-43]
	_ = x[CLZERO-44]
	_ = x[CMOV-45]
	_ = x[CMPCCXADD-46]
	_ = x[CMPSB_SCADBS_SHORT-47]
	_ = x[CMPXCHG8-48]
	_ = x[CPBOOST-49]
	_ = x[CPPC-50]
	_ = x[CX16-51]
	_ = x[EFER_LMSLE_UNS-52]
	_ = x[ENQCMD-53]
	_ = x[ERMS-54]
	_ = x[F16C-55]
	_ = x[FLUSH_L1D-56]
	_ = x[FMA3-57]
	_ = x[FMA4-58]
	_ = x[FP128-59]
	_ = x[FP256-60]
	_ = x[FSRM-61]
	_ = x[FXSR-62]
	_ = x[FXSROPT-63]
	_ = x[GFNI-64]
	_ = x[HLE-65]
	_ = x[HRESET-66]
	_ = x[HTT-67]
	_ = x[HWA-68]
	_ = x[HYBRID_CPU-69]
	_ = x[HYPERVISOR-70]
	_ = x[IA32_ARCH_CAP-71]
	_ = x[IA32_CORE_CAP-72]
	_ = x[IBPB-73]
	_ = x[IBRS-74]
	_ = x[IBRS_PREFERRED-75]
	_ = x[IBRS_PROVIDES_SMP-76]
	_ = x[IBS-77]
	_ = x[IBSBRNTRGT-78]
	_ = x[IBSFETCHSAM-79]
	_ = x[IBSFFV-80]
	_ = x[IBSOPCNT-81]
	_ = x[IBSOPCNTEXT-82]
	_ = x[IBSOPSAM-83]
	_ = x[IBSRDWROPCNT-84]
	_ = x[IBSRIPINVALIDCHK-85]
	_ = x[IBS_FETCH_CTLX-86]
	_ = x[IBS_OPDATA4-87]
	_ = x[IBS_OPFUSE-88]
	_ = x[IBS_PREVENTHOST-89]
	_ = x[IBS_ZEN4-90]
	_ = x[IDPRED_CTRL-91]
	_ = x[INT_WBINVD-92]
	_ = x[INVLPGB-93]
	_ = x[KEYLOCKER-94]
	_ = x[KEYLOCKERW-95]
	_ = x[LAHF-96]
	_ = x[LAM-97]
	_ = x[LBRVIRT-98]
	_ = x[LZCNT-99]
	_ = x[MCAOVERFLOW-100]
	_ = x[MCDT_NO-101]
	_ = x[MCOMMIT-102]
	_ = x[MD_CLEAR-103]
	_ = x[MMX-104]
	_ = x[MMXEXT-105]
	_ = x[MOVBE-106]
	_ = x[MOVDIR64B-107]
	_ = x[MOVDIRI-108]
	_ = x[MOVSB_ZL-109]
	_ = x[MOVU-110]
	_ = x[MPX-111]
	_ = x[MSRIRC-112]
	_ = x[MSRLIST-113]
	_ = x[MSR_PAGEFLUSH-114]
	_ = x[NRIPS-115]
	_ = x[NX-116]
	_ = x[OSXSAVE-117]
	_ = x[PCONFIG-118]
	_ = x[POPCNT-119]
	_ = x[PPIN-120]
	_ = x[PREFETCHI-121]
	_ = x[PSFD-122]
	_ = x[RDPRU-123]
	_ = x[RDRAND-124]
	_ = x[RDSEED-125]
	_ = x[RDTSCP-126]
	_ = x[RRSBA_CTRL-127]
	_ = x[RTM-128]
	_ = x[RTM_ALWAYS_ABORT-129]
	_ = x[SERIALIZE-130]
	_ = x[SEV-131]
	_ = x[SEV_64BIT-132]
	_ = x[SEV_ALTERNATIVE-133]
	_ = x[SEV_DEBUGSWAP-134]
	_ = x[SEV_ES-135]
	_ = x[SEV_RESTRICTED-136]
	_ = x[SEV_SNP-137]
	_ = x[SGX-138]
	_ = x[SGXLC-139]
	_ = x[SHA-140]
	_ = x[SME-141]
	_ = x[SME_COHERENT-142]
	_ = x[SPEC_CTRL_SSBD-143]
	_ = x[SRBDS_CTRL-144]
	_ = x[SSE-145]
	_ = x[SSE2-146]
	_ = x[SSE3-147]
	_ = x[SSE4-148]
	_ = x[SSE42-149]
	_ = x[SSE4A-150]
	_ = x[SSSE3-151]
	_ = x[STIBP-152]
	_ = x[STIBP_ALWAYSON-153]
	_ = x[STOSB_SHORT-154]
	_ = x[SUCCOR-155]
	_ = x[SVM-156]
	_ = x[SVMDA-157]
	_ = x[SVMFBASID-158]
	_ = x[SVML-159]
	_ = x[SVMNP-160]
	_ = x[SVMPF-161]
	_ = x[SVMPFT-162]
	_ = x[SYSCALL-163]
	_ = x[SYSEE-164]
	_ = x[TBM-165]
	_ = x[TDX_GUEST-166]
	_ = x[TLB_FLUSH_NESTED-167]
	_ = x[TME-168]
	_ = x[TOPEXT-169]
	_ = x[TSCRATEMSR-170]
	_ = x[TSXLDTRK-171]
	_ = x[VAES-172]
	_ = x[VMCBCLEAN-173]
	_ = x[VMPL-174]
	_ = x[VMSA_REGPROT-175]
	_ = x[VMX-176]
	_ = x[VPCLMULQDQ-177]
	_ = x[VTE-178]
	_ = x[WAITPKG-179]
	_ = x[WBNOINVD-180]
	_ = x[WRMSRNS-181]
	_ = x[X87-182]
	_ = x[XGETBV1-183]
	_ = x[XOP-184]
	_ = x[XSAVE-185]
	_ = x[XSAVEC-186]
	_ = x[XSAVEOPT-187]
	_ = x[XSAVES-188]
	_ = x[AESARM-189]
	_ = x[ARMCPUID-190]
	_ = x[ASIMD-191]
	_ = x[ASIMDDP-192]
	_ = x[ASIMDHP-193]
	_ = x[ASIMDRDM-194]
	_ = x[ATOMICS-195]
	_ = x[CRC32-196]
	_ = x[DCPOP-197]
	_ = x[EVTSTRM-198]
	_ = x[FCMA-199]
	_ = x[FP-200]
	_ = x[FPHP-201]
	_ = x[GPA-202]
	_ = x[JSCVT-203]
	_ = x[LRCPC-204]
	_ = x[PMULL-205]
	_ = x[SHA1-206]
	_ = x[SHA2-207]
	_ = x[SHA3-208]
	_ = x[SHA512-209]
	_ = x[SM3-210]
	_ = x[SM4-211]
	_ = x[SVE-212]
	_ = x[lastID-213]
	_ = x[firstID-0]
}
226
// _FeatureID_name is every FeatureID name concatenated in value order, with no
// separators. FeatureID.String cuts individual names out of it using the
// offsets stored in _FeatureID_index.
const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXFP16AMXINT8AMXTILEAPX_FAVXAVX10AVX10_128AVX10_256AVX10_512AVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXIFMAAVXNECONVERTAVXSLOWAVXVNNIAVXVNNIINT8BHI_CTRLBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPCCXADDCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCPPCCX16EFER_LMSLE_UNSENQCMDERMSF16CFLUSH_L1DFMA3FMA4FP128FP256FSRMFXSRFXSROPTGFNIHLEHRESETHTTHWAHYBRID_CPUHYPERVISORIA32_ARCH_CAPIA32_CORE_CAPIBPBIBRSIBRS_PREFERREDIBRS_PROVIDES_SMPIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_FETCH_CTLXIBS_OPDATA4IBS_OPFUSEIBS_PREVENTHOSTIBS_ZEN4IDPRED_CTRLINT_WBINVDINVLPGBKEYLOCKERKEYLOCKERWLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCDT_NOMCOMMITMD_CLEARMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMOVUMPXMSRIRCMSRLISTMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTPPINPREFETCHIPSFDRDPRURDRANDRDSEEDRDTSCPRRSBA_CTRLRTMRTM_ALWAYS_ABORTSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSPEC_CTRL_SSBDSRBDS_CTRLSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTIBP_ALWAYSONSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTDX_GUESTTLB_FLUSH_NESTEDTMETOPEXTTSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDWRMSRNSX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID"
228
// _FeatureID_index holds byte offsets into _FeatureID_name: the name for value
// i spans [_FeatureID_index[i], _FeatureID_index[i+1]), so the table has one
// more entry than there are names.
var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 62, 67, 70, 75, 84, 93, 102, 106, 116, 128, 136, 144, 152, 160, 167, 177, 187, 195, 205, 216, 224, 234, 252, 267, 274, 286, 293, 300, 311, 319, 323, 327, 333, 338, 346, 351, 357, 361, 370, 388, 396, 403, 407, 411, 425, 431, 435, 439, 448, 452, 456, 461, 466, 470, 474, 481, 485, 488, 494, 497, 500, 510, 520, 533, 546, 550, 554, 568, 585, 588, 598, 609, 615, 623, 634, 642, 654, 670, 684, 695, 705, 720, 728, 739, 749, 756, 765, 775, 779, 782, 789, 794, 805, 812, 819, 827, 830, 836, 841, 850, 857, 865, 869, 872, 878, 885, 898, 903, 905, 912, 919, 925, 929, 938, 942, 947, 953, 959, 965, 975, 978, 994, 1003, 1006, 1015, 1030, 1043, 1049, 1063, 1070, 1073, 1078, 1081, 1084, 1096, 1110, 1120, 1123, 1127, 1131, 1135, 1140, 1145, 1150, 1155, 1169, 1180, 1186, 1189, 1194, 1203, 1207, 1212, 1217, 1223, 1230, 1235, 1238, 1247, 1263, 1266, 1272, 1282, 1290, 1294, 1303, 1307, 1319, 1322, 1332, 1335, 1342, 1350, 1357, 1360, 1367, 1370, 1375, 1381, 1389, 1395, 1401, 1409, 1414, 1421, 1428, 1436, 1443, 1448, 1453, 1460, 1464, 1466, 1470, 1473, 1478, 1483, 1488, 1492, 1496, 1500, 1506, 1509, 1512, 1515, 1521}
230
231func (i FeatureID) String() string {
232 if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) {
233 return "FeatureID(" + strconv.FormatInt(int64(i), 10) + ")"
234 }
235 return _FeatureID_name[_FeatureID_index[i]:_FeatureID_index[i+1]]
236}
// _ is a compile-time guard for the Vendor constants. The array x has length
// 1, so every constant index expression below is only valid when it evaluates
// to 0, i.e. when the named constant still equals the number subtracted from
// it.
func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[VendorUnknown-0]
	_ = x[Intel-1]
	_ = x[AMD-2]
	_ = x[VIA-3]
	_ = x[Transmeta-4]
	_ = x[NSC-5]
	_ = x[KVM-6]
	_ = x[MSVM-7]
	_ = x[VMware-8]
	_ = x[XenHVM-9]
	_ = x[Bhyve-10]
	_ = x[Hygon-11]
	_ = x[SiS-12]
	_ = x[RDC-13]
	_ = x[Ampere-14]
	_ = x[ARM-15]
	_ = x[Broadcom-16]
	_ = x[Cavium-17]
	_ = x[DEC-18]
	_ = x[Fujitsu-19]
	_ = x[Infineon-20]
	_ = x[Motorola-21]
	_ = x[NVIDIA-22]
	_ = x[AMCC-23]
	_ = x[Qualcomm-24]
	_ = x[Marvell-25]
	_ = x[lastVendor-26]
}
269
// _Vendor_name is every Vendor name concatenated in value order, with no
// separators. Vendor.String cuts individual names out of it using the offsets
// stored in _Vendor_index.
const _Vendor_name = "VendorUnknownIntelAMDVIATransmetaNSCKVMMSVMVMwareXenHVMBhyveHygonSiSRDCAmpereARMBroadcomCaviumDECFujitsuInfineonMotorolaNVIDIAAMCCQualcommMarvelllastVendor"
271
// _Vendor_index holds byte offsets into _Vendor_name: the name for value i
// spans [_Vendor_index[i], _Vendor_index[i+1]), so the table has one more
// entry than there are names.
var _Vendor_index = [...]uint8{0, 13, 18, 21, 24, 33, 36, 39, 43, 49, 55, 60, 65, 68, 71, 77, 80, 88, 94, 97, 104, 112, 120, 126, 130, 138, 145, 155}
273
274func (i Vendor) String() string {
275 if i < 0 || i >= Vendor(len(_Vendor_index)-1) {
276 return "Vendor(" + strconv.FormatInt(int64(i), 10) + ")"
277 }
278 return _Vendor_name[_Vendor_index[i]:_Vendor_index[i+1]]
279}
diff --git a/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go
new file mode 100644
index 0000000..84b1acd
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go
@@ -0,0 +1,121 @@
1// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file.
2
3package cpuid
4
5import (
6 "runtime"
7 "strings"
8
9 "golang.org/x/sys/unix"
10)
11
12func detectOS(c *CPUInfo) bool {
13 if runtime.GOOS != "ios" {
14 tryToFillCPUInfoFomSysctl(c)
15 }
16 // There are no hw.optional sysctl values for the below features on Mac OS 11.0
17 // to detect their supported state dynamically. Assume the CPU features that
18 // Apple Silicon M1 supports to be available as a minimal set of features
19 // to all Go programs running on darwin/arm64.
20 // TODO: Add more if we know them.
21 c.featureSet.setIf(runtime.GOOS != "ios", AESARM, PMULL, SHA1, SHA2)
22
23 return true
24}
25
26func sysctlGetBool(name string) bool {
27 value, err := unix.SysctlUint32(name)
28 if err != nil {
29 return false
30 }
31 return value != 0
32}
33
34func sysctlGetString(name string) string {
35 value, err := unix.Sysctl(name)
36 if err != nil {
37 return ""
38 }
39 return value
40}
41
42func sysctlGetInt(unknown int, names ...string) int {
43 for _, name := range names {
44 value, err := unix.SysctlUint32(name)
45 if err != nil {
46 continue
47 }
48 if value != 0 {
49 return int(value)
50 }
51 }
52 return unknown
53}
54
55func sysctlGetInt64(unknown int, names ...string) int {
56 for _, name := range names {
57 value64, err := unix.SysctlUint64(name)
58 if err != nil {
59 continue
60 }
61 if int(value64) != unknown {
62 return int(value64)
63 }
64 }
65 return unknown
66}
67
68func setFeature(c *CPUInfo, name string, feature FeatureID) {
69 c.featureSet.setIf(sysctlGetBool(name), feature)
70}
71func tryToFillCPUInfoFomSysctl(c *CPUInfo) {
72 c.BrandName = sysctlGetString("machdep.cpu.brand_string")
73
74 if len(c.BrandName) != 0 {
75 c.VendorString = strings.Fields(c.BrandName)[0]
76 }
77
78 c.PhysicalCores = sysctlGetInt(runtime.NumCPU(), "hw.physicalcpu")
79 c.ThreadsPerCore = sysctlGetInt(1, "machdep.cpu.thread_count", "kern.num_threads") /
80 sysctlGetInt(1, "hw.physicalcpu")
81 c.LogicalCores = sysctlGetInt(runtime.NumCPU(), "machdep.cpu.core_count")
82 c.Family = sysctlGetInt(0, "machdep.cpu.family", "hw.cpufamily")
83 c.Model = sysctlGetInt(0, "machdep.cpu.model")
84 c.CacheLine = sysctlGetInt64(0, "hw.cachelinesize")
85 c.Cache.L1I = sysctlGetInt64(-1, "hw.l1icachesize")
86 c.Cache.L1D = sysctlGetInt64(-1, "hw.l1dcachesize")
87 c.Cache.L2 = sysctlGetInt64(-1, "hw.l2cachesize")
88 c.Cache.L3 = sysctlGetInt64(-1, "hw.l3cachesize")
89
90 // from https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile
91 setFeature(c, "hw.optional.arm.FEAT_AES", AESARM)
92 setFeature(c, "hw.optional.AdvSIMD", ASIMD)
93 setFeature(c, "hw.optional.arm.FEAT_DotProd", ASIMDDP)
94 setFeature(c, "hw.optional.arm.FEAT_RDM", ASIMDRDM)
95 setFeature(c, "hw.optional.FEAT_CRC32", CRC32)
96 setFeature(c, "hw.optional.arm.FEAT_DPB", DCPOP)
97 // setFeature(c, "", EVTSTRM)
98 setFeature(c, "hw.optional.arm.FEAT_FCMA", FCMA)
99 setFeature(c, "hw.optional.arm.FEAT_FP", FP)
100 setFeature(c, "hw.optional.arm.FEAT_FP16", FPHP)
101 setFeature(c, "hw.optional.arm.FEAT_PAuth", GPA)
102 setFeature(c, "hw.optional.arm.FEAT_JSCVT", JSCVT)
103 setFeature(c, "hw.optional.arm.FEAT_LRCPC", LRCPC)
104 setFeature(c, "hw.optional.arm.FEAT_PMULL", PMULL)
105 setFeature(c, "hw.optional.arm.FEAT_SHA1", SHA1)
106 setFeature(c, "hw.optional.arm.FEAT_SHA256", SHA2)
107 setFeature(c, "hw.optional.arm.FEAT_SHA3", SHA3)
108 setFeature(c, "hw.optional.arm.FEAT_SHA512", SHA512)
109 // setFeature(c, "", SM3)
110 // setFeature(c, "", SM4)
111 setFeature(c, "hw.optional.arm.FEAT_SVE", SVE)
112
113 // from empirical observation
114 setFeature(c, "hw.optional.AdvSIMD_HPFPCvt", ASIMDHP)
115 setFeature(c, "hw.optional.armv8_1_atomics", ATOMICS)
116 setFeature(c, "hw.optional.floatingpoint", FP)
117 setFeature(c, "hw.optional.armv8_2_sha3", SHA3)
118 setFeature(c, "hw.optional.armv8_2_sha512", SHA512)
119 setFeature(c, "hw.optional.armv8_3_compnum", FCMA)
120 setFeature(c, "hw.optional.armv8_crc32", CRC32)
121}
diff --git a/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go
new file mode 100644
index 0000000..ee278b9
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/os_linux_arm64.go
@@ -0,0 +1,130 @@
1// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file.
2
3// Copyright 2018 The Go Authors. All rights reserved.
4// Use of this source code is governed by a BSD-style
5// license that can be found in the LICENSE file located
6// here https://github.com/golang/sys/blob/master/LICENSE
7
8package cpuid
9
10import (
11 "encoding/binary"
12 "io/ioutil"
13 "runtime"
14)
15
// HWCAP bits. Each constant is a single-bit mask; the bit position is the
// constant's place in this list, starting at bit 0 for hwcap_FP.
const (
	hwcap_FP = 1 << iota
	hwcap_ASIMD
	hwcap_EVTSTRM
	hwcap_AES
	hwcap_PMULL
	hwcap_SHA1
	hwcap_SHA2
	hwcap_CRC32
	hwcap_ATOMICS
	hwcap_FPHP
	hwcap_ASIMDHP
	hwcap_CPUID
	hwcap_ASIMDRDM
	hwcap_JSCVT
	hwcap_FCMA
	hwcap_LRCPC
	hwcap_DCPOP
	hwcap_SHA3
	hwcap_SM3
	hwcap_SM4
	hwcap_ASIMDDP
	hwcap_SHA512
	hwcap_SVE
	hwcap_ASIMDFHM
)
43
44func detectOS(c *CPUInfo) bool {
45 // For now assuming no hyperthreading is reasonable.
46 c.LogicalCores = runtime.NumCPU()
47 c.PhysicalCores = c.LogicalCores
48 c.ThreadsPerCore = 1
49 if hwcap == 0 {
50 // We did not get values from the runtime.
51 // Try reading /proc/self/auxv
52
53 // From https://github.com/golang/sys
54 const (
55 _AT_HWCAP = 16
56 _AT_HWCAP2 = 26
57
58 uintSize = int(32 << (^uint(0) >> 63))
59 )
60
61 buf, err := ioutil.ReadFile("/proc/self/auxv")
62 if err != nil {
63 // e.g. on android /proc/self/auxv is not accessible, so silently
64 // ignore the error and leave Initialized = false. On some
65 // architectures (e.g. arm64) doinit() implements a fallback
66 // readout and will set Initialized = true again.
67 return false
68 }
69 bo := binary.LittleEndian
70 for len(buf) >= 2*(uintSize/8) {
71 var tag, val uint
72 switch uintSize {
73 case 32:
74 tag = uint(bo.Uint32(buf[0:]))
75 val = uint(bo.Uint32(buf[4:]))
76 buf = buf[8:]
77 case 64:
78 tag = uint(bo.Uint64(buf[0:]))
79 val = uint(bo.Uint64(buf[8:]))
80 buf = buf[16:]
81 }
82 switch tag {
83 case _AT_HWCAP:
84 hwcap = val
85 case _AT_HWCAP2:
86 // Not used
87 }
88 }
89 if hwcap == 0 {
90 return false
91 }
92 }
93
94 // HWCap was populated by the runtime from the auxiliary vector.
95 // Use HWCap information since reading aarch64 system registers
96 // is not supported in user space on older linux kernels.
97 c.featureSet.setIf(isSet(hwcap, hwcap_AES), AESARM)
98 c.featureSet.setIf(isSet(hwcap, hwcap_ASIMD), ASIMD)
99 c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDDP), ASIMDDP)
100 c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDHP), ASIMDHP)
101 c.featureSet.setIf(isSet(hwcap, hwcap_ASIMDRDM), ASIMDRDM)
102 c.featureSet.setIf(isSet(hwcap, hwcap_CPUID), ARMCPUID)
103 c.featureSet.setIf(isSet(hwcap, hwcap_CRC32), CRC32)
104 c.featureSet.setIf(isSet(hwcap, hwcap_DCPOP), DCPOP)
105 c.featureSet.setIf(isSet(hwcap, hwcap_EVTSTRM), EVTSTRM)
106 c.featureSet.setIf(isSet(hwcap, hwcap_FCMA), FCMA)
107 c.featureSet.setIf(isSet(hwcap, hwcap_FP), FP)
108 c.featureSet.setIf(isSet(hwcap, hwcap_FPHP), FPHP)
109 c.featureSet.setIf(isSet(hwcap, hwcap_JSCVT), JSCVT)
110 c.featureSet.setIf(isSet(hwcap, hwcap_LRCPC), LRCPC)
111 c.featureSet.setIf(isSet(hwcap, hwcap_PMULL), PMULL)
112 c.featureSet.setIf(isSet(hwcap, hwcap_SHA1), SHA1)
113 c.featureSet.setIf(isSet(hwcap, hwcap_SHA2), SHA2)
114 c.featureSet.setIf(isSet(hwcap, hwcap_SHA3), SHA3)
115 c.featureSet.setIf(isSet(hwcap, hwcap_SHA512), SHA512)
116 c.featureSet.setIf(isSet(hwcap, hwcap_SM3), SM3)
117 c.featureSet.setIf(isSet(hwcap, hwcap_SM4), SM4)
118 c.featureSet.setIf(isSet(hwcap, hwcap_SVE), SVE)
119
120 // The Samsung S9+ kernel reports support for atomics, but not all cores
121 // actually support them, resulting in SIGILL. See issue #28431.
122 // TODO(elias.naur): Only disable the optimization on bad chipsets on android.
123 c.featureSet.setIf(isSet(hwcap, hwcap_ATOMICS) && runtime.GOOS != "android", ATOMICS)
124
125 return true
126}
127
// isSet reports whether hwc and value have at least one bit in common.
func isSet(hwc uint, value uint) bool {
	common := hwc & value
	return common > 0
}
diff --git a/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go
new file mode 100644
index 0000000..8733ba3
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go
@@ -0,0 +1,16 @@
1// Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file.
2
3//go:build arm64 && !linux && !darwin
4// +build arm64,!linux,!darwin
5
6package cpuid
7
8import "runtime"
9
10func detectOS(c *CPUInfo) bool {
11 c.PhysicalCores = runtime.NumCPU()
12 // For now assuming 1 thread per core...
13 c.ThreadsPerCore = 1
14 c.LogicalCores = c.PhysicalCores
15 return false
16}
diff --git a/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go
new file mode 100644
index 0000000..f8f201b
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go
@@ -0,0 +1,8 @@
1// Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file.
2
3//go:build nounsafe
4// +build nounsafe
5
6package cpuid
7
// hwcap is an ordinary package variable in builds with the nounsafe tag.
// Nothing in this file assigns it, so it starts out as zero.
var hwcap uint
diff --git a/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go
new file mode 100644
index 0000000..92af622
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go
@@ -0,0 +1,11 @@
1// Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file.
2
3//go:build !nounsafe
4// +build !nounsafe
5
6package cpuid
7
8import _ "unsafe" // needed for go:linkname
9
10//go:linkname hwcap internal/cpu.HWCap
11var hwcap uint
diff --git a/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh b/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh
new file mode 100644
index 0000000..471d986
--- /dev/null
+++ b/vendor/github.com/klauspost/cpuid/v2/test-architectures.sh
@@ -0,0 +1,15 @@
#!/bin/sh

# Build the package for every GOOS/GOARCH pair printed by `go tool dist list`,
# once without extra tags and once for each tag combination below. Output is
# discarded; only whether each build succeeds matters.

# Abort on the first failing command.
set -e

# Each listed line has the form os/arch; splitting on "/" yields both parts.
# -r keeps read from treating backslashes specially.
go tool dist list | while IFS=/ read -r os arch; do
	echo "Checking $os/$arch..."
	echo " normal"
	GOARCH="$arch" GOOS="$os" go build -o /dev/null .
	echo " noasm"
	GOARCH="$arch" GOOS="$os" go build -tags noasm -o /dev/null .
	echo " appengine"
	GOARCH="$arch" GOOS="$os" go build -tags appengine -o /dev/null .
	echo " noasm,appengine"
	GOARCH="$arch" GOOS="$os" go build -tags 'appengine noasm' -o /dev/null .
done