diff options
Diffstat (limited to 'vendor/golang.org/x/text/unicode/norm/iter.go')
-rw-r--r-- | vendor/golang.org/x/text/unicode/norm/iter.go | 458 |
1 files changed, 458 insertions, 0 deletions
diff --git a/vendor/golang.org/x/text/unicode/norm/iter.go b/vendor/golang.org/x/text/unicode/norm/iter.go new file mode 100644 index 0000000..417c6b2 --- /dev/null +++ b/vendor/golang.org/x/text/unicode/norm/iter.go | |||
@@ -0,0 +1,458 @@ | |||
1 | // Copyright 2011 The Go Authors. All rights reserved. | ||
2 | // Use of this source code is governed by a BSD-style | ||
3 | // license that can be found in the LICENSE file. | ||
4 | |||
5 | package norm | ||
6 | |||
7 | import ( | ||
8 | "fmt" | ||
9 | "unicode/utf8" | ||
10 | ) | ||
11 | |||
12 | // MaxSegmentSize is the maximum size of a byte buffer needed to consider any | ||
13 | // sequence of starter and non-starter runes for the purpose of normalization. | ||
// It is the exported name for maxByteBufferSize, which is also the size of the
// scratch buffer embedded in Iter.
14 | const MaxSegmentSize = maxByteBufferSize | ||
15 | |||
16 | // An Iter iterates over a string or byte slice, while normalizing it | ||
17 | // to a given Form. | ||
// An Iter is prepared with Init or InitString and then advanced with Next
// until Done reports true.
18 | type Iter struct { | ||
// rb is the reorder buffer. In this file it also supplies the input
// (rb.src), the input length (rb.nsrc), the form-specific functions (rb.f)
// and the stream-safe state (rb.ss).
19 | rb reorderBuffer | ||
// buf is scratch space for output; slices returned by Next may point into it.
20 | buf [maxByteBufferSize]byte | ||
21 | info Properties // first character saved from previous iteration | ||
22 | next iterFunc // implementation of next depends on form | ||
// asciiF is the single-byte fast path: nextASCIIBytes for []byte input,
// nextASCIIString for string input.
23 | asciiF iterFunc | ||
24 | |||
25 | p int // current position in input source | ||
26 | multiSeg []byte // remainder of multi-segment decomposition | ||
27 | } | ||
28 | |||
// iterFunc is one state of the iterator: it produces the next segment for i
// and may replace i.next to select the state used by the following call.
29 | type iterFunc func(*Iter) []byte | ||
30 | |||
31 | // Init initializes i to iterate over src after normalizing it to Form f. | ||
32 | func (i *Iter) Init(f Form, src []byte) { | ||
33 | i.p = 0 | ||
34 | if len(src) == 0 { | ||
35 | i.setDone() | ||
36 | i.rb.nsrc = 0 | ||
37 | return | ||
38 | } | ||
39 | i.multiSeg = nil | ||
40 | i.rb.init(f, src) | ||
41 | i.next = i.rb.f.nextMain | ||
42 | i.asciiF = nextASCIIBytes | ||
43 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
44 | i.rb.ss.first(i.info) | ||
45 | } | ||
46 | |||
47 | // InitString initializes i to iterate over src after normalizing it to Form f. | ||
48 | func (i *Iter) InitString(f Form, src string) { | ||
49 | i.p = 0 | ||
50 | if len(src) == 0 { | ||
51 | i.setDone() | ||
52 | i.rb.nsrc = 0 | ||
53 | return | ||
54 | } | ||
55 | i.multiSeg = nil | ||
56 | i.rb.initString(f, src) | ||
57 | i.next = i.rb.f.nextMain | ||
58 | i.asciiF = nextASCIIString | ||
59 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
60 | i.rb.ss.first(i.info) | ||
61 | } | ||
62 | |||
63 | // Seek sets the segment to be returned by the next call to Next to start | ||
64 | // at position p. It is the responsibility of the caller to set p to the | ||
65 | // start of a segment. | ||
66 | func (i *Iter) Seek(offset int64, whence int) (int64, error) { | ||
67 | var abs int64 | ||
68 | switch whence { | ||
69 | case 0: | ||
70 | abs = offset | ||
71 | case 1: | ||
72 | abs = int64(i.p) + offset | ||
73 | case 2: | ||
74 | abs = int64(i.rb.nsrc) + offset | ||
75 | default: | ||
76 | return 0, fmt.Errorf("norm: invalid whence") | ||
77 | } | ||
78 | if abs < 0 { | ||
79 | return 0, fmt.Errorf("norm: negative position") | ||
80 | } | ||
81 | if int(abs) >= i.rb.nsrc { | ||
82 | i.setDone() | ||
83 | return int64(i.p), nil | ||
84 | } | ||
85 | i.p = int(abs) | ||
86 | i.multiSeg = nil | ||
87 | i.next = i.rb.f.nextMain | ||
88 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
89 | i.rb.ss.first(i.info) | ||
90 | return abs, nil | ||
91 | } | ||
92 | |||
93 | // returnSlice returns a slice of the underlying input type as a byte slice. | ||
94 | // If the underlying is of type []byte, it will simply return a slice. | ||
95 | // If the underlying is of type string, it will copy the slice to the buffer | ||
96 | // and return that. | ||
97 | func (i *Iter) returnSlice(a, b int) []byte { | ||
98 | if i.rb.src.bytes == nil { | ||
99 | return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] | ||
100 | } | ||
101 | return i.rb.src.bytes[a:b] | ||
102 | } | ||
103 | |||
104 | // Pos returns the byte position at which the next call to Next will commence processing. | ||
// The position is an index into the input given to Init or InitString. Once
// Done reports true it is at least the length of that input.
105 | func (i *Iter) Pos() int { | ||
106 | return i.p | ||
107 | } | ||
108 | |||
109 | func (i *Iter) setDone() { | ||
110 | i.next = nextDone | ||
111 | i.p = i.rb.nsrc | ||
112 | } | ||
113 | |||
114 | // Done returns true if there is no more input to process. | ||
// It compares the current position with the recorded input length, so it is
// true as soon as the position has reached or passed the end of the input.
115 | func (i *Iter) Done() bool { | ||
116 | return i.p >= i.rb.nsrc | ||
117 | } | ||
118 | |||
119 | // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. | ||
120 | // For any input a and b for which f(a) == f(b), subsequent calls | ||
121 | // to Next will return the same segments. | ||
122 | // Modifying runes are grouped together with the preceding starter, if such a starter exists. | ||
123 | // Although not guaranteed, n will typically be the smallest possible n. | ||
//
// Next dispatches to the current state function stored in i.next. After the
// iterator has been marked done that function is nextDone, which returns nil.
// The returned slice may point into the iterator's internal buffer or, for
// []byte input, into the input itself; copy it if it must outlive the
// following call to Next.
124 | func (i *Iter) Next() []byte { | ||
125 | return i.next(i) | ||
126 | } | ||
127 | |||
128 | func nextASCIIBytes(i *Iter) []byte { | ||
129 | p := i.p + 1 | ||
130 | if p >= i.rb.nsrc { | ||
131 | p0 := i.p | ||
132 | i.setDone() | ||
133 | return i.rb.src.bytes[p0:p] | ||
134 | } | ||
135 | if i.rb.src.bytes[p] < utf8.RuneSelf { | ||
136 | p0 := i.p | ||
137 | i.p = p | ||
138 | return i.rb.src.bytes[p0:p] | ||
139 | } | ||
140 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
141 | i.next = i.rb.f.nextMain | ||
142 | return i.next(i) | ||
143 | } | ||
144 | |||
145 | func nextASCIIString(i *Iter) []byte { | ||
146 | p := i.p + 1 | ||
147 | if p >= i.rb.nsrc { | ||
148 | i.buf[0] = i.rb.src.str[i.p] | ||
149 | i.setDone() | ||
150 | return i.buf[:1] | ||
151 | } | ||
152 | if i.rb.src.str[p] < utf8.RuneSelf { | ||
153 | i.buf[0] = i.rb.src.str[i.p] | ||
154 | i.p = p | ||
155 | return i.buf[:1] | ||
156 | } | ||
157 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
158 | i.next = i.rb.f.nextMain | ||
159 | return i.next(i) | ||
160 | } | ||
161 | |||
162 | func nextHangul(i *Iter) []byte { | ||
163 | p := i.p | ||
164 | next := p + hangulUTF8Size | ||
165 | if next >= i.rb.nsrc { | ||
166 | i.setDone() | ||
167 | } else if i.rb.src.hangul(next) == 0 { | ||
168 | i.rb.ss.next(i.info) | ||
169 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
170 | i.next = i.rb.f.nextMain | ||
171 | return i.next(i) | ||
172 | } | ||
173 | i.p = next | ||
174 | return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] | ||
175 | } | ||
176 | |||
// nextDone is the state installed by setDone. It reports the end of the
// iteration by returning nil and leaves the iterator unchanged.
177 | func nextDone(i *Iter) []byte { | ||
178 | return nil | ||
179 | } | ||
180 | |||
181 | // nextMulti is used for iterating over multi-segment decompositions | ||
182 | // for decomposing normal forms. | ||
183 | func nextMulti(i *Iter) []byte { | ||
184 | j := 0 | ||
185 | d := i.multiSeg | ||
186 | // skip first rune | ||
187 | for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { | ||
188 | } | ||
189 | for j < len(d) { | ||
190 | info := i.rb.f.info(input{bytes: d}, j) | ||
191 | if info.BoundaryBefore() { | ||
192 | i.multiSeg = d[j:] | ||
193 | return d[:j] | ||
194 | } | ||
195 | j += int(info.size) | ||
196 | } | ||
197 | // treat last segment as normal decomposition | ||
198 | i.next = i.rb.f.nextMain | ||
199 | return i.next(i) | ||
200 | } | ||
201 | |||
202 | // nextMultiNorm is used for iterating over multi-segment decompositions | ||
203 | // for composing normal forms. | ||
204 | func nextMultiNorm(i *Iter) []byte { | ||
205 | j := 0 | ||
206 | d := i.multiSeg | ||
207 | for j < len(d) { | ||
208 | info := i.rb.f.info(input{bytes: d}, j) | ||
209 | if info.BoundaryBefore() { | ||
210 | i.rb.compose() | ||
211 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] | ||
212 | i.rb.insertUnsafe(input{bytes: d}, j, info) | ||
213 | i.multiSeg = d[j+int(info.size):] | ||
214 | return seg | ||
215 | } | ||
216 | i.rb.insertUnsafe(input{bytes: d}, j, info) | ||
217 | j += int(info.size) | ||
218 | } | ||
219 | i.multiSeg = nil | ||
220 | i.next = nextComposed | ||
221 | return doNormComposed(i) | ||
222 | } | ||
223 | |||
224 | // nextDecomposed is the implementation of Next for forms NFD and NFKD. | ||
//
// Each call produces one segment. The loop handles one rune per iteration,
// starting from the properties saved in i.info. Output that has to be
// rewritten is assembled in i.buf, with outp as its length so far; input that
// can be kept as is is only tracked as the pending range [inCopyStart, i.p)
// and copied to i.buf[outCopyStart:] when needed. If nothing had to be
// rewritten, the segment is returned straight from the input via returnSlice.
// When a rune's ccc is lower than the tccc of the rune before it, the work
// done so far is moved into the reorder buffer (label doNorm) and
// doNormDecomposed finishes the segment.
225 | func nextDecomposed(i *Iter) (next []byte) { | ||
226 | outp := 0 | ||
227 | inCopyStart, outCopyStart := i.p, 0 | ||
228 | for { | ||
// Size 0 or 1: a single byte is consumed. It is returned on its own when
// it ends the input or is followed by an ASCII byte (which also switches
// to the ASCII fast path).
229 | if sz := int(i.info.size); sz <= 1 { | ||
230 | i.rb.ss = 0 | ||
231 | p := i.p | ||
232 | i.p++ // ASCII or illegal byte. Either way, advance by 1. | ||
233 | if i.p >= i.rb.nsrc { | ||
234 | i.setDone() | ||
235 | return i.returnSlice(p, i.p) | ||
236 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { | ||
237 | i.next = i.asciiF | ||
238 | return i.returnSlice(p, i.p) | ||
239 | } | ||
240 | outp++ | ||
// The rune has a decomposition d: it is either returned directly or
// appended to what is already in i.buf.
241 | } else if d := i.info.Decomposition(); d != nil { | ||
242 | // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. | ||
243 | // Case 1: there is a leftover to copy. In this case the decomposition | ||
244 | // must begin with a modifier and should always be appended. | ||
245 | // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. | ||
246 | p := outp + len(d) | ||
247 | if outp > 0 { | ||
248 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | ||
249 | // TODO: this condition should not be possible, but we leave it | ||
250 | // in for defensive purposes. | ||
251 | if p > len(i.buf) { | ||
252 | return i.buf[:outp] | ||
253 | } | ||
254 | } else if i.info.multiSegment() { | ||
255 | // outp must be 0 as multi-segment decompositions always | ||
256 | // start a new segment. | ||
257 | if i.multiSeg == nil { | ||
258 | i.multiSeg = d | ||
259 | i.next = nextMulti | ||
260 | return nextMulti(i) | ||
261 | } | ||
262 | // We are in the last segment. Treat as normal decomposition. | ||
263 | d = i.multiSeg | ||
264 | i.multiSeg = nil | ||
265 | p = len(d) | ||
266 | } | ||
// Advance past the rune and load the properties of the next one before
// deciding whether the segment ends here.
267 | prevCC := i.info.tccc | ||
268 | if i.p += sz; i.p >= i.rb.nsrc { | ||
269 | i.setDone() | ||
270 | i.info = Properties{} // Force BoundaryBefore to succeed. | ||
271 | } else { | ||
272 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
273 | } | ||
274 | switch i.rb.ss.next(i.info) { | ||
275 | case ssOverflow: | ||
276 | i.next = nextCGJDecompose | ||
277 | fallthrough | ||
278 | case ssStarter: | ||
279 | if outp > 0 { | ||
280 | copy(i.buf[outp:], d) | ||
281 | return i.buf[:p] | ||
282 | } | ||
283 | return d | ||
284 | } | ||
// The segment continues: append d to i.buf and restart the pending
// input range at the current position.
285 | copy(i.buf[outp:], d) | ||
286 | outp = p | ||
287 | inCopyStart, outCopyStart = i.p, outp | ||
288 | if i.info.ccc < prevCC { | ||
289 | goto doNorm | ||
290 | } | ||
291 | continue | ||
// Hangul syllable: decomposed by decomposeHangul directly into i.buf. A
// following Hangul syllable is left to nextHangul.
292 | } else if r := i.rb.src.hangul(i.p); r != 0 { | ||
293 | outp = decomposeHangul(i.buf[:], r) | ||
294 | i.p += hangulUTF8Size | ||
295 | inCopyStart, outCopyStart = i.p, outp | ||
296 | if i.p >= i.rb.nsrc { | ||
297 | i.setDone() | ||
298 | break | ||
299 | } else if i.rb.src.hangul(i.p) != 0 { | ||
300 | i.next = nextHangul | ||
301 | return i.buf[:outp] | ||
302 | } | ||
// No decomposition: the rune is kept as is. Only the lengths are updated
// here; its bytes stay in the pending input range.
303 | } else { | ||
304 | p := outp + sz | ||
305 | if p > len(i.buf) { | ||
306 | break | ||
307 | } | ||
308 | outp = p | ||
309 | i.p += sz | ||
310 | } | ||
// Shared tail for the branches above that did not return or continue:
// stop at the end of the input, when ss.next reports ssStarter, or when
// it reports ssOverflow (which also selects nextCGJDecompose for the next
// call).
311 | if i.p >= i.rb.nsrc { | ||
312 | i.setDone() | ||
313 | break | ||
314 | } | ||
315 | prevCC := i.info.tccc | ||
316 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
317 | if v := i.rb.ss.next(i.info); v == ssStarter { | ||
318 | break | ||
319 | } else if v == ssOverflow { | ||
320 | i.next = nextCGJDecompose | ||
321 | break | ||
322 | } | ||
323 | if i.info.ccc < prevCC { | ||
324 | goto doNorm | ||
325 | } | ||
326 | } | ||
// outCopyStart is still 0 when no decomposed output precedes the pending
// input range, so the segment can be returned as a range of the input.
327 | if outCopyStart == 0 { | ||
328 | return i.returnSlice(inCopyStart, i.p) | ||
329 | } else if inCopyStart < i.p { | ||
330 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | ||
331 | } | ||
332 | return i.buf[:outp] | ||
333 | doNorm: | ||
334 | // Insert what we have decomposed so far in the reorderBuffer. | ||
335 | // As we will only reorder, there will always be enough room. | ||
336 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | ||
337 | i.rb.insertDecomposed(i.buf[0:outp]) | ||
338 | return doNormDecomposed(i) | ||
339 | } | ||
340 | |||
341 | func doNormDecomposed(i *Iter) []byte { | ||
342 | for { | ||
343 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
344 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { | ||
345 | i.setDone() | ||
346 | break | ||
347 | } | ||
348 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
349 | if i.info.ccc == 0 { | ||
350 | break | ||
351 | } | ||
352 | if s := i.rb.ss.next(i.info); s == ssOverflow { | ||
353 | i.next = nextCGJDecompose | ||
354 | break | ||
355 | } | ||
356 | } | ||
357 | // new segment or too many combining characters: exit normalization | ||
358 | return i.buf[:i.rb.flushCopy(i.buf[:])] | ||
359 | } | ||
360 | |||
361 | func nextCGJDecompose(i *Iter) []byte { | ||
362 | i.rb.ss = 0 | ||
363 | i.rb.insertCGJ() | ||
364 | i.next = nextDecomposed | ||
365 | i.rb.ss.first(i.info) | ||
366 | buf := doNormDecomposed(i) | ||
367 | return buf | ||
368 | } | ||
369 | |||
370 | // nextComposed is the implementation of Next for forms NFC and NFKC. | ||
//
// Fast path: starting at i.p, runes are accepted for as long as isYesC holds
// for them. outp only tracks how many bytes have been accepted so that the
// segment is known to fit in i.buf; the bytes themselves are returned with
// returnSlice(startp, i.p). The scan stops at the end of the input, before an
// ASCII byte (which also selects the ASCII fast path), when ss.next reports
// ssStarter, or when it reports ssOverflow (which selects nextCGJCompose).
//
// Slow path (label doNorm): taken when a rune is not isYesC or when a rune's
// ccc is lower than the tccc of the rune before it. The position is reset to
// startp and the segment is rebuilt through the reorder buffer, either by
// nextMultiNorm for a multi-segment decomposition or by doNormComposed.
371 | func nextComposed(i *Iter) []byte { | ||
372 | outp, startp := 0, i.p | ||
373 | var prevCC uint8 | ||
374 | for { | ||
375 | if !i.info.isYesC() { | ||
376 | goto doNorm | ||
377 | } | ||
378 | prevCC = i.info.tccc | ||
379 | sz := int(i.info.size) | ||
380 | if sz == 0 { | ||
381 | sz = 1 // illegal rune: copy byte-by-byte | ||
382 | } | ||
// Stop before the accepted bytes would exceed the size of i.buf.
383 | p := outp + sz | ||
384 | if p > len(i.buf) { | ||
385 | break | ||
386 | } | ||
387 | outp = p | ||
388 | i.p += sz | ||
389 | if i.p >= i.rb.nsrc { | ||
390 | i.setDone() | ||
391 | break | ||
392 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { | ||
393 | i.rb.ss = 0 | ||
394 | i.next = i.asciiF | ||
395 | break | ||
396 | } | ||
397 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
398 | if v := i.rb.ss.next(i.info); v == ssStarter { | ||
399 | break | ||
400 | } else if v == ssOverflow { | ||
401 | i.next = nextCGJCompose | ||
402 | break | ||
403 | } | ||
404 | if i.info.ccc < prevCC { | ||
405 | goto doNorm | ||
406 | } | ||
407 | } | ||
408 | return i.returnSlice(startp, i.p) | ||
409 | doNorm: | ||
410 | // reset to start position | ||
411 | i.p = startp | ||
412 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
413 | i.rb.ss.first(i.info) | ||
414 | if i.info.multiSegment() { | ||
// Seed the reorder buffer with the first rune of the decomposition and
// leave the rest of it to nextMultiNorm.
415 | d := i.info.Decomposition() | ||
416 | info := i.rb.f.info(input{bytes: d}, 0) | ||
417 | i.rb.insertUnsafe(input{bytes: d}, 0, info) | ||
418 | i.multiSeg = d[int(info.size):] | ||
419 | i.next = nextMultiNorm | ||
420 | return nextMultiNorm(i) | ||
421 | } | ||
// NOTE(review): ss.first was already called with the same i.info a few
// lines above; this second call looks redundant - confirm before removing.
422 | i.rb.ss.first(i.info) | ||
423 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
424 | return doNormComposed(i) | ||
425 | } | ||
426 | |||
427 | func doNormComposed(i *Iter) []byte { | ||
428 | // First rune should already be inserted. | ||
429 | for { | ||
430 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { | ||
431 | i.setDone() | ||
432 | break | ||
433 | } | ||
434 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
435 | if s := i.rb.ss.next(i.info); s == ssStarter { | ||
436 | break | ||
437 | } else if s == ssOverflow { | ||
438 | i.next = nextCGJCompose | ||
439 | break | ||
440 | } | ||
441 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
442 | } | ||
443 | i.rb.compose() | ||
444 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] | ||
445 | return seg | ||
446 | } | ||
447 | |||
448 | func nextCGJCompose(i *Iter) []byte { | ||
449 | i.rb.ss = 0 // instead of first | ||
450 | i.rb.insertCGJ() | ||
451 | i.next = nextComposed | ||
452 | // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, | ||
453 | // even if they are not. This is particularly dubious for U+FF9E and UFF9A. | ||
454 | // If we ever change that, insert a check here. | ||
455 | i.rb.ss.first(i.info) | ||
456 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
457 | return doNormComposed(i) | ||
458 | } | ||