diff options
Diffstat (limited to 'vendor/golang.org/x/text/unicode/norm/iter.go')
| -rw-r--r-- | vendor/golang.org/x/text/unicode/norm/iter.go | 458 |
1 files changed, 0 insertions, 458 deletions
diff --git a/vendor/golang.org/x/text/unicode/norm/iter.go b/vendor/golang.org/x/text/unicode/norm/iter.go deleted file mode 100644 index 417c6b2..0000000 --- a/vendor/golang.org/x/text/unicode/norm/iter.go +++ /dev/null | |||
| @@ -1,458 +0,0 @@ | |||
| 1 | // Copyright 2011 The Go Authors. All rights reserved. | ||
| 2 | // Use of this source code is governed by a BSD-style | ||
| 3 | // license that can be found in the LICENSE file. | ||
| 4 | |||
| 5 | package norm | ||
| 6 | |||
| 7 | import ( | ||
| 8 | "fmt" | ||
| 9 | "unicode/utf8" | ||
| 10 | ) | ||
| 11 | |||
| 12 | // MaxSegmentSize is the maximum size of a byte buffer needed to consider any | ||
| 13 | // sequence of starter and non-starter runes for the purpose of normalization. | ||
| 14 | const MaxSegmentSize = maxByteBufferSize | ||
| 15 | |||
| 16 | // An Iter iterates over a string or byte slice, while normalizing it | ||
| 17 | // to a given Form. | ||
| 18 | type Iter struct { | ||
| 19 | rb reorderBuffer | ||
| 20 | buf [maxByteBufferSize]byte | ||
| 21 | info Properties // first character saved from previous iteration | ||
| 22 | next iterFunc // implementation of next depends on form | ||
| 23 | asciiF iterFunc | ||
| 24 | |||
| 25 | p int // current position in input source | ||
| 26 | multiSeg []byte // remainder of multi-segment decomposition | ||
| 27 | } | ||
| 28 | |||
| 29 | type iterFunc func(*Iter) []byte | ||
| 30 | |||
| 31 | // Init initializes i to iterate over src after normalizing it to Form f. | ||
| 32 | func (i *Iter) Init(f Form, src []byte) { | ||
| 33 | i.p = 0 | ||
| 34 | if len(src) == 0 { | ||
| 35 | i.setDone() | ||
| 36 | i.rb.nsrc = 0 | ||
| 37 | return | ||
| 38 | } | ||
| 39 | i.multiSeg = nil | ||
| 40 | i.rb.init(f, src) | ||
| 41 | i.next = i.rb.f.nextMain | ||
| 42 | i.asciiF = nextASCIIBytes | ||
| 43 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 44 | i.rb.ss.first(i.info) | ||
| 45 | } | ||
| 46 | |||
| 47 | // InitString initializes i to iterate over src after normalizing it to Form f. | ||
| 48 | func (i *Iter) InitString(f Form, src string) { | ||
| 49 | i.p = 0 | ||
| 50 | if len(src) == 0 { | ||
| 51 | i.setDone() | ||
| 52 | i.rb.nsrc = 0 | ||
| 53 | return | ||
| 54 | } | ||
| 55 | i.multiSeg = nil | ||
| 56 | i.rb.initString(f, src) | ||
| 57 | i.next = i.rb.f.nextMain | ||
| 58 | i.asciiF = nextASCIIString | ||
| 59 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 60 | i.rb.ss.first(i.info) | ||
| 61 | } | ||
| 62 | |||
| 63 | // Seek sets the segment to be returned by the next call to Next to start | ||
| 64 | // at position p. It is the responsibility of the caller to set p to the | ||
| 65 | // start of a segment. | ||
| 66 | func (i *Iter) Seek(offset int64, whence int) (int64, error) { | ||
| 67 | var abs int64 | ||
| 68 | switch whence { | ||
| 69 | case 0: | ||
| 70 | abs = offset | ||
| 71 | case 1: | ||
| 72 | abs = int64(i.p) + offset | ||
| 73 | case 2: | ||
| 74 | abs = int64(i.rb.nsrc) + offset | ||
| 75 | default: | ||
| 76 | return 0, fmt.Errorf("norm: invalid whence") | ||
| 77 | } | ||
| 78 | if abs < 0 { | ||
| 79 | return 0, fmt.Errorf("norm: negative position") | ||
| 80 | } | ||
| 81 | if int(abs) >= i.rb.nsrc { | ||
| 82 | i.setDone() | ||
| 83 | return int64(i.p), nil | ||
| 84 | } | ||
| 85 | i.p = int(abs) | ||
| 86 | i.multiSeg = nil | ||
| 87 | i.next = i.rb.f.nextMain | ||
| 88 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 89 | i.rb.ss.first(i.info) | ||
| 90 | return abs, nil | ||
| 91 | } | ||
| 92 | |||
| 93 | // returnSlice returns a slice of the underlying input type as a byte slice. | ||
| 94 | // If the underlying is of type []byte, it will simply return a slice. | ||
| 95 | // If the underlying is of type string, it will copy the slice to the buffer | ||
| 96 | // and return that. | ||
| 97 | func (i *Iter) returnSlice(a, b int) []byte { | ||
| 98 | if i.rb.src.bytes == nil { | ||
| 99 | return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] | ||
| 100 | } | ||
| 101 | return i.rb.src.bytes[a:b] | ||
| 102 | } | ||
| 103 | |||
| 104 | // Pos returns the byte position at which the next call to Next will commence processing. | ||
| 105 | func (i *Iter) Pos() int { | ||
| 106 | return i.p | ||
| 107 | } | ||
| 108 | |||
| 109 | func (i *Iter) setDone() { | ||
| 110 | i.next = nextDone | ||
| 111 | i.p = i.rb.nsrc | ||
| 112 | } | ||
| 113 | |||
| 114 | // Done returns true if there is no more input to process. | ||
| 115 | func (i *Iter) Done() bool { | ||
| 116 | return i.p >= i.rb.nsrc | ||
| 117 | } | ||
| 118 | |||
| 119 | // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. | ||
| 120 | // For any input a and b for which f(a) == f(b), subsequent calls | ||
| 121 | // to Next will return the same segments. | ||
| 122 | // Modifying runes are grouped together with the preceding starter, if such a starter exists. | ||
| 123 | // Although not guaranteed, n will typically be the smallest possible n. | ||
| 124 | func (i *Iter) Next() []byte { | ||
| 125 | return i.next(i) | ||
| 126 | } | ||
| 127 | |||
| 128 | func nextASCIIBytes(i *Iter) []byte { | ||
| 129 | p := i.p + 1 | ||
| 130 | if p >= i.rb.nsrc { | ||
| 131 | p0 := i.p | ||
| 132 | i.setDone() | ||
| 133 | return i.rb.src.bytes[p0:p] | ||
| 134 | } | ||
| 135 | if i.rb.src.bytes[p] < utf8.RuneSelf { | ||
| 136 | p0 := i.p | ||
| 137 | i.p = p | ||
| 138 | return i.rb.src.bytes[p0:p] | ||
| 139 | } | ||
| 140 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 141 | i.next = i.rb.f.nextMain | ||
| 142 | return i.next(i) | ||
| 143 | } | ||
| 144 | |||
| 145 | func nextASCIIString(i *Iter) []byte { | ||
| 146 | p := i.p + 1 | ||
| 147 | if p >= i.rb.nsrc { | ||
| 148 | i.buf[0] = i.rb.src.str[i.p] | ||
| 149 | i.setDone() | ||
| 150 | return i.buf[:1] | ||
| 151 | } | ||
| 152 | if i.rb.src.str[p] < utf8.RuneSelf { | ||
| 153 | i.buf[0] = i.rb.src.str[i.p] | ||
| 154 | i.p = p | ||
| 155 | return i.buf[:1] | ||
| 156 | } | ||
| 157 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 158 | i.next = i.rb.f.nextMain | ||
| 159 | return i.next(i) | ||
| 160 | } | ||
| 161 | |||
| 162 | func nextHangul(i *Iter) []byte { | ||
| 163 | p := i.p | ||
| 164 | next := p + hangulUTF8Size | ||
| 165 | if next >= i.rb.nsrc { | ||
| 166 | i.setDone() | ||
| 167 | } else if i.rb.src.hangul(next) == 0 { | ||
| 168 | i.rb.ss.next(i.info) | ||
| 169 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 170 | i.next = i.rb.f.nextMain | ||
| 171 | return i.next(i) | ||
| 172 | } | ||
| 173 | i.p = next | ||
| 174 | return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] | ||
| 175 | } | ||
| 176 | |||
| 177 | func nextDone(i *Iter) []byte { | ||
| 178 | return nil | ||
| 179 | } | ||
| 180 | |||
| 181 | // nextMulti is used for iterating over multi-segment decompositions | ||
| 182 | // for decomposing normal forms. | ||
| 183 | func nextMulti(i *Iter) []byte { | ||
| 184 | j := 0 | ||
| 185 | d := i.multiSeg | ||
| 186 | // skip first rune | ||
| 187 | for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { | ||
| 188 | } | ||
| 189 | for j < len(d) { | ||
| 190 | info := i.rb.f.info(input{bytes: d}, j) | ||
| 191 | if info.BoundaryBefore() { | ||
| 192 | i.multiSeg = d[j:] | ||
| 193 | return d[:j] | ||
| 194 | } | ||
| 195 | j += int(info.size) | ||
| 196 | } | ||
| 197 | // treat last segment as normal decomposition | ||
| 198 | i.next = i.rb.f.nextMain | ||
| 199 | return i.next(i) | ||
| 200 | } | ||
| 201 | |||
| 202 | // nextMultiNorm is used for iterating over multi-segment decompositions | ||
| 203 | // for composing normal forms. | ||
| 204 | func nextMultiNorm(i *Iter) []byte { | ||
| 205 | j := 0 | ||
| 206 | d := i.multiSeg | ||
| 207 | for j < len(d) { | ||
| 208 | info := i.rb.f.info(input{bytes: d}, j) | ||
| 209 | if info.BoundaryBefore() { | ||
| 210 | i.rb.compose() | ||
| 211 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] | ||
| 212 | i.rb.insertUnsafe(input{bytes: d}, j, info) | ||
| 213 | i.multiSeg = d[j+int(info.size):] | ||
| 214 | return seg | ||
| 215 | } | ||
| 216 | i.rb.insertUnsafe(input{bytes: d}, j, info) | ||
| 217 | j += int(info.size) | ||
| 218 | } | ||
| 219 | i.multiSeg = nil | ||
| 220 | i.next = nextComposed | ||
| 221 | return doNormComposed(i) | ||
| 222 | } | ||
| 223 | |||
| 224 | // nextDecomposed is the implementation of Next for forms NFD and NFKD. | ||
| 225 | func nextDecomposed(i *Iter) (next []byte) { | ||
| 226 | outp := 0 | ||
| 227 | inCopyStart, outCopyStart := i.p, 0 | ||
| 228 | for { | ||
| 229 | if sz := int(i.info.size); sz <= 1 { | ||
| 230 | i.rb.ss = 0 | ||
| 231 | p := i.p | ||
| 232 | i.p++ // ASCII or illegal byte. Either way, advance by 1. | ||
| 233 | if i.p >= i.rb.nsrc { | ||
| 234 | i.setDone() | ||
| 235 | return i.returnSlice(p, i.p) | ||
| 236 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { | ||
| 237 | i.next = i.asciiF | ||
| 238 | return i.returnSlice(p, i.p) | ||
| 239 | } | ||
| 240 | outp++ | ||
| 241 | } else if d := i.info.Decomposition(); d != nil { | ||
| 242 | // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. | ||
| 243 | // Case 1: there is a leftover to copy. In this case the decomposition | ||
| 244 | // must begin with a modifier and should always be appended. | ||
| 245 | // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. | ||
| 246 | p := outp + len(d) | ||
| 247 | if outp > 0 { | ||
| 248 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | ||
| 249 | // TODO: this condition should not be possible, but we leave it | ||
| 250 | // in for defensive purposes. | ||
| 251 | if p > len(i.buf) { | ||
| 252 | return i.buf[:outp] | ||
| 253 | } | ||
| 254 | } else if i.info.multiSegment() { | ||
| 255 | // outp must be 0 as multi-segment decompositions always | ||
| 256 | // start a new segment. | ||
| 257 | if i.multiSeg == nil { | ||
| 258 | i.multiSeg = d | ||
| 259 | i.next = nextMulti | ||
| 260 | return nextMulti(i) | ||
| 261 | } | ||
| 262 | // We are in the last segment. Treat as normal decomposition. | ||
| 263 | d = i.multiSeg | ||
| 264 | i.multiSeg = nil | ||
| 265 | p = len(d) | ||
| 266 | } | ||
| 267 | prevCC := i.info.tccc | ||
| 268 | if i.p += sz; i.p >= i.rb.nsrc { | ||
| 269 | i.setDone() | ||
| 270 | i.info = Properties{} // Force BoundaryBefore to succeed. | ||
| 271 | } else { | ||
| 272 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 273 | } | ||
| 274 | switch i.rb.ss.next(i.info) { | ||
| 275 | case ssOverflow: | ||
| 276 | i.next = nextCGJDecompose | ||
| 277 | fallthrough | ||
| 278 | case ssStarter: | ||
| 279 | if outp > 0 { | ||
| 280 | copy(i.buf[outp:], d) | ||
| 281 | return i.buf[:p] | ||
| 282 | } | ||
| 283 | return d | ||
| 284 | } | ||
| 285 | copy(i.buf[outp:], d) | ||
| 286 | outp = p | ||
| 287 | inCopyStart, outCopyStart = i.p, outp | ||
| 288 | if i.info.ccc < prevCC { | ||
| 289 | goto doNorm | ||
| 290 | } | ||
| 291 | continue | ||
| 292 | } else if r := i.rb.src.hangul(i.p); r != 0 { | ||
| 293 | outp = decomposeHangul(i.buf[:], r) | ||
| 294 | i.p += hangulUTF8Size | ||
| 295 | inCopyStart, outCopyStart = i.p, outp | ||
| 296 | if i.p >= i.rb.nsrc { | ||
| 297 | i.setDone() | ||
| 298 | break | ||
| 299 | } else if i.rb.src.hangul(i.p) != 0 { | ||
| 300 | i.next = nextHangul | ||
| 301 | return i.buf[:outp] | ||
| 302 | } | ||
| 303 | } else { | ||
| 304 | p := outp + sz | ||
| 305 | if p > len(i.buf) { | ||
| 306 | break | ||
| 307 | } | ||
| 308 | outp = p | ||
| 309 | i.p += sz | ||
| 310 | } | ||
| 311 | if i.p >= i.rb.nsrc { | ||
| 312 | i.setDone() | ||
| 313 | break | ||
| 314 | } | ||
| 315 | prevCC := i.info.tccc | ||
| 316 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 317 | if v := i.rb.ss.next(i.info); v == ssStarter { | ||
| 318 | break | ||
| 319 | } else if v == ssOverflow { | ||
| 320 | i.next = nextCGJDecompose | ||
| 321 | break | ||
| 322 | } | ||
| 323 | if i.info.ccc < prevCC { | ||
| 324 | goto doNorm | ||
| 325 | } | ||
| 326 | } | ||
| 327 | if outCopyStart == 0 { | ||
| 328 | return i.returnSlice(inCopyStart, i.p) | ||
| 329 | } else if inCopyStart < i.p { | ||
| 330 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | ||
| 331 | } | ||
| 332 | return i.buf[:outp] | ||
| 333 | doNorm: | ||
| 334 | // Insert what we have decomposed so far in the reorderBuffer. | ||
| 335 | // As we will only reorder, there will always be enough room. | ||
| 336 | i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) | ||
| 337 | i.rb.insertDecomposed(i.buf[0:outp]) | ||
| 338 | return doNormDecomposed(i) | ||
| 339 | } | ||
| 340 | |||
| 341 | func doNormDecomposed(i *Iter) []byte { | ||
| 342 | for { | ||
| 343 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
| 344 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { | ||
| 345 | i.setDone() | ||
| 346 | break | ||
| 347 | } | ||
| 348 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 349 | if i.info.ccc == 0 { | ||
| 350 | break | ||
| 351 | } | ||
| 352 | if s := i.rb.ss.next(i.info); s == ssOverflow { | ||
| 353 | i.next = nextCGJDecompose | ||
| 354 | break | ||
| 355 | } | ||
| 356 | } | ||
| 357 | // new segment or too many combining characters: exit normalization | ||
| 358 | return i.buf[:i.rb.flushCopy(i.buf[:])] | ||
| 359 | } | ||
| 360 | |||
| 361 | func nextCGJDecompose(i *Iter) []byte { | ||
| 362 | i.rb.ss = 0 | ||
| 363 | i.rb.insertCGJ() | ||
| 364 | i.next = nextDecomposed | ||
| 365 | i.rb.ss.first(i.info) | ||
| 366 | buf := doNormDecomposed(i) | ||
| 367 | return buf | ||
| 368 | } | ||
| 369 | |||
| 370 | // nextComposed is the implementation of Next for forms NFC and NFKC. | ||
| 371 | func nextComposed(i *Iter) []byte { | ||
| 372 | outp, startp := 0, i.p | ||
| 373 | var prevCC uint8 | ||
| 374 | for { | ||
| 375 | if !i.info.isYesC() { | ||
| 376 | goto doNorm | ||
| 377 | } | ||
| 378 | prevCC = i.info.tccc | ||
| 379 | sz := int(i.info.size) | ||
| 380 | if sz == 0 { | ||
| 381 | sz = 1 // illegal rune: copy byte-by-byte | ||
| 382 | } | ||
| 383 | p := outp + sz | ||
| 384 | if p > len(i.buf) { | ||
| 385 | break | ||
| 386 | } | ||
| 387 | outp = p | ||
| 388 | i.p += sz | ||
| 389 | if i.p >= i.rb.nsrc { | ||
| 390 | i.setDone() | ||
| 391 | break | ||
| 392 | } else if i.rb.src._byte(i.p) < utf8.RuneSelf { | ||
| 393 | i.rb.ss = 0 | ||
| 394 | i.next = i.asciiF | ||
| 395 | break | ||
| 396 | } | ||
| 397 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 398 | if v := i.rb.ss.next(i.info); v == ssStarter { | ||
| 399 | break | ||
| 400 | } else if v == ssOverflow { | ||
| 401 | i.next = nextCGJCompose | ||
| 402 | break | ||
| 403 | } | ||
| 404 | if i.info.ccc < prevCC { | ||
| 405 | goto doNorm | ||
| 406 | } | ||
| 407 | } | ||
| 408 | return i.returnSlice(startp, i.p) | ||
| 409 | doNorm: | ||
| 410 | // reset to start position | ||
| 411 | i.p = startp | ||
| 412 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 413 | i.rb.ss.first(i.info) | ||
| 414 | if i.info.multiSegment() { | ||
| 415 | d := i.info.Decomposition() | ||
| 416 | info := i.rb.f.info(input{bytes: d}, 0) | ||
| 417 | i.rb.insertUnsafe(input{bytes: d}, 0, info) | ||
| 418 | i.multiSeg = d[int(info.size):] | ||
| 419 | i.next = nextMultiNorm | ||
| 420 | return nextMultiNorm(i) | ||
| 421 | } | ||
| 422 | i.rb.ss.first(i.info) | ||
| 423 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
| 424 | return doNormComposed(i) | ||
| 425 | } | ||
| 426 | |||
| 427 | func doNormComposed(i *Iter) []byte { | ||
| 428 | // First rune should already be inserted. | ||
| 429 | for { | ||
| 430 | if i.p += int(i.info.size); i.p >= i.rb.nsrc { | ||
| 431 | i.setDone() | ||
| 432 | break | ||
| 433 | } | ||
| 434 | i.info = i.rb.f.info(i.rb.src, i.p) | ||
| 435 | if s := i.rb.ss.next(i.info); s == ssStarter { | ||
| 436 | break | ||
| 437 | } else if s == ssOverflow { | ||
| 438 | i.next = nextCGJCompose | ||
| 439 | break | ||
| 440 | } | ||
| 441 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
| 442 | } | ||
| 443 | i.rb.compose() | ||
| 444 | seg := i.buf[:i.rb.flushCopy(i.buf[:])] | ||
| 445 | return seg | ||
| 446 | } | ||
| 447 | |||
| 448 | func nextCGJCompose(i *Iter) []byte { | ||
| 449 | i.rb.ss = 0 // instead of first | ||
| 450 | i.rb.insertCGJ() | ||
| 451 | i.next = nextComposed | ||
| 452 | // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, | ||
| 453 | // even if they are not. This is particularly dubious for U+FF9E and UFF9A. | ||
| 454 | // If we ever change that, insert a check here. | ||
| 455 | i.rb.ss.first(i.info) | ||
| 456 | i.rb.insertUnsafe(i.rb.src, i.p, i.info) | ||
| 457 | return doNormComposed(i) | ||
| 458 | } | ||