Mercurial > yakumo_izuru > aya
comparison vendor/github.com/dlclark/regexp2/syntax/charclass.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author | yakumo.izuru |
---|---|
date | Sun, 23 Jul 2023 13:18:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
65:6d985efa0f7a | 66:787b5ee0289d |
---|---|
1 package syntax | |
2 | |
3 import ( | |
4 "bytes" | |
5 "encoding/binary" | |
6 "fmt" | |
7 "sort" | |
8 "unicode" | |
9 "unicode/utf8" | |
10 ) | |
11 | |
12 // CharSet combines start-end rune ranges and unicode categories representing a set of characters | |
13 type CharSet struct { | |
14 ranges []singleRange | |
15 categories []category | |
16 sub *CharSet //optional subtractor | |
17 negate bool | |
18 anything bool | |
19 } | |
20 | |
21 type category struct { | |
22 negate bool | |
23 cat string | |
24 } | |
25 | |
26 type singleRange struct { | |
27 first rune | |
28 last rune | |
29 } | |
30 | |
31 const ( | |
32 spaceCategoryText = " " | |
33 wordCategoryText = "W" | |
34 ) | |
35 | |
36 var ( | |
37 ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00} | |
38 ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b} | |
39 ecmaDigit = []rune{0x0030, 0x003a} | |
40 ) | |
41 | |
42 var ( | |
43 AnyClass = getCharSetFromOldString([]rune{0}, false) | |
44 ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false) | |
45 NoneClass = getCharSetFromOldString(nil, false) | |
46 ECMAWordClass = getCharSetFromOldString(ecmaWord, false) | |
47 NotECMAWordClass = getCharSetFromOldString(ecmaWord, true) | |
48 ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false) | |
49 NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true) | |
50 ECMADigitClass = getCharSetFromOldString(ecmaDigit, false) | |
51 NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true) | |
52 | |
53 WordClass = getCharSetFromCategoryString(false, false, wordCategoryText) | |
54 NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText) | |
55 SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText) | |
56 NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText) | |
57 DigitClass = getCharSetFromCategoryString(false, false, "Nd") | |
58 NotDigitClass = getCharSetFromCategoryString(false, true, "Nd") | |
59 ) | |
60 | |
61 var unicodeCategories = func() map[string]*unicode.RangeTable { | |
62 retVal := make(map[string]*unicode.RangeTable) | |
63 for k, v := range unicode.Scripts { | |
64 retVal[k] = v | |
65 } | |
66 for k, v := range unicode.Categories { | |
67 retVal[k] = v | |
68 } | |
69 for k, v := range unicode.Properties { | |
70 retVal[k] = v | |
71 } | |
72 return retVal | |
73 }() | |
74 | |
75 func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet { | |
76 if negateCat && negateSet { | |
77 panic("BUG! You should only negate the set OR the category in a constant setup, but not both") | |
78 } | |
79 | |
80 c := CharSet{negate: negateSet} | |
81 | |
82 c.categories = make([]category, len(cats)) | |
83 for i, cat := range cats { | |
84 c.categories[i] = category{cat: cat, negate: negateCat} | |
85 } | |
86 return func() *CharSet { | |
87 //make a copy each time | |
88 local := c | |
89 //return that address | |
90 return &local | |
91 } | |
92 } | |
93 | |
94 func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet { | |
95 c := CharSet{} | |
96 if len(setText) > 0 { | |
97 fillFirst := false | |
98 l := len(setText) | |
99 if negate { | |
100 if setText[0] == 0 { | |
101 setText = setText[1:] | |
102 } else { | |
103 l++ | |
104 fillFirst = true | |
105 } | |
106 } | |
107 | |
108 if l%2 == 0 { | |
109 c.ranges = make([]singleRange, l/2) | |
110 } else { | |
111 c.ranges = make([]singleRange, l/2+1) | |
112 } | |
113 | |
114 first := true | |
115 if fillFirst { | |
116 c.ranges[0] = singleRange{first: 0} | |
117 first = false | |
118 } | |
119 | |
120 i := 0 | |
121 for _, r := range setText { | |
122 if first { | |
123 // lower bound in a new range | |
124 c.ranges[i] = singleRange{first: r} | |
125 first = false | |
126 } else { | |
127 c.ranges[i].last = r - 1 | |
128 i++ | |
129 first = true | |
130 } | |
131 } | |
132 if !first { | |
133 c.ranges[i].last = utf8.MaxRune | |
134 } | |
135 } | |
136 | |
137 return func() *CharSet { | |
138 local := c | |
139 return &local | |
140 } | |
141 } | |
142 | |
143 // Copy makes a deep copy to prevent accidental mutation of a set | |
144 func (c CharSet) Copy() CharSet { | |
145 ret := CharSet{ | |
146 anything: c.anything, | |
147 negate: c.negate, | |
148 } | |
149 | |
150 ret.ranges = append(ret.ranges, c.ranges...) | |
151 ret.categories = append(ret.categories, c.categories...) | |
152 | |
153 if c.sub != nil { | |
154 sub := c.sub.Copy() | |
155 ret.sub = &sub | |
156 } | |
157 | |
158 return ret | |
159 } | |
160 | |
161 // gets a human-readable description for a set string | |
162 func (c CharSet) String() string { | |
163 buf := &bytes.Buffer{} | |
164 buf.WriteRune('[') | |
165 | |
166 if c.IsNegated() { | |
167 buf.WriteRune('^') | |
168 } | |
169 | |
170 for _, r := range c.ranges { | |
171 | |
172 buf.WriteString(CharDescription(r.first)) | |
173 if r.first != r.last { | |
174 if r.last-r.first != 1 { | |
175 //groups that are 1 char apart skip the dash | |
176 buf.WriteRune('-') | |
177 } | |
178 buf.WriteString(CharDescription(r.last)) | |
179 } | |
180 } | |
181 | |
182 for _, c := range c.categories { | |
183 buf.WriteString(c.String()) | |
184 } | |
185 | |
186 if c.sub != nil { | |
187 buf.WriteRune('-') | |
188 buf.WriteString(c.sub.String()) | |
189 } | |
190 | |
191 buf.WriteRune(']') | |
192 | |
193 return buf.String() | |
194 } | |
195 | |
196 // mapHashFill converts a charset into a buffer for use in maps | |
197 func (c CharSet) mapHashFill(buf *bytes.Buffer) { | |
198 if c.negate { | |
199 buf.WriteByte(0) | |
200 } else { | |
201 buf.WriteByte(1) | |
202 } | |
203 | |
204 binary.Write(buf, binary.LittleEndian, len(c.ranges)) | |
205 binary.Write(buf, binary.LittleEndian, len(c.categories)) | |
206 for _, r := range c.ranges { | |
207 buf.WriteRune(r.first) | |
208 buf.WriteRune(r.last) | |
209 } | |
210 for _, ct := range c.categories { | |
211 buf.WriteString(ct.cat) | |
212 if ct.negate { | |
213 buf.WriteByte(1) | |
214 } else { | |
215 buf.WriteByte(0) | |
216 } | |
217 } | |
218 | |
219 if c.sub != nil { | |
220 c.sub.mapHashFill(buf) | |
221 } | |
222 } | |
223 | |
224 // CharIn returns true if the rune is in our character set (either ranges or categories). | |
225 // It handles negations and subtracted sub-charsets. | |
226 func (c CharSet) CharIn(ch rune) bool { | |
227 val := false | |
228 // in s && !s.subtracted | |
229 | |
230 //check ranges | |
231 for _, r := range c.ranges { | |
232 if ch < r.first { | |
233 continue | |
234 } | |
235 if ch <= r.last { | |
236 val = true | |
237 break | |
238 } | |
239 } | |
240 | |
241 //check categories if we haven't already found a range | |
242 if !val && len(c.categories) > 0 { | |
243 for _, ct := range c.categories { | |
244 // special categories...then unicode | |
245 if ct.cat == spaceCategoryText { | |
246 if unicode.IsSpace(ch) { | |
247 // we found a space so we're done | |
248 // negate means this is a "bad" thing | |
249 val = !ct.negate | |
250 break | |
251 } else if ct.negate { | |
252 val = true | |
253 break | |
254 } | |
255 } else if ct.cat == wordCategoryText { | |
256 if IsWordChar(ch) { | |
257 val = !ct.negate | |
258 break | |
259 } else if ct.negate { | |
260 val = true | |
261 break | |
262 } | |
263 } else if unicode.Is(unicodeCategories[ct.cat], ch) { | |
264 // if we're in this unicode category then we're done | |
265 // if negate=true on this category then we "failed" our test | |
266 // otherwise we're good that we found it | |
267 val = !ct.negate | |
268 break | |
269 } else if ct.negate { | |
270 val = true | |
271 break | |
272 } | |
273 } | |
274 } | |
275 | |
276 // negate the whole char set | |
277 if c.negate { | |
278 val = !val | |
279 } | |
280 | |
281 // get subtracted recurse | |
282 if val && c.sub != nil { | |
283 val = !c.sub.CharIn(ch) | |
284 } | |
285 | |
286 //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val) | |
287 return val | |
288 } | |
289 | |
290 func (c category) String() string { | |
291 switch c.cat { | |
292 case spaceCategoryText: | |
293 if c.negate { | |
294 return "\\S" | |
295 } | |
296 return "\\s" | |
297 case wordCategoryText: | |
298 if c.negate { | |
299 return "\\W" | |
300 } | |
301 return "\\w" | |
302 } | |
303 if _, ok := unicodeCategories[c.cat]; ok { | |
304 | |
305 if c.negate { | |
306 return "\\P{" + c.cat + "}" | |
307 } | |
308 return "\\p{" + c.cat + "}" | |
309 } | |
310 return "Unknown category: " + c.cat | |
311 } | |
312 | |
313 // CharDescription Produces a human-readable description for a single character. | |
314 func CharDescription(ch rune) string { | |
315 /*if ch == '\\' { | |
316 return "\\\\" | |
317 } | |
318 | |
319 if ch > ' ' && ch <= '~' { | |
320 return string(ch) | |
321 } else if ch == '\n' { | |
322 return "\\n" | |
323 } else if ch == ' ' { | |
324 return "\\ " | |
325 }*/ | |
326 | |
327 b := &bytes.Buffer{} | |
328 escape(b, ch, false) //fmt.Sprintf("%U", ch) | |
329 return b.String() | |
330 } | |
331 | |
332 // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) | |
333 // RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic | |
334 // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C | |
335 // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER. | |
336 func IsWordChar(r rune) bool { | |
337 //"L", "Mn", "Nd", "Pc" | |
338 return unicode.In(r, | |
339 unicode.Categories["L"], unicode.Categories["Mn"], | |
340 unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C' | |
341 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' | |
342 } | |
343 | |
344 func IsECMAWordChar(r rune) bool { | |
345 return unicode.In(r, | |
346 unicode.Categories["L"], unicode.Categories["Mn"], | |
347 unicode.Categories["Nd"], unicode.Categories["Pc"]) | |
348 | |
349 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' | |
350 } | |
351 | |
352 // SingletonChar will return the char from the first range without validation. | |
353 // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input | |
354 func (c CharSet) SingletonChar() rune { | |
355 return c.ranges[0].first | |
356 } | |
357 | |
358 func (c CharSet) IsSingleton() bool { | |
359 return !c.negate && //negated is multiple chars | |
360 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars | |
361 c.sub == nil && // subtraction means we've got multiple chars | |
362 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char | |
363 } | |
364 | |
365 func (c CharSet) IsSingletonInverse() bool { | |
366 return c.negate && //same as above, but requires negated | |
367 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars | |
368 c.sub == nil && // subtraction means we've got multiple chars | |
369 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char | |
370 } | |
371 | |
372 func (c CharSet) IsMergeable() bool { | |
373 return !c.IsNegated() && !c.HasSubtraction() | |
374 } | |
375 | |
376 func (c CharSet) IsNegated() bool { | |
377 return c.negate | |
378 } | |
379 | |
380 func (c CharSet) HasSubtraction() bool { | |
381 return c.sub != nil | |
382 } | |
383 | |
384 func (c CharSet) IsEmpty() bool { | |
385 return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil | |
386 } | |
387 | |
388 func (c *CharSet) addDigit(ecma, negate bool, pattern string) { | |
389 if ecma { | |
390 if negate { | |
391 c.addRanges(NotECMADigitClass().ranges) | |
392 } else { | |
393 c.addRanges(ECMADigitClass().ranges) | |
394 } | |
395 } else { | |
396 c.addCategories(category{cat: "Nd", negate: negate}) | |
397 } | |
398 } | |
399 | |
400 func (c *CharSet) addChar(ch rune) { | |
401 c.addRange(ch, ch) | |
402 } | |
403 | |
404 func (c *CharSet) addSpace(ecma, negate bool) { | |
405 if ecma { | |
406 if negate { | |
407 c.addRanges(NotECMASpaceClass().ranges) | |
408 } else { | |
409 c.addRanges(ECMASpaceClass().ranges) | |
410 } | |
411 } else { | |
412 c.addCategories(category{cat: spaceCategoryText, negate: negate}) | |
413 } | |
414 } | |
415 | |
416 func (c *CharSet) addWord(ecma, negate bool) { | |
417 if ecma { | |
418 if negate { | |
419 c.addRanges(NotECMAWordClass().ranges) | |
420 } else { | |
421 c.addRanges(ECMAWordClass().ranges) | |
422 } | |
423 } else { | |
424 c.addCategories(category{cat: wordCategoryText, negate: negate}) | |
425 } | |
426 } | |
427 | |
428 // Add set ranges and categories into ours -- no deduping or anything | |
429 func (c *CharSet) addSet(set CharSet) { | |
430 if c.anything { | |
431 return | |
432 } | |
433 if set.anything { | |
434 c.makeAnything() | |
435 return | |
436 } | |
437 // just append here to prevent double-canon | |
438 c.ranges = append(c.ranges, set.ranges...) | |
439 c.addCategories(set.categories...) | |
440 c.canonicalize() | |
441 } | |
442 | |
443 func (c *CharSet) makeAnything() { | |
444 c.anything = true | |
445 c.categories = []category{} | |
446 c.ranges = AnyClass().ranges | |
447 } | |
448 | |
449 func (c *CharSet) addCategories(cats ...category) { | |
450 // don't add dupes and remove positive+negative | |
451 if c.anything { | |
452 // if we've had a previous positive+negative group then | |
453 // just return, we're as broad as we can get | |
454 return | |
455 } | |
456 | |
457 for _, ct := range cats { | |
458 found := false | |
459 for _, ct2 := range c.categories { | |
460 if ct.cat == ct2.cat { | |
461 if ct.negate != ct2.negate { | |
462 // oposite negations...this mean we just | |
463 // take us as anything and move on | |
464 c.makeAnything() | |
465 return | |
466 } | |
467 found = true | |
468 break | |
469 } | |
470 } | |
471 | |
472 if !found { | |
473 c.categories = append(c.categories, ct) | |
474 } | |
475 } | |
476 } | |
477 | |
478 // Merges new ranges to our own | |
479 func (c *CharSet) addRanges(ranges []singleRange) { | |
480 if c.anything { | |
481 return | |
482 } | |
483 c.ranges = append(c.ranges, ranges...) | |
484 c.canonicalize() | |
485 } | |
486 | |
487 // Merges everything but the new ranges into our own | |
488 func (c *CharSet) addNegativeRanges(ranges []singleRange) { | |
489 if c.anything { | |
490 return | |
491 } | |
492 | |
493 var hi rune | |
494 | |
495 // convert incoming ranges into opposites, assume they are in order | |
496 for _, r := range ranges { | |
497 if hi < r.first { | |
498 c.ranges = append(c.ranges, singleRange{hi, r.first - 1}) | |
499 } | |
500 hi = r.last + 1 | |
501 } | |
502 | |
503 if hi < utf8.MaxRune { | |
504 c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune}) | |
505 } | |
506 | |
507 c.canonicalize() | |
508 } | |
509 | |
510 func isValidUnicodeCat(catName string) bool { | |
511 _, ok := unicodeCategories[catName] | |
512 return ok | |
513 } | |
514 | |
515 func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) { | |
516 if !isValidUnicodeCat(categoryName) { | |
517 // unknown unicode category, script, or property "blah" | |
518 panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName)) | |
519 | |
520 } | |
521 | |
522 if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") { | |
523 // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match | |
524 c.addCategories( | |
525 category{cat: "Ll", negate: negate}, | |
526 category{cat: "Lu", negate: negate}, | |
527 category{cat: "Lt", negate: negate}) | |
528 } | |
529 c.addCategories(category{cat: categoryName, negate: negate}) | |
530 } | |
531 | |
532 func (c *CharSet) addSubtraction(sub *CharSet) { | |
533 c.sub = sub | |
534 } | |
535 | |
536 func (c *CharSet) addRange(chMin, chMax rune) { | |
537 c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax}) | |
538 c.canonicalize() | |
539 } | |
540 | |
541 func (c *CharSet) addNamedASCII(name string, negate bool) bool { | |
542 var rs []singleRange | |
543 | |
544 switch name { | |
545 case "alnum": | |
546 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}} | |
547 case "alpha": | |
548 rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}} | |
549 case "ascii": | |
550 rs = []singleRange{singleRange{0, 0x7f}} | |
551 case "blank": | |
552 rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}} | |
553 case "cntrl": | |
554 rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}} | |
555 case "digit": | |
556 c.addDigit(false, negate, "") | |
557 case "graph": | |
558 rs = []singleRange{singleRange{'!', '~'}} | |
559 case "lower": | |
560 rs = []singleRange{singleRange{'a', 'z'}} | |
561 case "print": | |
562 rs = []singleRange{singleRange{' ', '~'}} | |
563 case "punct": //[!-/:-@[-`{-~] | |
564 rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}} | |
565 case "space": | |
566 c.addSpace(true, negate) | |
567 case "upper": | |
568 rs = []singleRange{singleRange{'A', 'Z'}} | |
569 case "word": | |
570 c.addWord(true, negate) | |
571 case "xdigit": | |
572 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}} | |
573 default: | |
574 return false | |
575 } | |
576 | |
577 if len(rs) > 0 { | |
578 if negate { | |
579 c.addNegativeRanges(rs) | |
580 } else { | |
581 c.addRanges(rs) | |
582 } | |
583 } | |
584 | |
585 return true | |
586 } | |
587 | |
588 type singleRangeSorter []singleRange | |
589 | |
590 func (p singleRangeSorter) Len() int { return len(p) } | |
591 func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first } | |
592 func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] } | |
593 | |
594 // Logic to reduce a character class to a unique, sorted form. | |
595 func (c *CharSet) canonicalize() { | |
596 var i, j int | |
597 var last rune | |
598 | |
599 // | |
600 // Find and eliminate overlapping or abutting ranges | |
601 // | |
602 | |
603 if len(c.ranges) > 1 { | |
604 sort.Sort(singleRangeSorter(c.ranges)) | |
605 | |
606 done := false | |
607 | |
608 for i, j = 1, 0; ; i++ { | |
609 for last = c.ranges[j].last; ; i++ { | |
610 if i == len(c.ranges) || last == utf8.MaxRune { | |
611 done = true | |
612 break | |
613 } | |
614 | |
615 CurrentRange := c.ranges[i] | |
616 if CurrentRange.first > last+1 { | |
617 break | |
618 } | |
619 | |
620 if last < CurrentRange.last { | |
621 last = CurrentRange.last | |
622 } | |
623 } | |
624 | |
625 c.ranges[j] = singleRange{first: c.ranges[j].first, last: last} | |
626 | |
627 j++ | |
628 | |
629 if done { | |
630 break | |
631 } | |
632 | |
633 if j < i { | |
634 c.ranges[j] = c.ranges[i] | |
635 } | |
636 } | |
637 | |
638 c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...) | |
639 } | |
640 } | |
641 | |
642 // Adds to the class any lowercase versions of characters already | |
643 // in the class. Used for case-insensitivity. | |
644 func (c *CharSet) addLowercase() { | |
645 if c.anything { | |
646 return | |
647 } | |
648 toAdd := []singleRange{} | |
649 for i := 0; i < len(c.ranges); i++ { | |
650 r := c.ranges[i] | |
651 if r.first == r.last { | |
652 lower := unicode.ToLower(r.first) | |
653 c.ranges[i] = singleRange{first: lower, last: lower} | |
654 } else { | |
655 toAdd = append(toAdd, r) | |
656 } | |
657 } | |
658 | |
659 for _, r := range toAdd { | |
660 c.addLowercaseRange(r.first, r.last) | |
661 } | |
662 c.canonicalize() | |
663 } | |
664 | |
665 /************************************************************************** | |
666 Let U be the set of Unicode character values and let L be the lowercase | |
667 function, mapping from U to U. To perform case insensitive matching of | |
668 character sets, we need to be able to map an interval I in U, say | |
669 | |
670 I = [chMin, chMax] = { ch : chMin <= ch <= chMax } | |
671 | |
672 to a set A such that A contains L(I) and A is contained in the union of | |
673 I and L(I). | |
674 | |
675 The table below partitions U into intervals on which L is non-decreasing. | |
676 Thus, for any interval J = [a, b] contained in one of these intervals, | |
677 L(J) is contained in [L(a), L(b)]. | |
678 | |
679 It is also true that for any such J, [L(a), L(b)] is contained in the | |
680 union of J and L(J). This does not follow from L being non-decreasing on | |
681 these intervals. It follows from the nature of the L on each interval. | |
682 On each interval, L has one of the following forms: | |
683 | |
684 (1) L(ch) = constant (LowercaseSet) | |
685 (2) L(ch) = ch + offset (LowercaseAdd) | |
686 (3) L(ch) = ch | 1 (LowercaseBor) | |
687 (4) L(ch) = ch + (ch & 1) (LowercaseBad) | |
688 | |
689 It is easy to verify that for any of these forms [L(a), L(b)] is | |
690 contained in the union of [a, b] and L([a, b]). | |
691 ***************************************************************************/ | |
692 | |
693 const ( | |
694 LowercaseSet = 0 // Set to arg. | |
695 LowercaseAdd = 1 // Add arg. | |
696 LowercaseBor = 2 // Bitwise or with 1. | |
697 LowercaseBad = 3 // Bitwise and with 1 and add original. | |
698 ) | |
699 | |
700 type lcMap struct { | |
701 chMin, chMax rune | |
702 op, data int32 | |
703 } | |
704 | |
705 var lcTable = []lcMap{ | |
706 lcMap{'\u0041', '\u005A', LowercaseAdd, 32}, | |
707 lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32}, | |
708 lcMap{'\u0100', '\u012E', LowercaseBor, 0}, | |
709 lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069}, | |
710 lcMap{'\u0132', '\u0136', LowercaseBor, 0}, | |
711 lcMap{'\u0139', '\u0147', LowercaseBad, 0}, | |
712 lcMap{'\u014A', '\u0176', LowercaseBor, 0}, | |
713 lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF}, | |
714 lcMap{'\u0179', '\u017D', LowercaseBad, 0}, | |
715 lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253}, | |
716 lcMap{'\u0182', '\u0184', LowercaseBor, 0}, | |
717 lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254}, | |
718 lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188}, | |
719 lcMap{'\u0189', '\u018A', LowercaseAdd, 205}, | |
720 lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C}, | |
721 lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD}, | |
722 lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259}, | |
723 lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B}, | |
724 lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192}, | |
725 lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260}, | |
726 lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263}, | |
727 lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269}, | |
728 lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268}, | |
729 lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199}, | |
730 lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F}, | |
731 lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272}, | |
732 lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275}, | |
733 lcMap{'\u01A0', '\u01A4', LowercaseBor, 0}, | |
734 lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8}, | |
735 lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283}, | |
736 lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD}, | |
737 lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288}, | |
738 lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0}, | |
739 lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217}, | |
740 lcMap{'\u01B3', '\u01B5', LowercaseBad, 0}, | |
741 lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292}, | |
742 lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9}, | |
743 lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD}, | |
744 lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6}, | |
745 lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9}, | |
746 lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC}, | |
747 lcMap{'\u01CD', '\u01DB', LowercaseBad, 0}, | |
748 lcMap{'\u01DE', '\u01EE', LowercaseBor, 0}, | |
749 lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3}, | |
750 lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5}, | |
751 lcMap{'\u01FA', '\u0216', LowercaseBor, 0}, | |
752 lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC}, | |
753 lcMap{'\u0388', '\u038A', LowercaseAdd, 37}, | |
754 lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC}, | |
755 lcMap{'\u038E', '\u038F', LowercaseAdd, 63}, | |
756 lcMap{'\u0391', '\u03AB', LowercaseAdd, 32}, | |
757 lcMap{'\u03E2', '\u03EE', LowercaseBor, 0}, | |
758 lcMap{'\u0401', '\u040F', LowercaseAdd, 80}, | |
759 lcMap{'\u0410', '\u042F', LowercaseAdd, 32}, | |
760 lcMap{'\u0460', '\u0480', LowercaseBor, 0}, | |
761 lcMap{'\u0490', '\u04BE', LowercaseBor, 0}, | |
762 lcMap{'\u04C1', '\u04C3', LowercaseBad, 0}, | |
763 lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8}, | |
764 lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC}, | |
765 lcMap{'\u04D0', '\u04EA', LowercaseBor, 0}, | |
766 lcMap{'\u04EE', '\u04F4', LowercaseBor, 0}, | |
767 lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9}, | |
768 lcMap{'\u0531', '\u0556', LowercaseAdd, 48}, | |
769 lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48}, | |
770 lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0}, | |
771 lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8}, | |
772 lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8}, | |
773 lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8}, | |
774 lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8}, | |
775 lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8}, | |
776 lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51}, | |
777 lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53}, | |
778 lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55}, | |
779 lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57}, | |
780 lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8}, | |
781 lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8}, | |
782 lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8}, | |
783 lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8}, | |
784 lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8}, | |
785 lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74}, | |
786 lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3}, | |
787 lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86}, | |
788 lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3}, | |
789 lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8}, | |
790 lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100}, | |
791 lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8}, | |
792 lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112}, | |
793 lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5}, | |
794 lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128}, | |
795 lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126}, | |
796 lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3}, | |
797 lcMap{'\u2160', '\u216F', LowercaseAdd, 16}, | |
798 lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26}, | |
799 lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32}, | |
800 } | |
801 | |
802 func (c *CharSet) addLowercaseRange(chMin, chMax rune) { | |
803 var i, iMax, iMid int | |
804 var chMinT, chMaxT rune | |
805 var lc lcMap | |
806 | |
807 for i, iMax = 0, len(lcTable); i < iMax; { | |
808 iMid = (i + iMax) / 2 | |
809 if lcTable[iMid].chMax < chMin { | |
810 i = iMid + 1 | |
811 } else { | |
812 iMax = iMid | |
813 } | |
814 } | |
815 | |
816 for ; i < len(lcTable); i++ { | |
817 lc = lcTable[i] | |
818 if lc.chMin > chMax { | |
819 return | |
820 } | |
821 chMinT = lc.chMin | |
822 if chMinT < chMin { | |
823 chMinT = chMin | |
824 } | |
825 | |
826 chMaxT = lc.chMax | |
827 if chMaxT > chMax { | |
828 chMaxT = chMax | |
829 } | |
830 | |
831 switch lc.op { | |
832 case LowercaseSet: | |
833 chMinT = rune(lc.data) | |
834 chMaxT = rune(lc.data) | |
835 break | |
836 case LowercaseAdd: | |
837 chMinT += lc.data | |
838 chMaxT += lc.data | |
839 break | |
840 case LowercaseBor: | |
841 chMinT |= 1 | |
842 chMaxT |= 1 | |
843 break | |
844 case LowercaseBad: | |
845 chMinT += (chMinT & 1) | |
846 chMaxT += (chMaxT & 1) | |
847 break | |
848 } | |
849 | |
850 if chMinT < chMin || chMaxT > chMax { | |
851 c.addRange(chMinT, chMaxT) | |
852 } | |
853 } | |
854 } |