66
|
1 package syntax
|
|
2
|
|
3 import (
|
|
4 "bytes"
|
|
5 "encoding/binary"
|
|
6 "fmt"
|
|
7 "sort"
|
|
8 "unicode"
|
|
9 "unicode/utf8"
|
|
10 )
|
|
11
|
|
12 // CharSet combines start-end rune ranges and unicode categories representing a set of characters
|
|
13 type CharSet struct {
|
|
14 ranges []singleRange
|
|
15 categories []category
|
|
16 sub *CharSet //optional subtractor
|
|
17 negate bool
|
|
18 anything bool
|
|
19 }
|
|
20
|
|
21 type category struct {
|
|
22 negate bool
|
|
23 cat string
|
|
24 }
|
|
25
|
|
26 type singleRange struct {
|
|
27 first rune
|
|
28 last rune
|
|
29 }
|
|
30
|
|
31 const (
|
|
32 spaceCategoryText = " "
|
|
33 wordCategoryText = "W"
|
|
34 )
|
|
35
|
|
36 var (
|
|
37 ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
|
|
38 ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
|
|
39 ecmaDigit = []rune{0x0030, 0x003a}
|
|
40 )
|
|
41
|
|
42 var (
|
|
43 AnyClass = getCharSetFromOldString([]rune{0}, false)
|
|
44 ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
|
|
45 NoneClass = getCharSetFromOldString(nil, false)
|
|
46 ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
|
|
47 NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
|
|
48 ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
|
|
49 NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
|
|
50 ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
|
|
51 NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
|
|
52
|
|
53 WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
|
|
54 NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
|
|
55 SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
|
|
56 NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
|
|
57 DigitClass = getCharSetFromCategoryString(false, false, "Nd")
|
|
58 NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
|
|
59 )
|
|
60
|
|
61 var unicodeCategories = func() map[string]*unicode.RangeTable {
|
|
62 retVal := make(map[string]*unicode.RangeTable)
|
|
63 for k, v := range unicode.Scripts {
|
|
64 retVal[k] = v
|
|
65 }
|
|
66 for k, v := range unicode.Categories {
|
|
67 retVal[k] = v
|
|
68 }
|
|
69 for k, v := range unicode.Properties {
|
|
70 retVal[k] = v
|
|
71 }
|
|
72 return retVal
|
|
73 }()
|
|
74
|
|
75 func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
|
|
76 if negateCat && negateSet {
|
|
77 panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
|
|
78 }
|
|
79
|
|
80 c := CharSet{negate: negateSet}
|
|
81
|
|
82 c.categories = make([]category, len(cats))
|
|
83 for i, cat := range cats {
|
|
84 c.categories[i] = category{cat: cat, negate: negateCat}
|
|
85 }
|
|
86 return func() *CharSet {
|
|
87 //make a copy each time
|
|
88 local := c
|
|
89 //return that address
|
|
90 return &local
|
|
91 }
|
|
92 }
|
|
93
|
|
94 func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
|
|
95 c := CharSet{}
|
|
96 if len(setText) > 0 {
|
|
97 fillFirst := false
|
|
98 l := len(setText)
|
|
99 if negate {
|
|
100 if setText[0] == 0 {
|
|
101 setText = setText[1:]
|
|
102 } else {
|
|
103 l++
|
|
104 fillFirst = true
|
|
105 }
|
|
106 }
|
|
107
|
|
108 if l%2 == 0 {
|
|
109 c.ranges = make([]singleRange, l/2)
|
|
110 } else {
|
|
111 c.ranges = make([]singleRange, l/2+1)
|
|
112 }
|
|
113
|
|
114 first := true
|
|
115 if fillFirst {
|
|
116 c.ranges[0] = singleRange{first: 0}
|
|
117 first = false
|
|
118 }
|
|
119
|
|
120 i := 0
|
|
121 for _, r := range setText {
|
|
122 if first {
|
|
123 // lower bound in a new range
|
|
124 c.ranges[i] = singleRange{first: r}
|
|
125 first = false
|
|
126 } else {
|
|
127 c.ranges[i].last = r - 1
|
|
128 i++
|
|
129 first = true
|
|
130 }
|
|
131 }
|
|
132 if !first {
|
|
133 c.ranges[i].last = utf8.MaxRune
|
|
134 }
|
|
135 }
|
|
136
|
|
137 return func() *CharSet {
|
|
138 local := c
|
|
139 return &local
|
|
140 }
|
|
141 }
|
|
142
|
|
143 // Copy makes a deep copy to prevent accidental mutation of a set
|
|
144 func (c CharSet) Copy() CharSet {
|
|
145 ret := CharSet{
|
|
146 anything: c.anything,
|
|
147 negate: c.negate,
|
|
148 }
|
|
149
|
|
150 ret.ranges = append(ret.ranges, c.ranges...)
|
|
151 ret.categories = append(ret.categories, c.categories...)
|
|
152
|
|
153 if c.sub != nil {
|
|
154 sub := c.sub.Copy()
|
|
155 ret.sub = &sub
|
|
156 }
|
|
157
|
|
158 return ret
|
|
159 }
|
|
160
|
|
161 // gets a human-readable description for a set string
|
|
162 func (c CharSet) String() string {
|
|
163 buf := &bytes.Buffer{}
|
|
164 buf.WriteRune('[')
|
|
165
|
|
166 if c.IsNegated() {
|
|
167 buf.WriteRune('^')
|
|
168 }
|
|
169
|
|
170 for _, r := range c.ranges {
|
|
171
|
|
172 buf.WriteString(CharDescription(r.first))
|
|
173 if r.first != r.last {
|
|
174 if r.last-r.first != 1 {
|
|
175 //groups that are 1 char apart skip the dash
|
|
176 buf.WriteRune('-')
|
|
177 }
|
|
178 buf.WriteString(CharDescription(r.last))
|
|
179 }
|
|
180 }
|
|
181
|
|
182 for _, c := range c.categories {
|
|
183 buf.WriteString(c.String())
|
|
184 }
|
|
185
|
|
186 if c.sub != nil {
|
|
187 buf.WriteRune('-')
|
|
188 buf.WriteString(c.sub.String())
|
|
189 }
|
|
190
|
|
191 buf.WriteRune(']')
|
|
192
|
|
193 return buf.String()
|
|
194 }
|
|
195
|
|
196 // mapHashFill converts a charset into a buffer for use in maps
|
|
197 func (c CharSet) mapHashFill(buf *bytes.Buffer) {
|
|
198 if c.negate {
|
|
199 buf.WriteByte(0)
|
|
200 } else {
|
|
201 buf.WriteByte(1)
|
|
202 }
|
|
203
|
|
204 binary.Write(buf, binary.LittleEndian, len(c.ranges))
|
|
205 binary.Write(buf, binary.LittleEndian, len(c.categories))
|
|
206 for _, r := range c.ranges {
|
|
207 buf.WriteRune(r.first)
|
|
208 buf.WriteRune(r.last)
|
|
209 }
|
|
210 for _, ct := range c.categories {
|
|
211 buf.WriteString(ct.cat)
|
|
212 if ct.negate {
|
|
213 buf.WriteByte(1)
|
|
214 } else {
|
|
215 buf.WriteByte(0)
|
|
216 }
|
|
217 }
|
|
218
|
|
219 if c.sub != nil {
|
|
220 c.sub.mapHashFill(buf)
|
|
221 }
|
|
222 }
|
|
223
|
|
224 // CharIn returns true if the rune is in our character set (either ranges or categories).
|
|
225 // It handles negations and subtracted sub-charsets.
|
|
226 func (c CharSet) CharIn(ch rune) bool {
|
|
227 val := false
|
|
228 // in s && !s.subtracted
|
|
229
|
|
230 //check ranges
|
|
231 for _, r := range c.ranges {
|
|
232 if ch < r.first {
|
|
233 continue
|
|
234 }
|
|
235 if ch <= r.last {
|
|
236 val = true
|
|
237 break
|
|
238 }
|
|
239 }
|
|
240
|
|
241 //check categories if we haven't already found a range
|
|
242 if !val && len(c.categories) > 0 {
|
|
243 for _, ct := range c.categories {
|
|
244 // special categories...then unicode
|
|
245 if ct.cat == spaceCategoryText {
|
|
246 if unicode.IsSpace(ch) {
|
|
247 // we found a space so we're done
|
|
248 // negate means this is a "bad" thing
|
|
249 val = !ct.negate
|
|
250 break
|
|
251 } else if ct.negate {
|
|
252 val = true
|
|
253 break
|
|
254 }
|
|
255 } else if ct.cat == wordCategoryText {
|
|
256 if IsWordChar(ch) {
|
|
257 val = !ct.negate
|
|
258 break
|
|
259 } else if ct.negate {
|
|
260 val = true
|
|
261 break
|
|
262 }
|
|
263 } else if unicode.Is(unicodeCategories[ct.cat], ch) {
|
|
264 // if we're in this unicode category then we're done
|
|
265 // if negate=true on this category then we "failed" our test
|
|
266 // otherwise we're good that we found it
|
|
267 val = !ct.negate
|
|
268 break
|
|
269 } else if ct.negate {
|
|
270 val = true
|
|
271 break
|
|
272 }
|
|
273 }
|
|
274 }
|
|
275
|
|
276 // negate the whole char set
|
|
277 if c.negate {
|
|
278 val = !val
|
|
279 }
|
|
280
|
|
281 // get subtracted recurse
|
|
282 if val && c.sub != nil {
|
|
283 val = !c.sub.CharIn(ch)
|
|
284 }
|
|
285
|
|
286 //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
|
|
287 return val
|
|
288 }
|
|
289
|
|
290 func (c category) String() string {
|
|
291 switch c.cat {
|
|
292 case spaceCategoryText:
|
|
293 if c.negate {
|
|
294 return "\\S"
|
|
295 }
|
|
296 return "\\s"
|
|
297 case wordCategoryText:
|
|
298 if c.negate {
|
|
299 return "\\W"
|
|
300 }
|
|
301 return "\\w"
|
|
302 }
|
|
303 if _, ok := unicodeCategories[c.cat]; ok {
|
|
304
|
|
305 if c.negate {
|
|
306 return "\\P{" + c.cat + "}"
|
|
307 }
|
|
308 return "\\p{" + c.cat + "}"
|
|
309 }
|
|
310 return "Unknown category: " + c.cat
|
|
311 }
|
|
312
|
|
313 // CharDescription Produces a human-readable description for a single character.
|
|
314 func CharDescription(ch rune) string {
|
|
315 /*if ch == '\\' {
|
|
316 return "\\\\"
|
|
317 }
|
|
318
|
|
319 if ch > ' ' && ch <= '~' {
|
|
320 return string(ch)
|
|
321 } else if ch == '\n' {
|
|
322 return "\\n"
|
|
323 } else if ch == ' ' {
|
|
324 return "\\ "
|
|
325 }*/
|
|
326
|
|
327 b := &bytes.Buffer{}
|
|
328 escape(b, ch, false) //fmt.Sprintf("%U", ch)
|
|
329 return b.String()
|
|
330 }
|
|
331
|
|
332 // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
|
|
333 // RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
|
|
334 // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
|
|
335 // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
|
|
336 func IsWordChar(r rune) bool {
|
|
337 //"L", "Mn", "Nd", "Pc"
|
|
338 return unicode.In(r,
|
|
339 unicode.Categories["L"], unicode.Categories["Mn"],
|
|
340 unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
|
|
341 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
|
|
342 }
|
|
343
|
|
344 func IsECMAWordChar(r rune) bool {
|
|
345 return unicode.In(r,
|
|
346 unicode.Categories["L"], unicode.Categories["Mn"],
|
|
347 unicode.Categories["Nd"], unicode.Categories["Pc"])
|
|
348
|
|
349 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
|
|
350 }
|
|
351
|
|
352 // SingletonChar will return the char from the first range without validation.
|
|
353 // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
|
|
354 func (c CharSet) SingletonChar() rune {
|
|
355 return c.ranges[0].first
|
|
356 }
|
|
357
|
|
358 func (c CharSet) IsSingleton() bool {
|
|
359 return !c.negate && //negated is multiple chars
|
|
360 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
|
361 c.sub == nil && // subtraction means we've got multiple chars
|
|
362 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
|
363 }
|
|
364
|
|
365 func (c CharSet) IsSingletonInverse() bool {
|
|
366 return c.negate && //same as above, but requires negated
|
|
367 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
|
|
368 c.sub == nil && // subtraction means we've got multiple chars
|
|
369 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
|
|
370 }
|
|
371
|
|
372 func (c CharSet) IsMergeable() bool {
|
|
373 return !c.IsNegated() && !c.HasSubtraction()
|
|
374 }
|
|
375
|
|
376 func (c CharSet) IsNegated() bool {
|
|
377 return c.negate
|
|
378 }
|
|
379
|
|
380 func (c CharSet) HasSubtraction() bool {
|
|
381 return c.sub != nil
|
|
382 }
|
|
383
|
|
384 func (c CharSet) IsEmpty() bool {
|
|
385 return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
|
|
386 }
|
|
387
|
|
388 func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
|
|
389 if ecma {
|
|
390 if negate {
|
|
391 c.addRanges(NotECMADigitClass().ranges)
|
|
392 } else {
|
|
393 c.addRanges(ECMADigitClass().ranges)
|
|
394 }
|
|
395 } else {
|
|
396 c.addCategories(category{cat: "Nd", negate: negate})
|
|
397 }
|
|
398 }
|
|
399
|
|
400 func (c *CharSet) addChar(ch rune) {
|
|
401 c.addRange(ch, ch)
|
|
402 }
|
|
403
|
|
404 func (c *CharSet) addSpace(ecma, negate bool) {
|
|
405 if ecma {
|
|
406 if negate {
|
|
407 c.addRanges(NotECMASpaceClass().ranges)
|
|
408 } else {
|
|
409 c.addRanges(ECMASpaceClass().ranges)
|
|
410 }
|
|
411 } else {
|
|
412 c.addCategories(category{cat: spaceCategoryText, negate: negate})
|
|
413 }
|
|
414 }
|
|
415
|
|
416 func (c *CharSet) addWord(ecma, negate bool) {
|
|
417 if ecma {
|
|
418 if negate {
|
|
419 c.addRanges(NotECMAWordClass().ranges)
|
|
420 } else {
|
|
421 c.addRanges(ECMAWordClass().ranges)
|
|
422 }
|
|
423 } else {
|
|
424 c.addCategories(category{cat: wordCategoryText, negate: negate})
|
|
425 }
|
|
426 }
|
|
427
|
|
428 // Add set ranges and categories into ours -- no deduping or anything
|
|
429 func (c *CharSet) addSet(set CharSet) {
|
|
430 if c.anything {
|
|
431 return
|
|
432 }
|
|
433 if set.anything {
|
|
434 c.makeAnything()
|
|
435 return
|
|
436 }
|
|
437 // just append here to prevent double-canon
|
|
438 c.ranges = append(c.ranges, set.ranges...)
|
|
439 c.addCategories(set.categories...)
|
|
440 c.canonicalize()
|
|
441 }
|
|
442
|
|
443 func (c *CharSet) makeAnything() {
|
|
444 c.anything = true
|
|
445 c.categories = []category{}
|
|
446 c.ranges = AnyClass().ranges
|
|
447 }
|
|
448
|
|
449 func (c *CharSet) addCategories(cats ...category) {
|
|
450 // don't add dupes and remove positive+negative
|
|
451 if c.anything {
|
|
452 // if we've had a previous positive+negative group then
|
|
453 // just return, we're as broad as we can get
|
|
454 return
|
|
455 }
|
|
456
|
|
457 for _, ct := range cats {
|
|
458 found := false
|
|
459 for _, ct2 := range c.categories {
|
|
460 if ct.cat == ct2.cat {
|
|
461 if ct.negate != ct2.negate {
|
|
462 // oposite negations...this mean we just
|
|
463 // take us as anything and move on
|
|
464 c.makeAnything()
|
|
465 return
|
|
466 }
|
|
467 found = true
|
|
468 break
|
|
469 }
|
|
470 }
|
|
471
|
|
472 if !found {
|
|
473 c.categories = append(c.categories, ct)
|
|
474 }
|
|
475 }
|
|
476 }
|
|
477
|
|
478 // Merges new ranges to our own
|
|
479 func (c *CharSet) addRanges(ranges []singleRange) {
|
|
480 if c.anything {
|
|
481 return
|
|
482 }
|
|
483 c.ranges = append(c.ranges, ranges...)
|
|
484 c.canonicalize()
|
|
485 }
|
|
486
|
|
487 // Merges everything but the new ranges into our own
|
|
488 func (c *CharSet) addNegativeRanges(ranges []singleRange) {
|
|
489 if c.anything {
|
|
490 return
|
|
491 }
|
|
492
|
|
493 var hi rune
|
|
494
|
|
495 // convert incoming ranges into opposites, assume they are in order
|
|
496 for _, r := range ranges {
|
|
497 if hi < r.first {
|
|
498 c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
|
|
499 }
|
|
500 hi = r.last + 1
|
|
501 }
|
|
502
|
|
503 if hi < utf8.MaxRune {
|
|
504 c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
|
|
505 }
|
|
506
|
|
507 c.canonicalize()
|
|
508 }
|
|
509
|
|
510 func isValidUnicodeCat(catName string) bool {
|
|
511 _, ok := unicodeCategories[catName]
|
|
512 return ok
|
|
513 }
|
|
514
|
|
515 func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
|
|
516 if !isValidUnicodeCat(categoryName) {
|
|
517 // unknown unicode category, script, or property "blah"
|
|
518 panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
|
|
519
|
|
520 }
|
|
521
|
|
522 if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
|
|
523 // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
|
|
524 c.addCategories(
|
|
525 category{cat: "Ll", negate: negate},
|
|
526 category{cat: "Lu", negate: negate},
|
|
527 category{cat: "Lt", negate: negate})
|
|
528 }
|
|
529 c.addCategories(category{cat: categoryName, negate: negate})
|
|
530 }
|
|
531
|
|
532 func (c *CharSet) addSubtraction(sub *CharSet) {
|
|
533 c.sub = sub
|
|
534 }
|
|
535
|
|
536 func (c *CharSet) addRange(chMin, chMax rune) {
|
|
537 c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
|
|
538 c.canonicalize()
|
|
539 }
|
|
540
|
|
541 func (c *CharSet) addNamedASCII(name string, negate bool) bool {
|
|
542 var rs []singleRange
|
|
543
|
|
544 switch name {
|
|
545 case "alnum":
|
|
546 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
|
|
547 case "alpha":
|
|
548 rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
|
|
549 case "ascii":
|
|
550 rs = []singleRange{singleRange{0, 0x7f}}
|
|
551 case "blank":
|
|
552 rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
|
|
553 case "cntrl":
|
|
554 rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
|
|
555 case "digit":
|
|
556 c.addDigit(false, negate, "")
|
|
557 case "graph":
|
|
558 rs = []singleRange{singleRange{'!', '~'}}
|
|
559 case "lower":
|
|
560 rs = []singleRange{singleRange{'a', 'z'}}
|
|
561 case "print":
|
|
562 rs = []singleRange{singleRange{' ', '~'}}
|
|
563 case "punct": //[!-/:-@[-`{-~]
|
|
564 rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
|
|
565 case "space":
|
|
566 c.addSpace(true, negate)
|
|
567 case "upper":
|
|
568 rs = []singleRange{singleRange{'A', 'Z'}}
|
|
569 case "word":
|
|
570 c.addWord(true, negate)
|
|
571 case "xdigit":
|
|
572 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
|
|
573 default:
|
|
574 return false
|
|
575 }
|
|
576
|
|
577 if len(rs) > 0 {
|
|
578 if negate {
|
|
579 c.addNegativeRanges(rs)
|
|
580 } else {
|
|
581 c.addRanges(rs)
|
|
582 }
|
|
583 }
|
|
584
|
|
585 return true
|
|
586 }
|
|
587
|
|
588 type singleRangeSorter []singleRange
|
|
589
|
|
590 func (p singleRangeSorter) Len() int { return len(p) }
|
|
591 func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
|
|
592 func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
|
|
593
|
|
594 // Logic to reduce a character class to a unique, sorted form.
|
|
595 func (c *CharSet) canonicalize() {
|
|
596 var i, j int
|
|
597 var last rune
|
|
598
|
|
599 //
|
|
600 // Find and eliminate overlapping or abutting ranges
|
|
601 //
|
|
602
|
|
603 if len(c.ranges) > 1 {
|
|
604 sort.Sort(singleRangeSorter(c.ranges))
|
|
605
|
|
606 done := false
|
|
607
|
|
608 for i, j = 1, 0; ; i++ {
|
|
609 for last = c.ranges[j].last; ; i++ {
|
|
610 if i == len(c.ranges) || last == utf8.MaxRune {
|
|
611 done = true
|
|
612 break
|
|
613 }
|
|
614
|
|
615 CurrentRange := c.ranges[i]
|
|
616 if CurrentRange.first > last+1 {
|
|
617 break
|
|
618 }
|
|
619
|
|
620 if last < CurrentRange.last {
|
|
621 last = CurrentRange.last
|
|
622 }
|
|
623 }
|
|
624
|
|
625 c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
|
|
626
|
|
627 j++
|
|
628
|
|
629 if done {
|
|
630 break
|
|
631 }
|
|
632
|
|
633 if j < i {
|
|
634 c.ranges[j] = c.ranges[i]
|
|
635 }
|
|
636 }
|
|
637
|
|
638 c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
|
|
639 }
|
|
640 }
|
|
641
|
|
642 // Adds to the class any lowercase versions of characters already
|
|
643 // in the class. Used for case-insensitivity.
|
|
644 func (c *CharSet) addLowercase() {
|
|
645 if c.anything {
|
|
646 return
|
|
647 }
|
|
648 toAdd := []singleRange{}
|
|
649 for i := 0; i < len(c.ranges); i++ {
|
|
650 r := c.ranges[i]
|
|
651 if r.first == r.last {
|
|
652 lower := unicode.ToLower(r.first)
|
|
653 c.ranges[i] = singleRange{first: lower, last: lower}
|
|
654 } else {
|
|
655 toAdd = append(toAdd, r)
|
|
656 }
|
|
657 }
|
|
658
|
|
659 for _, r := range toAdd {
|
|
660 c.addLowercaseRange(r.first, r.last)
|
|
661 }
|
|
662 c.canonicalize()
|
|
663 }
|
|
664
|
|
665 /**************************************************************************
|
|
666 Let U be the set of Unicode character values and let L be the lowercase
|
|
667 function, mapping from U to U. To perform case insensitive matching of
|
|
668 character sets, we need to be able to map an interval I in U, say
|
|
669
|
|
670 I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
|
|
671
|
|
672 to a set A such that A contains L(I) and A is contained in the union of
|
|
673 I and L(I).
|
|
674
|
|
675 The table below partitions U into intervals on which L is non-decreasing.
|
|
676 Thus, for any interval J = [a, b] contained in one of these intervals,
|
|
677 L(J) is contained in [L(a), L(b)].
|
|
678
|
|
679 It is also true that for any such J, [L(a), L(b)] is contained in the
|
|
680 union of J and L(J). This does not follow from L being non-decreasing on
|
|
681 these intervals. It follows from the nature of the L on each interval.
|
|
682 On each interval, L has one of the following forms:
|
|
683
|
|
684 (1) L(ch) = constant (LowercaseSet)
|
|
685 (2) L(ch) = ch + offset (LowercaseAdd)
|
|
686 (3) L(ch) = ch | 1 (LowercaseBor)
|
|
687 (4) L(ch) = ch + (ch & 1) (LowercaseBad)
|
|
688
|
|
689 It is easy to verify that for any of these forms [L(a), L(b)] is
|
|
690 contained in the union of [a, b] and L([a, b]).
|
|
691 ***************************************************************************/
|
|
692
|
|
693 const (
|
|
694 LowercaseSet = 0 // Set to arg.
|
|
695 LowercaseAdd = 1 // Add arg.
|
|
696 LowercaseBor = 2 // Bitwise or with 1.
|
|
697 LowercaseBad = 3 // Bitwise and with 1 and add original.
|
|
698 )
|
|
699
|
|
700 type lcMap struct {
|
|
701 chMin, chMax rune
|
|
702 op, data int32
|
|
703 }
|
|
704
|
|
705 var lcTable = []lcMap{
|
|
706 lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
|
|
707 lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
|
|
708 lcMap{'\u0100', '\u012E', LowercaseBor, 0},
|
|
709 lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
|
|
710 lcMap{'\u0132', '\u0136', LowercaseBor, 0},
|
|
711 lcMap{'\u0139', '\u0147', LowercaseBad, 0},
|
|
712 lcMap{'\u014A', '\u0176', LowercaseBor, 0},
|
|
713 lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
|
|
714 lcMap{'\u0179', '\u017D', LowercaseBad, 0},
|
|
715 lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
|
|
716 lcMap{'\u0182', '\u0184', LowercaseBor, 0},
|
|
717 lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
|
|
718 lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
|
|
719 lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
|
|
720 lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
|
|
721 lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
|
|
722 lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
|
|
723 lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
|
|
724 lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
|
|
725 lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
|
|
726 lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
|
|
727 lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
|
|
728 lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
|
|
729 lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
|
|
730 lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
|
|
731 lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
|
|
732 lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
|
|
733 lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
|
|
734 lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
|
|
735 lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
|
|
736 lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
|
|
737 lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
|
|
738 lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
|
|
739 lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
|
|
740 lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
|
|
741 lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
|
|
742 lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
|
|
743 lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
|
|
744 lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
|
|
745 lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
|
|
746 lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
|
|
747 lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
|
|
748 lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
|
|
749 lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
|
|
750 lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
|
|
751 lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
|
|
752 lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
|
|
753 lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
|
|
754 lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
|
|
755 lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
|
|
756 lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
|
|
757 lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
|
|
758 lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
|
|
759 lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
|
|
760 lcMap{'\u0460', '\u0480', LowercaseBor, 0},
|
|
761 lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
|
|
762 lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
|
|
763 lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
|
|
764 lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
|
|
765 lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
|
|
766 lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
|
|
767 lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
|
|
768 lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
|
|
769 lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
|
|
770 lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
|
|
771 lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
|
|
772 lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
|
|
773 lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
|
|
774 lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
|
|
775 lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
|
|
776 lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
|
|
777 lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
|
|
778 lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
|
|
779 lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
|
|
780 lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
|
|
781 lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
|
|
782 lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
|
|
783 lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
|
|
784 lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
|
|
785 lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
|
|
786 lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
|
|
787 lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
|
|
788 lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
|
|
789 lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
|
|
790 lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
|
|
791 lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
|
|
792 lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
|
|
793 lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
|
|
794 lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
|
|
795 lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
|
|
796 lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
|
|
797 lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
|
|
798 lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
|
|
799 lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
|
|
800 }
|
|
801
|
|
802 func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
|
|
803 var i, iMax, iMid int
|
|
804 var chMinT, chMaxT rune
|
|
805 var lc lcMap
|
|
806
|
|
807 for i, iMax = 0, len(lcTable); i < iMax; {
|
|
808 iMid = (i + iMax) / 2
|
|
809 if lcTable[iMid].chMax < chMin {
|
|
810 i = iMid + 1
|
|
811 } else {
|
|
812 iMax = iMid
|
|
813 }
|
|
814 }
|
|
815
|
|
816 for ; i < len(lcTable); i++ {
|
|
817 lc = lcTable[i]
|
|
818 if lc.chMin > chMax {
|
|
819 return
|
|
820 }
|
|
821 chMinT = lc.chMin
|
|
822 if chMinT < chMin {
|
|
823 chMinT = chMin
|
|
824 }
|
|
825
|
|
826 chMaxT = lc.chMax
|
|
827 if chMaxT > chMax {
|
|
828 chMaxT = chMax
|
|
829 }
|
|
830
|
|
831 switch lc.op {
|
|
832 case LowercaseSet:
|
|
833 chMinT = rune(lc.data)
|
|
834 chMaxT = rune(lc.data)
|
|
835 break
|
|
836 case LowercaseAdd:
|
|
837 chMinT += lc.data
|
|
838 chMaxT += lc.data
|
|
839 break
|
|
840 case LowercaseBor:
|
|
841 chMinT |= 1
|
|
842 chMaxT |= 1
|
|
843 break
|
|
844 case LowercaseBad:
|
|
845 chMinT += (chMinT & 1)
|
|
846 chMaxT += (chMaxT & 1)
|
|
847 break
|
|
848 }
|
|
849
|
|
850 if chMinT < chMin || chMaxT > chMax {
|
|
851 c.addRange(chMinT, chMaxT)
|
|
852 }
|
|
853 }
|
|
854 }
|