Mercurial > yakumo_izuru > aya
diff vendor/github.com/dlclark/regexp2/syntax/charclass.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author | yakumo.izuru |
---|---|
date | Sun, 23 Jul 2023 13:18:53 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vendor/github.com/dlclark/regexp2/syntax/charclass.go Sun Jul 23 13:18:53 2023 +0000 @@ -0,0 +1,854 @@ +package syntax + +import ( + "bytes" + "encoding/binary" + "fmt" + "sort" + "unicode" + "unicode/utf8" +) + +// CharSet combines start-end rune ranges and unicode categories representing a set of characters +type CharSet struct { + ranges []singleRange + categories []category + sub *CharSet //optional subtractor + negate bool + anything bool +} + +type category struct { + negate bool + cat string +} + +type singleRange struct { + first rune + last rune +} + +const ( + spaceCategoryText = " " + wordCategoryText = "W" +) + +var ( + ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00} + ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b} + ecmaDigit = []rune{0x0030, 0x003a} +) + +var ( + AnyClass = getCharSetFromOldString([]rune{0}, false) + ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false) + NoneClass = getCharSetFromOldString(nil, false) + ECMAWordClass = getCharSetFromOldString(ecmaWord, false) + NotECMAWordClass = getCharSetFromOldString(ecmaWord, true) + ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false) + NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true) + ECMADigitClass = getCharSetFromOldString(ecmaDigit, false) + NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true) + + WordClass = getCharSetFromCategoryString(false, false, wordCategoryText) + NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText) + SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText) + NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText) + DigitClass = getCharSetFromCategoryString(false, false, "Nd") + NotDigitClass = getCharSetFromCategoryString(false, true, "Nd") +) + +var unicodeCategories = func() map[string]*unicode.RangeTable { + retVal := make(map[string]*unicode.RangeTable) + for k, v := range unicode.Scripts { + retVal[k] = v + } + for k, v := range unicode.Categories { + retVal[k] = v + } + for k, v := range unicode.Properties { + retVal[k] = v + } + return retVal +}() + +func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet { + if negateCat && negateSet { + panic("BUG! You should only negate the set OR the category in a constant setup, but not both") + } + + c := CharSet{negate: negateSet} + + c.categories = make([]category, len(cats)) + for i, cat := range cats { + c.categories[i] = category{cat: cat, negate: negateCat} + } + return func() *CharSet { + //make a copy each time + local := c + //return that address + return &local + } +} + +func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet { + c := CharSet{} + if len(setText) > 0 { + fillFirst := false + l := len(setText) + if negate { + if setText[0] == 0 { + setText = setText[1:] + } else { + l++ + fillFirst = true + } + } + + if l%2 == 0 { + c.ranges = make([]singleRange, l/2) + } else { + c.ranges = make([]singleRange, l/2+1) + } + + first := true + if fillFirst { + c.ranges[0] = singleRange{first: 0} + first = false + } + + i := 0 + for _, r := range setText { + if first { + // lower bound in a new range + c.ranges[i] = singleRange{first: r} + first = false + } else { + c.ranges[i].last = r - 1 + i++ + first = true + } + } + if !first { + c.ranges[i].last = utf8.MaxRune + } + } + + return func() *CharSet { + local := c + return &local + } +} + +// Copy makes a deep copy to prevent accidental mutation of a set +func (c CharSet) Copy() CharSet { + ret := CharSet{ + anything: c.anything, + negate: c.negate, + } + + ret.ranges = append(ret.ranges, c.ranges...) + ret.categories = append(ret.categories, c.categories...) + + if c.sub != nil { + sub := c.sub.Copy() + ret.sub = &sub + } + + return ret +} + +// gets a human-readable description for a set string +func (c CharSet) String() string { + buf := &bytes.Buffer{} + buf.WriteRune('[') + + if c.IsNegated() { + buf.WriteRune('^') + } + + for _, r := range c.ranges { + + buf.WriteString(CharDescription(r.first)) + if r.first != r.last { + if r.last-r.first != 1 { + //groups that are 1 char apart skip the dash + buf.WriteRune('-') + } + buf.WriteString(CharDescription(r.last)) + } + } + + for _, c := range c.categories { + buf.WriteString(c.String()) + } + + if c.sub != nil { + buf.WriteRune('-') + buf.WriteString(c.sub.String()) + } + + buf.WriteRune(']') + + return buf.String() +} + +// mapHashFill converts a charset into a buffer for use in maps +func (c CharSet) mapHashFill(buf *bytes.Buffer) { + if c.negate { + buf.WriteByte(0) + } else { + buf.WriteByte(1) + } + + binary.Write(buf, binary.LittleEndian, len(c.ranges)) + binary.Write(buf, binary.LittleEndian, len(c.categories)) + for _, r := range c.ranges { + buf.WriteRune(r.first) + buf.WriteRune(r.last) + } + for _, ct := range c.categories { + buf.WriteString(ct.cat) + if ct.negate { + buf.WriteByte(1) + } else { + buf.WriteByte(0) + } + } + + if c.sub != nil { + c.sub.mapHashFill(buf) + } +} + +// CharIn returns true if the rune is in our character set (either ranges or categories). +// It handles negations and subtracted sub-charsets. +func (c CharSet) CharIn(ch rune) bool { + val := false + // in s && !s.subtracted + + //check ranges + for _, r := range c.ranges { + if ch < r.first { + continue + } + if ch <= r.last { + val = true + break + } + } + + //check categories if we haven't already found a range + if !val && len(c.categories) > 0 { + for _, ct := range c.categories { + // special categories...then unicode + if ct.cat == spaceCategoryText { + if unicode.IsSpace(ch) { + // we found a space so we're done + // negate means this is a "bad" thing + val = !ct.negate + break + } else if ct.negate { + val = true + break + } + } else if ct.cat == wordCategoryText { + if IsWordChar(ch) { + val = !ct.negate + break + } else if ct.negate { + val = true + break + } + } else if unicode.Is(unicodeCategories[ct.cat], ch) { + // if we're in this unicode category then we're done + // if negate=true on this category then we "failed" our test + // otherwise we're good that we found it + val = !ct.negate + break + } else if ct.negate { + val = true + break + } + } + } + + // negate the whole char set + if c.negate { + val = !val + } + + // get subtracted recurse + if val && c.sub != nil { + val = !c.sub.CharIn(ch) + } + + //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val) + return val +} + +func (c category) String() string { + switch c.cat { + case spaceCategoryText: + if c.negate { + return "\\S" + } + return "\\s" + case wordCategoryText: + if c.negate { + return "\\W" + } + return "\\w" + } + if _, ok := unicodeCategories[c.cat]; ok { + + if c.negate { + return "\\P{" + c.cat + "}" + } + return "\\p{" + c.cat + "}" + } + return "Unknown category: " + c.cat +} + +// CharDescription Produces a human-readable description for a single character. +func CharDescription(ch rune) string { + /*if ch == '\\' { + return "\\\\" + } + + if ch > ' ' && ch <= '~' { + return string(ch) + } else if ch == '\n' { + return "\\n" + } else if ch == ' ' { + return "\\ " + }*/ + + b := &bytes.Buffer{} + escape(b, ch, false) //fmt.Sprintf("%U", ch) + return b.String() +} + +// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) +// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic +// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C +// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER. +func IsWordChar(r rune) bool { + //"L", "Mn", "Nd", "Pc" + return unicode.In(r, + unicode.Categories["L"], unicode.Categories["Mn"], + unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C' + //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' +} + +func IsECMAWordChar(r rune) bool { + return unicode.In(r, + unicode.Categories["L"], unicode.Categories["Mn"], + unicode.Categories["Nd"], unicode.Categories["Pc"]) + + //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' +} + +// SingletonChar will return the char from the first range without validation. +// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input +func (c CharSet) SingletonChar() rune { + return c.ranges[0].first +} + +func (c CharSet) IsSingleton() bool { + return !c.negate && //negated is multiple chars + len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars + c.sub == nil && // subtraction means we've got multiple chars + c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char +} + +func (c CharSet) IsSingletonInverse() bool { + return c.negate && //same as above, but requires negated + len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars + c.sub == nil && // subtraction means we've got multiple chars + c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char +} + +func (c CharSet) IsMergeable() bool { + return !c.IsNegated() && !c.HasSubtraction() +} + +func (c CharSet) IsNegated() bool { + return c.negate +} + +func (c CharSet) HasSubtraction() bool { + return c.sub != nil +} + +func (c CharSet) IsEmpty() bool { + return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil +} + +func (c *CharSet) addDigit(ecma, negate bool, pattern string) { + if ecma { + if negate { + c.addRanges(NotECMADigitClass().ranges) + } else { + c.addRanges(ECMADigitClass().ranges) + } + } else { + c.addCategories(category{cat: "Nd", negate: negate}) + } +} + +func (c *CharSet) addChar(ch rune) { + c.addRange(ch, ch) +} + +func (c *CharSet) addSpace(ecma, negate bool) { + if ecma { + if negate { + c.addRanges(NotECMASpaceClass().ranges) + } else { + c.addRanges(ECMASpaceClass().ranges) + } + } else { + c.addCategories(category{cat: spaceCategoryText, negate: negate}) + } +} + +func (c *CharSet) addWord(ecma, negate bool) { + if ecma { + if negate { + c.addRanges(NotECMAWordClass().ranges) + } else { + c.addRanges(ECMAWordClass().ranges) + } + } else { + c.addCategories(category{cat: wordCategoryText, negate: negate}) + } +} + +// Add set ranges and categories into ours -- no deduping or anything +func (c *CharSet) addSet(set CharSet) { + if c.anything { + return + } + if set.anything { + c.makeAnything() + return + } + // just append here to prevent double-canon + c.ranges = append(c.ranges, set.ranges...) + c.addCategories(set.categories...) + c.canonicalize() +} + +func (c *CharSet) makeAnything() { + c.anything = true + c.categories = []category{} + c.ranges = AnyClass().ranges +} + +func (c *CharSet) addCategories(cats ...category) { + // don't add dupes and remove positive+negative + if c.anything { + // if we've had a previous positive+negative group then + // just return, we're as broad as we can get + return + } + + for _, ct := range cats { + found := false + for _, ct2 := range c.categories { + if ct.cat == ct2.cat { + if ct.negate != ct2.negate { + // oposite negations...this mean we just + // take us as anything and move on + c.makeAnything() + return + } + found = true + break + } + } + + if !found { + c.categories = append(c.categories, ct) + } + } +} + +// Merges new ranges to our own +func (c *CharSet) addRanges(ranges []singleRange) { + if c.anything { + return + } + c.ranges = append(c.ranges, ranges...) + c.canonicalize() +} + +// Merges everything but the new ranges into our own +func (c *CharSet) addNegativeRanges(ranges []singleRange) { + if c.anything { + return + } + + var hi rune + + // convert incoming ranges into opposites, assume they are in order + for _, r := range ranges { + if hi < r.first { + c.ranges = append(c.ranges, singleRange{hi, r.first - 1}) + } + hi = r.last + 1 + } + + if hi < utf8.MaxRune { + c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune}) + } + + c.canonicalize() +} + +func isValidUnicodeCat(catName string) bool { + _, ok := unicodeCategories[catName] + return ok +} + +func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) { + if !isValidUnicodeCat(categoryName) { + // unknown unicode category, script, or property "blah" + panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName)) + + } + + if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") { + // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match + c.addCategories( + category{cat: "Ll", negate: negate}, + category{cat: "Lu", negate: negate}, + category{cat: "Lt", negate: negate}) + } + c.addCategories(category{cat: categoryName, negate: negate}) +} + +func (c *CharSet) addSubtraction(sub *CharSet) { + c.sub = sub +} + +func (c *CharSet) addRange(chMin, chMax rune) { + c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax}) + c.canonicalize() +} + +func (c *CharSet) addNamedASCII(name string, negate bool) bool { + var rs []singleRange + + switch name { + case "alnum": + rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}} + case "alpha": + rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}} + case "ascii": + rs = []singleRange{singleRange{0, 0x7f}} + case "blank": + rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}} + case "cntrl": + rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}} + case "digit": + c.addDigit(false, negate, "") + case "graph": + rs = []singleRange{singleRange{'!', '~'}} + case "lower": + rs = []singleRange{singleRange{'a', 'z'}} + case "print": + rs = []singleRange{singleRange{' ', '~'}} + case "punct": //[!-/:-@[-`{-~] + rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}} + case "space": + c.addSpace(true, negate) + case "upper": + rs = []singleRange{singleRange{'A', 'Z'}} + case "word": + c.addWord(true, negate) + case "xdigit": + rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}} + default: + return false + } + + if len(rs) > 0 { + if negate { + c.addNegativeRanges(rs) + } else { + c.addRanges(rs) + } + } + + return true +} + +type singleRangeSorter []singleRange + +func (p singleRangeSorter) Len() int { return len(p) } +func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first } +func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +// Logic to reduce a character class to a unique, sorted form. +func (c *CharSet) canonicalize() { + var i, j int + var last rune + + // + // Find and eliminate overlapping or abutting ranges + // + + if len(c.ranges) > 1 { + sort.Sort(singleRangeSorter(c.ranges)) + + done := false + + for i, j = 1, 0; ; i++ { + for last = c.ranges[j].last; ; i++ { + if i == len(c.ranges) || last == utf8.MaxRune { + done = true + break + } + + CurrentRange := c.ranges[i] + if CurrentRange.first > last+1 { + break + } + + if last < CurrentRange.last { + last = CurrentRange.last + } + } + + c.ranges[j] = singleRange{first: c.ranges[j].first, last: last} + + j++ + + if done { + break + } + + if j < i { + c.ranges[j] = c.ranges[i] + } + } + + c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...) + } +} + +// Adds to the class any lowercase versions of characters already +// in the class. Used for case-insensitivity. +func (c *CharSet) addLowercase() { + if c.anything { + return + } + toAdd := []singleRange{} + for i := 0; i < len(c.ranges); i++ { + r := c.ranges[i] + if r.first == r.last { + lower := unicode.ToLower(r.first) + c.ranges[i] = singleRange{first: lower, last: lower} + } else { + toAdd = append(toAdd, r) + } + } + + for _, r := range toAdd { + c.addLowercaseRange(r.first, r.last) + } + c.canonicalize() +} + +/************************************************************************** + Let U be the set of Unicode character values and let L be the lowercase + function, mapping from U to U. To perform case insensitive matching of + character sets, we need to be able to map an interval I in U, say + + I = [chMin, chMax] = { ch : chMin <= ch <= chMax } + + to a set A such that A contains L(I) and A is contained in the union of + I and L(I). + + The table below partitions U into intervals on which L is non-decreasing. + Thus, for any interval J = [a, b] contained in one of these intervals, + L(J) is contained in [L(a), L(b)]. + + It is also true that for any such J, [L(a), L(b)] is contained in the + union of J and L(J). This does not follow from L being non-decreasing on + these intervals. It follows from the nature of the L on each interval. + On each interval, L has one of the following forms: + + (1) L(ch) = constant (LowercaseSet) + (2) L(ch) = ch + offset (LowercaseAdd) + (3) L(ch) = ch | 1 (LowercaseBor) + (4) L(ch) = ch + (ch & 1) (LowercaseBad) + + It is easy to verify that for any of these forms [L(a), L(b)] is + contained in the union of [a, b] and L([a, b]). +***************************************************************************/ + +const ( + LowercaseSet = 0 // Set to arg. + LowercaseAdd = 1 // Add arg. + LowercaseBor = 2 // Bitwise or with 1. + LowercaseBad = 3 // Bitwise and with 1 and add original. +) + +type lcMap struct { + chMin, chMax rune + op, data int32 +} + +var lcTable = []lcMap{ + lcMap{'\u0041', '\u005A', LowercaseAdd, 32}, + lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32}, + lcMap{'\u0100', '\u012E', LowercaseBor, 0}, + lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069}, + lcMap{'\u0132', '\u0136', LowercaseBor, 0}, + lcMap{'\u0139', '\u0147', LowercaseBad, 0}, + lcMap{'\u014A', '\u0176', LowercaseBor, 0}, + lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF}, + lcMap{'\u0179', '\u017D', LowercaseBad, 0}, + lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253}, + lcMap{'\u0182', '\u0184', LowercaseBor, 0}, + lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254}, + lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188}, + lcMap{'\u0189', '\u018A', LowercaseAdd, 205}, + lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C}, + lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD}, + lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259}, + lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B}, + lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192}, + lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260}, + lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263}, + lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269}, + lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268}, + lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199}, + lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F}, + lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272}, + lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275}, + lcMap{'\u01A0', '\u01A4', LowercaseBor, 0}, + lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8}, + lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283}, + lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD}, + lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288}, + lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0}, + lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217}, + lcMap{'\u01B3', '\u01B5', LowercaseBad, 0}, + lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292}, + lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9}, + lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD}, + lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6}, + lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9}, + lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC}, + lcMap{'\u01CD', '\u01DB', LowercaseBad, 0}, + lcMap{'\u01DE', '\u01EE', LowercaseBor, 0}, + lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3}, + lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5}, + lcMap{'\u01FA', '\u0216', LowercaseBor, 0}, + lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC}, + lcMap{'\u0388', '\u038A', LowercaseAdd, 37}, + lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC}, + lcMap{'\u038E', '\u038F', LowercaseAdd, 63}, + lcMap{'\u0391', '\u03AB', LowercaseAdd, 32}, + lcMap{'\u03E2', '\u03EE', LowercaseBor, 0}, + lcMap{'\u0401', '\u040F', LowercaseAdd, 80}, + lcMap{'\u0410', '\u042F', LowercaseAdd, 32}, + lcMap{'\u0460', '\u0480', LowercaseBor, 0}, + lcMap{'\u0490', '\u04BE', LowercaseBor, 0}, + lcMap{'\u04C1', '\u04C3', LowercaseBad, 0}, + lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8}, + lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC}, + lcMap{'\u04D0', '\u04EA', LowercaseBor, 0}, + lcMap{'\u04EE', '\u04F4', LowercaseBor, 0}, + lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9}, + lcMap{'\u0531', '\u0556', LowercaseAdd, 48}, + lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48}, + lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0}, + lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8}, + lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8}, + lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8}, + lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8}, + lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8}, + lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51}, + lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53}, + lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55}, + lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57}, + lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8}, + lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8}, + lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8}, + lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8}, + lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8}, + lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74}, + lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3}, + lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86}, + lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3}, + lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8}, + lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100}, + lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8}, + lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112}, + lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5}, + lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128}, + lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126}, + lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3}, + lcMap{'\u2160', '\u216F', LowercaseAdd, 16}, + lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26}, + lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32}, +} + +func (c *CharSet) addLowercaseRange(chMin, chMax rune) { + var i, iMax, iMid int + var chMinT, chMaxT rune + var lc lcMap + + for i, iMax = 0, len(lcTable); i < iMax; { + iMid = (i + iMax) / 2 + if lcTable[iMid].chMax < chMin { + i = iMid + 1 + } else { + iMax = iMid + } + } + + for ; i < len(lcTable); i++ { + lc = lcTable[i] + if lc.chMin > chMax { + return + } + chMinT = lc.chMin + if chMinT < chMin { + chMinT = chMin + } + + chMaxT = lc.chMax + if chMaxT > chMax { + chMaxT = chMax + } + + switch lc.op { + case LowercaseSet: + chMinT = rune(lc.data) + chMaxT = rune(lc.data) + break + case LowercaseAdd: + chMinT += lc.data + chMaxT += lc.data + break + case LowercaseBor: + chMinT |= 1 + chMaxT |= 1 + break + case LowercaseBad: + chMinT += (chMinT & 1) + chMaxT += (chMaxT & 1) + break + } + + if chMinT < chMin || chMaxT > chMax { + c.addRange(chMinT, chMaxT) + } + } +}