Mercurial > yakumo_izuru > aya
comparison vendor/github.com/dlclark/regexp2/syntax/prefix.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author | yakumo.izuru |
---|---|
date | Sun, 23 Jul 2023 13:18:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
65:6d985efa0f7a | 66:787b5ee0289d |
---|---|
1 package syntax | |
2 | |
3 import ( | |
4 "bytes" | |
5 "fmt" | |
6 "strconv" | |
7 "unicode" | |
8 "unicode/utf8" | |
9 ) | |
10 | |
11 type Prefix struct { | |
12 PrefixStr []rune | |
13 PrefixSet CharSet | |
14 CaseInsensitive bool | |
15 } | |
16 | |
17 // It takes a RegexTree and computes the set of chars that can start it. | |
18 func getFirstCharsPrefix(tree *RegexTree) *Prefix { | |
19 s := regexFcd{ | |
20 fcStack: make([]regexFc, 32), | |
21 intStack: make([]int, 32), | |
22 } | |
23 fc := s.regexFCFromRegexTree(tree) | |
24 | |
25 if fc == nil || fc.nullable || fc.cc.IsEmpty() { | |
26 return nil | |
27 } | |
28 fcSet := fc.getFirstChars() | |
29 return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive} | |
30 } | |
31 | |
32 type regexFcd struct { | |
33 intStack []int | |
34 intDepth int | |
35 fcStack []regexFc | |
36 fcDepth int | |
37 skipAllChildren bool // don't process any more children at the current level | |
38 skipchild bool // don't process the current child. | |
39 failed bool | |
40 } | |
41 | |
42 /* | |
43 * The main FC computation. It does a shortcutted depth-first walk | |
44 * through the tree and calls CalculateFC to emits code before | |
45 * and after each child of an interior node, and at each leaf. | |
46 */ | |
47 func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc { | |
48 curNode := tree.root | |
49 curChild := 0 | |
50 | |
51 for { | |
52 if len(curNode.children) == 0 { | |
53 // This is a leaf node | |
54 s.calculateFC(curNode.t, curNode, 0) | |
55 } else if curChild < len(curNode.children) && !s.skipAllChildren { | |
56 // This is an interior node, and we have more children to analyze | |
57 s.calculateFC(curNode.t|beforeChild, curNode, curChild) | |
58 | |
59 if !s.skipchild { | |
60 curNode = curNode.children[curChild] | |
61 // this stack is how we get a depth first walk of the tree. | |
62 s.pushInt(curChild) | |
63 curChild = 0 | |
64 } else { | |
65 curChild++ | |
66 s.skipchild = false | |
67 } | |
68 continue | |
69 } | |
70 | |
71 // This is an interior node where we've finished analyzing all the children, or | |
72 // the end of a leaf node. | |
73 s.skipAllChildren = false | |
74 | |
75 if s.intIsEmpty() { | |
76 break | |
77 } | |
78 | |
79 curChild = s.popInt() | |
80 curNode = curNode.next | |
81 | |
82 s.calculateFC(curNode.t|afterChild, curNode, curChild) | |
83 if s.failed { | |
84 return nil | |
85 } | |
86 | |
87 curChild++ | |
88 } | |
89 | |
90 if s.fcIsEmpty() { | |
91 return nil | |
92 } | |
93 | |
94 return s.popFC() | |
95 } | |
96 | |
97 // To avoid recursion, we use a simple integer stack. | |
98 // This is the push. | |
99 func (s *regexFcd) pushInt(I int) { | |
100 if s.intDepth >= len(s.intStack) { | |
101 expanded := make([]int, s.intDepth*2) | |
102 copy(expanded, s.intStack) | |
103 s.intStack = expanded | |
104 } | |
105 | |
106 s.intStack[s.intDepth] = I | |
107 s.intDepth++ | |
108 } | |
109 | |
110 // True if the stack is empty. | |
111 func (s *regexFcd) intIsEmpty() bool { | |
112 return s.intDepth == 0 | |
113 } | |
114 | |
115 // This is the pop. | |
116 func (s *regexFcd) popInt() int { | |
117 s.intDepth-- | |
118 return s.intStack[s.intDepth] | |
119 } | |
120 | |
121 // We also use a stack of RegexFC objects. | |
122 // This is the push. | |
123 func (s *regexFcd) pushFC(fc regexFc) { | |
124 if s.fcDepth >= len(s.fcStack) { | |
125 expanded := make([]regexFc, s.fcDepth*2) | |
126 copy(expanded, s.fcStack) | |
127 s.fcStack = expanded | |
128 } | |
129 | |
130 s.fcStack[s.fcDepth] = fc | |
131 s.fcDepth++ | |
132 } | |
133 | |
134 // True if the stack is empty. | |
135 func (s *regexFcd) fcIsEmpty() bool { | |
136 return s.fcDepth == 0 | |
137 } | |
138 | |
139 // This is the pop. | |
140 func (s *regexFcd) popFC() *regexFc { | |
141 s.fcDepth-- | |
142 return &s.fcStack[s.fcDepth] | |
143 } | |
144 | |
145 // This is the top. | |
146 func (s *regexFcd) topFC() *regexFc { | |
147 return &s.fcStack[s.fcDepth-1] | |
148 } | |
149 | |
150 // Called in Beforechild to prevent further processing of the current child | |
151 func (s *regexFcd) skipChild() { | |
152 s.skipchild = true | |
153 } | |
154 | |
155 // FC computation and shortcut cases for each node type | |
156 func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) { | |
157 //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description()) | |
158 ci := false | |
159 rtl := false | |
160 | |
161 if nt <= ntRef { | |
162 if (node.options & IgnoreCase) != 0 { | |
163 ci = true | |
164 } | |
165 if (node.options & RightToLeft) != 0 { | |
166 rtl = true | |
167 } | |
168 } | |
169 | |
170 switch nt { | |
171 case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild: | |
172 break | |
173 | |
174 case ntTestgroup | beforeChild: | |
175 if CurIndex == 0 { | |
176 s.skipChild() | |
177 } | |
178 break | |
179 | |
180 case ntEmpty: | |
181 s.pushFC(regexFc{nullable: true}) | |
182 break | |
183 | |
184 case ntConcatenate | afterChild: | |
185 if CurIndex != 0 { | |
186 child := s.popFC() | |
187 cumul := s.topFC() | |
188 | |
189 s.failed = !cumul.addFC(*child, true) | |
190 } | |
191 | |
192 fc := s.topFC() | |
193 if !fc.nullable { | |
194 s.skipAllChildren = true | |
195 } | |
196 break | |
197 | |
198 case ntTestgroup | afterChild: | |
199 if CurIndex > 1 { | |
200 child := s.popFC() | |
201 cumul := s.topFC() | |
202 | |
203 s.failed = !cumul.addFC(*child, false) | |
204 } | |
205 break | |
206 | |
207 case ntAlternate | afterChild, ntTestref | afterChild: | |
208 if CurIndex != 0 { | |
209 child := s.popFC() | |
210 cumul := s.topFC() | |
211 | |
212 s.failed = !cumul.addFC(*child, false) | |
213 } | |
214 break | |
215 | |
216 case ntLoop | afterChild, ntLazyloop | afterChild: | |
217 if node.m == 0 { | |
218 fc := s.topFC() | |
219 fc.nullable = true | |
220 } | |
221 break | |
222 | |
223 case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild: | |
224 break | |
225 | |
226 case ntRequire | beforeChild, ntPrevent | beforeChild: | |
227 s.skipChild() | |
228 s.pushFC(regexFc{nullable: true}) | |
229 break | |
230 | |
231 case ntRequire | afterChild, ntPrevent | afterChild: | |
232 break | |
233 | |
234 case ntOne, ntNotone: | |
235 s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci)) | |
236 break | |
237 | |
238 case ntOneloop, ntOnelazy: | |
239 s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci)) | |
240 break | |
241 | |
242 case ntNotoneloop, ntNotonelazy: | |
243 s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci)) | |
244 break | |
245 | |
246 case ntMulti: | |
247 if len(node.str) == 0 { | |
248 s.pushFC(regexFc{nullable: true}) | |
249 } else if !rtl { | |
250 s.pushFC(newRegexFc(node.str[0], false, false, ci)) | |
251 } else { | |
252 s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci)) | |
253 } | |
254 break | |
255 | |
256 case ntSet: | |
257 s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci}) | |
258 break | |
259 | |
260 case ntSetloop, ntSetlazy: | |
261 s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci}) | |
262 break | |
263 | |
264 case ntRef: | |
265 s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false}) | |
266 break | |
267 | |
268 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: | |
269 s.pushFC(regexFc{nullable: true}) | |
270 break | |
271 | |
272 default: | |
273 panic(fmt.Sprintf("unexpected op code: %v", nt)) | |
274 } | |
275 } | |
276 | |
277 type regexFc struct { | |
278 cc CharSet | |
279 nullable bool | |
280 caseInsensitive bool | |
281 } | |
282 | |
283 func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc { | |
284 r := regexFc{ | |
285 caseInsensitive: caseInsensitive, | |
286 nullable: nullable, | |
287 } | |
288 if not { | |
289 if ch > 0 { | |
290 r.cc.addRange('\x00', ch-1) | |
291 } | |
292 if ch < 0xFFFF { | |
293 r.cc.addRange(ch+1, utf8.MaxRune) | |
294 } | |
295 } else { | |
296 r.cc.addRange(ch, ch) | |
297 } | |
298 return r | |
299 } | |
300 | |
301 func (r *regexFc) getFirstChars() CharSet { | |
302 if r.caseInsensitive { | |
303 r.cc.addLowercase() | |
304 } | |
305 | |
306 return r.cc | |
307 } | |
308 | |
309 func (r *regexFc) addFC(fc regexFc, concatenate bool) bool { | |
310 if !r.cc.IsMergeable() || !fc.cc.IsMergeable() { | |
311 return false | |
312 } | |
313 | |
314 if concatenate { | |
315 if !r.nullable { | |
316 return true | |
317 } | |
318 | |
319 if !fc.nullable { | |
320 r.nullable = false | |
321 } | |
322 } else { | |
323 if fc.nullable { | |
324 r.nullable = true | |
325 } | |
326 } | |
327 | |
328 r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive | |
329 r.cc.addSet(fc.cc) | |
330 | |
331 return true | |
332 } | |
333 | |
334 // This is a related computation: it takes a RegexTree and computes the | |
335 // leading substring if it sees one. It's quite trivial and gives up easily. | |
336 func getPrefix(tree *RegexTree) *Prefix { | |
337 var concatNode *regexNode | |
338 nextChild := 0 | |
339 | |
340 curNode := tree.root | |
341 | |
342 for { | |
343 switch curNode.t { | |
344 case ntConcatenate: | |
345 if len(curNode.children) > 0 { | |
346 concatNode = curNode | |
347 nextChild = 0 | |
348 } | |
349 | |
350 case ntGreedy, ntCapture: | |
351 curNode = curNode.children[0] | |
352 concatNode = nil | |
353 continue | |
354 | |
355 case ntOneloop, ntOnelazy: | |
356 if curNode.m > 0 { | |
357 return &Prefix{ | |
358 PrefixStr: repeat(curNode.ch, curNode.m), | |
359 CaseInsensitive: (curNode.options & IgnoreCase) != 0, | |
360 } | |
361 } | |
362 return nil | |
363 | |
364 case ntOne: | |
365 return &Prefix{ | |
366 PrefixStr: []rune{curNode.ch}, | |
367 CaseInsensitive: (curNode.options & IgnoreCase) != 0, | |
368 } | |
369 | |
370 case ntMulti: | |
371 return &Prefix{ | |
372 PrefixStr: curNode.str, | |
373 CaseInsensitive: (curNode.options & IgnoreCase) != 0, | |
374 } | |
375 | |
376 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart, | |
377 ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent: | |
378 | |
379 default: | |
380 return nil | |
381 } | |
382 | |
383 if concatNode == nil || nextChild >= len(concatNode.children) { | |
384 return nil | |
385 } | |
386 | |
387 curNode = concatNode.children[nextChild] | |
388 nextChild++ | |
389 } | |
390 } | |
391 | |
392 // repeat the rune r, c times... up to the max of MaxPrefixSize | |
393 func repeat(r rune, c int) []rune { | |
394 if c > MaxPrefixSize { | |
395 c = MaxPrefixSize | |
396 } | |
397 | |
398 ret := make([]rune, c) | |
399 | |
400 // binary growth using copy for speed | |
401 ret[0] = r | |
402 bp := 1 | |
403 for bp < len(ret) { | |
404 copy(ret[bp:], ret[:bp]) | |
405 bp *= 2 | |
406 } | |
407 | |
408 return ret | |
409 } | |
410 | |
411 // BmPrefix precomputes the Boyer-Moore | |
412 // tables for fast string scanning. These tables allow | |
413 // you to scan for the first occurrence of a string within | |
414 // a large body of text without examining every character. | |
415 // The performance of the heuristic depends on the actual | |
416 // string and the text being searched, but usually, the longer | |
417 // the string that is being searched for, the fewer characters | |
418 // need to be examined. | |
419 type BmPrefix struct { | |
420 positive []int | |
421 negativeASCII []int | |
422 negativeUnicode [][]int | |
423 pattern []rune | |
424 lowASCII rune | |
425 highASCII rune | |
426 rightToLeft bool | |
427 caseInsensitive bool | |
428 } | |
429 | |
430 func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix { | |
431 | |
432 b := &BmPrefix{ | |
433 rightToLeft: rightToLeft, | |
434 caseInsensitive: caseInsensitive, | |
435 pattern: pattern, | |
436 } | |
437 | |
438 if caseInsensitive { | |
439 for i := 0; i < len(b.pattern); i++ { | |
440 // We do the ToLower character by character for consistency. With surrogate chars, doing | |
441 // a ToLower on the entire string could actually change the surrogate pair. This is more correct | |
442 // linguistically, but since Regex doesn't support surrogates, it's more important to be | |
443 // consistent. | |
444 | |
445 b.pattern[i] = unicode.ToLower(b.pattern[i]) | |
446 } | |
447 } | |
448 | |
449 var beforefirst, last, bump int | |
450 var scan, match int | |
451 | |
452 if !rightToLeft { | |
453 beforefirst = -1 | |
454 last = len(b.pattern) - 1 | |
455 bump = 1 | |
456 } else { | |
457 beforefirst = len(b.pattern) | |
458 last = 0 | |
459 bump = -1 | |
460 } | |
461 | |
462 // PART I - the good-suffix shift table | |
463 // | |
464 // compute the positive requirement: | |
465 // if char "i" is the first one from the right that doesn't match, | |
466 // then we know the matcher can advance by _positive[i]. | |
467 // | |
468 // This algorithm is a simplified variant of the standard | |
469 // Boyer-Moore good suffix calculation. | |
470 | |
471 b.positive = make([]int, len(b.pattern)) | |
472 | |
473 examine := last | |
474 ch := b.pattern[examine] | |
475 b.positive[examine] = bump | |
476 examine -= bump | |
477 | |
478 Outerloop: | |
479 for { | |
480 // find an internal char (examine) that matches the tail | |
481 | |
482 for { | |
483 if examine == beforefirst { | |
484 break Outerloop | |
485 } | |
486 if b.pattern[examine] == ch { | |
487 break | |
488 } | |
489 examine -= bump | |
490 } | |
491 | |
492 match = last | |
493 scan = examine | |
494 | |
495 // find the length of the match | |
496 for { | |
497 if scan == beforefirst || b.pattern[match] != b.pattern[scan] { | |
498 // at the end of the match, note the difference in _positive | |
499 // this is not the length of the match, but the distance from the internal match | |
500 // to the tail suffix. | |
501 if b.positive[match] == 0 { | |
502 b.positive[match] = match - scan | |
503 } | |
504 | |
505 // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); | |
506 | |
507 break | |
508 } | |
509 | |
510 scan -= bump | |
511 match -= bump | |
512 } | |
513 | |
514 examine -= bump | |
515 } | |
516 | |
517 match = last - bump | |
518 | |
519 // scan for the chars for which there are no shifts that yield a different candidate | |
520 | |
521 // The inside of the if statement used to say | |
522 // "_positive[match] = last - beforefirst;" | |
523 // This is slightly less aggressive in how much we skip, but at worst it | |
524 // should mean a little more work rather than skipping a potential match. | |
525 for match != beforefirst { | |
526 if b.positive[match] == 0 { | |
527 b.positive[match] = bump | |
528 } | |
529 | |
530 match -= bump | |
531 } | |
532 | |
533 // PART II - the bad-character shift table | |
534 // | |
535 // compute the negative requirement: | |
536 // if char "ch" is the reject character when testing position "i", | |
537 // we can slide up by _negative[ch]; | |
538 // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) | |
539 // | |
540 // the lookup table is divided into ASCII and Unicode portions; | |
541 // only those parts of the Unicode 16-bit code set that actually | |
542 // appear in the string are in the table. (Maximum size with | |
543 // Unicode is 65K; ASCII only case is 512 bytes.) | |
544 | |
545 b.negativeASCII = make([]int, 128) | |
546 | |
547 for i := 0; i < len(b.negativeASCII); i++ { | |
548 b.negativeASCII[i] = last - beforefirst | |
549 } | |
550 | |
551 b.lowASCII = 127 | |
552 b.highASCII = 0 | |
553 | |
554 for examine = last; examine != beforefirst; examine -= bump { | |
555 ch = b.pattern[examine] | |
556 | |
557 switch { | |
558 case ch < 128: | |
559 if b.lowASCII > ch { | |
560 b.lowASCII = ch | |
561 } | |
562 | |
563 if b.highASCII < ch { | |
564 b.highASCII = ch | |
565 } | |
566 | |
567 if b.negativeASCII[ch] == last-beforefirst { | |
568 b.negativeASCII[ch] = last - examine | |
569 } | |
570 case ch <= 0xffff: | |
571 i, j := ch>>8, ch&0xFF | |
572 | |
573 if b.negativeUnicode == nil { | |
574 b.negativeUnicode = make([][]int, 256) | |
575 } | |
576 | |
577 if b.negativeUnicode[i] == nil { | |
578 newarray := make([]int, 256) | |
579 | |
580 for k := 0; k < len(newarray); k++ { | |
581 newarray[k] = last - beforefirst | |
582 } | |
583 | |
584 if i == 0 { | |
585 copy(newarray, b.negativeASCII) | |
586 //TODO: this line needed? | |
587 b.negativeASCII = newarray | |
588 } | |
589 | |
590 b.negativeUnicode[i] = newarray | |
591 } | |
592 | |
593 if b.negativeUnicode[i][j] == last-beforefirst { | |
594 b.negativeUnicode[i][j] = last - examine | |
595 } | |
596 default: | |
597 // we can't do the filter because this algo doesn't support | |
598 // unicode chars >0xffff | |
599 return nil | |
600 } | |
601 } | |
602 | |
603 return b | |
604 } | |
605 | |
606 func (b *BmPrefix) String() string { | |
607 return string(b.pattern) | |
608 } | |
609 | |
610 // Dump returns the contents of the filter as a human readable string | |
611 func (b *BmPrefix) Dump(indent string) string { | |
612 buf := &bytes.Buffer{} | |
613 | |
614 fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent) | |
615 for i := 0; i < len(b.positive); i++ { | |
616 buf.WriteString(strconv.Itoa(b.positive[i])) | |
617 buf.WriteRune(' ') | |
618 } | |
619 buf.WriteRune('\n') | |
620 | |
621 if b.negativeASCII != nil { | |
622 buf.WriteString(indent) | |
623 buf.WriteString("Negative table\n") | |
624 for i := 0; i < len(b.negativeASCII); i++ { | |
625 if b.negativeASCII[i] != len(b.pattern) { | |
626 fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i])) | |
627 } | |
628 } | |
629 } | |
630 | |
631 return buf.String() | |
632 } | |
633 | |
634 // Scan uses the Boyer-Moore algorithm to find the first occurrence | |
635 // of the specified string within text, beginning at index, and | |
636 // constrained within beglimit and endlimit. | |
637 // | |
638 // The direction and case-sensitivity of the match is determined | |
639 // by the arguments to the RegexBoyerMoore constructor. | |
640 func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int { | |
641 var ( | |
642 defadv, test, test2 int | |
643 match, startmatch, endmatch int | |
644 bump, advance int | |
645 chTest rune | |
646 unicodeLookup []int | |
647 ) | |
648 | |
649 if !b.rightToLeft { | |
650 defadv = len(b.pattern) | |
651 startmatch = len(b.pattern) - 1 | |
652 endmatch = 0 | |
653 test = index + defadv - 1 | |
654 bump = 1 | |
655 } else { | |
656 defadv = -len(b.pattern) | |
657 startmatch = 0 | |
658 endmatch = -defadv - 1 | |
659 test = index + defadv | |
660 bump = -1 | |
661 } | |
662 | |
663 chMatch := b.pattern[startmatch] | |
664 | |
665 for { | |
666 if test >= endlimit || test < beglimit { | |
667 return -1 | |
668 } | |
669 | |
670 chTest = text[test] | |
671 | |
672 if b.caseInsensitive { | |
673 chTest = unicode.ToLower(chTest) | |
674 } | |
675 | |
676 if chTest != chMatch { | |
677 if chTest < 128 { | |
678 advance = b.negativeASCII[chTest] | |
679 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 { | |
680 unicodeLookup = b.negativeUnicode[chTest>>8] | |
681 if len(unicodeLookup) > 0 { | |
682 advance = unicodeLookup[chTest&0xFF] | |
683 } else { | |
684 advance = defadv | |
685 } | |
686 } else { | |
687 advance = defadv | |
688 } | |
689 | |
690 test += advance | |
691 } else { // if (chTest == chMatch) | |
692 test2 = test | |
693 match = startmatch | |
694 | |
695 for { | |
696 if match == endmatch { | |
697 if b.rightToLeft { | |
698 return test2 + 1 | |
699 } else { | |
700 return test2 | |
701 } | |
702 } | |
703 | |
704 match -= bump | |
705 test2 -= bump | |
706 | |
707 chTest = text[test2] | |
708 | |
709 if b.caseInsensitive { | |
710 chTest = unicode.ToLower(chTest) | |
711 } | |
712 | |
713 if chTest != b.pattern[match] { | |
714 advance = b.positive[match] | |
715 if (chTest & 0xFF80) == 0 { | |
716 test2 = (match - startmatch) + b.negativeASCII[chTest] | |
717 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 { | |
718 unicodeLookup = b.negativeUnicode[chTest>>8] | |
719 if len(unicodeLookup) > 0 { | |
720 test2 = (match - startmatch) + unicodeLookup[chTest&0xFF] | |
721 } else { | |
722 test += advance | |
723 break | |
724 } | |
725 } else { | |
726 test += advance | |
727 break | |
728 } | |
729 | |
730 if b.rightToLeft { | |
731 if test2 < advance { | |
732 advance = test2 | |
733 } | |
734 } else if test2 > advance { | |
735 advance = test2 | |
736 } | |
737 | |
738 test += advance | |
739 break | |
740 } | |
741 } | |
742 } | |
743 } | |
744 } | |
745 | |
746 // When a regex is anchored, we can do a quick IsMatch test instead of a Scan | |
747 func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool { | |
748 if !b.rightToLeft { | |
749 if index < beglimit || endlimit-index < len(b.pattern) { | |
750 return false | |
751 } | |
752 | |
753 return b.matchPattern(text, index) | |
754 } else { | |
755 if index > endlimit || index-beglimit < len(b.pattern) { | |
756 return false | |
757 } | |
758 | |
759 return b.matchPattern(text, index-len(b.pattern)) | |
760 } | |
761 } | |
762 | |
763 func (b *BmPrefix) matchPattern(text []rune, index int) bool { | |
764 if len(text)-index < len(b.pattern) { | |
765 return false | |
766 } | |
767 | |
768 if b.caseInsensitive { | |
769 for i := 0; i < len(b.pattern); i++ { | |
770 //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); | |
771 if unicode.ToLower(text[index+i]) != b.pattern[i] { | |
772 return false | |
773 } | |
774 } | |
775 return true | |
776 } else { | |
777 for i := 0; i < len(b.pattern); i++ { | |
778 if text[index+i] != b.pattern[i] { | |
779 return false | |
780 } | |
781 } | |
782 return true | |
783 } | |
784 } | |
785 | |
786 type AnchorLoc int16 | |
787 | |
788 // where the regex can be pegged | |
789 const ( | |
790 AnchorBeginning AnchorLoc = 0x0001 | |
791 AnchorBol = 0x0002 | |
792 AnchorStart = 0x0004 | |
793 AnchorEol = 0x0008 | |
794 AnchorEndZ = 0x0010 | |
795 AnchorEnd = 0x0020 | |
796 AnchorBoundary = 0x0040 | |
797 AnchorECMABoundary = 0x0080 | |
798 ) | |
799 | |
800 func getAnchors(tree *RegexTree) AnchorLoc { | |
801 | |
802 var concatNode *regexNode | |
803 nextChild, result := 0, AnchorLoc(0) | |
804 | |
805 curNode := tree.root | |
806 | |
807 for { | |
808 switch curNode.t { | |
809 case ntConcatenate: | |
810 if len(curNode.children) > 0 { | |
811 concatNode = curNode | |
812 nextChild = 0 | |
813 } | |
814 | |
815 case ntGreedy, ntCapture: | |
816 curNode = curNode.children[0] | |
817 concatNode = nil | |
818 continue | |
819 | |
820 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, | |
821 ntStart, ntEndZ, ntEnd: | |
822 return result | anchorFromType(curNode.t) | |
823 | |
824 case ntEmpty, ntRequire, ntPrevent: | |
825 | |
826 default: | |
827 return result | |
828 } | |
829 | |
830 if concatNode == nil || nextChild >= len(concatNode.children) { | |
831 return result | |
832 } | |
833 | |
834 curNode = concatNode.children[nextChild] | |
835 nextChild++ | |
836 } | |
837 } | |
838 | |
839 func anchorFromType(t nodeType) AnchorLoc { | |
840 switch t { | |
841 case ntBol: | |
842 return AnchorBol | |
843 case ntEol: | |
844 return AnchorEol | |
845 case ntBoundary: | |
846 return AnchorBoundary | |
847 case ntECMABoundary: | |
848 return AnchorECMABoundary | |
849 case ntBeginning: | |
850 return AnchorBeginning | |
851 case ntStart: | |
852 return AnchorStart | |
853 case ntEndZ: | |
854 return AnchorEndZ | |
855 case ntEnd: | |
856 return AnchorEnd | |
857 default: | |
858 return 0 | |
859 } | |
860 } | |
861 | |
862 // anchorDescription returns a human-readable description of the anchors | |
863 func (anchors AnchorLoc) String() string { | |
864 buf := &bytes.Buffer{} | |
865 | |
866 if 0 != (anchors & AnchorBeginning) { | |
867 buf.WriteString(", Beginning") | |
868 } | |
869 if 0 != (anchors & AnchorStart) { | |
870 buf.WriteString(", Start") | |
871 } | |
872 if 0 != (anchors & AnchorBol) { | |
873 buf.WriteString(", Bol") | |
874 } | |
875 if 0 != (anchors & AnchorBoundary) { | |
876 buf.WriteString(", Boundary") | |
877 } | |
878 if 0 != (anchors & AnchorECMABoundary) { | |
879 buf.WriteString(", ECMABoundary") | |
880 } | |
881 if 0 != (anchors & AnchorEol) { | |
882 buf.WriteString(", Eol") | |
883 } | |
884 if 0 != (anchors & AnchorEnd) { | |
885 buf.WriteString(", End") | |
886 } | |
887 if 0 != (anchors & AnchorEndZ) { | |
888 buf.WriteString(", EndZ") | |
889 } | |
890 | |
891 // trim off comma | |
892 if buf.Len() >= 2 { | |
893 return buf.String()[2:] | |
894 } | |
895 return "None" | |
896 } |