66
|
1 package syntax
|
|
2
|
|
3 import (
|
|
4 "bytes"
|
|
5 "fmt"
|
|
6 "strconv"
|
|
7 "unicode"
|
|
8 "unicode/utf8"
|
|
9 )
|
|
10
|
|
11 type Prefix struct {
|
|
12 PrefixStr []rune
|
|
13 PrefixSet CharSet
|
|
14 CaseInsensitive bool
|
|
15 }
|
|
16
|
|
17 // It takes a RegexTree and computes the set of chars that can start it.
|
|
18 func getFirstCharsPrefix(tree *RegexTree) *Prefix {
|
|
19 s := regexFcd{
|
|
20 fcStack: make([]regexFc, 32),
|
|
21 intStack: make([]int, 32),
|
|
22 }
|
|
23 fc := s.regexFCFromRegexTree(tree)
|
|
24
|
|
25 if fc == nil || fc.nullable || fc.cc.IsEmpty() {
|
|
26 return nil
|
|
27 }
|
|
28 fcSet := fc.getFirstChars()
|
|
29 return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
|
|
30 }
|
|
31
|
|
32 type regexFcd struct {
|
|
33 intStack []int
|
|
34 intDepth int
|
|
35 fcStack []regexFc
|
|
36 fcDepth int
|
|
37 skipAllChildren bool // don't process any more children at the current level
|
|
38 skipchild bool // don't process the current child.
|
|
39 failed bool
|
|
40 }
|
|
41
|
|
42 /*
|
|
43 * The main FC computation. It does a shortcutted depth-first walk
|
|
44 * through the tree and calls CalculateFC to emits code before
|
|
45 * and after each child of an interior node, and at each leaf.
|
|
46 */
|
|
47 func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
|
|
48 curNode := tree.root
|
|
49 curChild := 0
|
|
50
|
|
51 for {
|
|
52 if len(curNode.children) == 0 {
|
|
53 // This is a leaf node
|
|
54 s.calculateFC(curNode.t, curNode, 0)
|
|
55 } else if curChild < len(curNode.children) && !s.skipAllChildren {
|
|
56 // This is an interior node, and we have more children to analyze
|
|
57 s.calculateFC(curNode.t|beforeChild, curNode, curChild)
|
|
58
|
|
59 if !s.skipchild {
|
|
60 curNode = curNode.children[curChild]
|
|
61 // this stack is how we get a depth first walk of the tree.
|
|
62 s.pushInt(curChild)
|
|
63 curChild = 0
|
|
64 } else {
|
|
65 curChild++
|
|
66 s.skipchild = false
|
|
67 }
|
|
68 continue
|
|
69 }
|
|
70
|
|
71 // This is an interior node where we've finished analyzing all the children, or
|
|
72 // the end of a leaf node.
|
|
73 s.skipAllChildren = false
|
|
74
|
|
75 if s.intIsEmpty() {
|
|
76 break
|
|
77 }
|
|
78
|
|
79 curChild = s.popInt()
|
|
80 curNode = curNode.next
|
|
81
|
|
82 s.calculateFC(curNode.t|afterChild, curNode, curChild)
|
|
83 if s.failed {
|
|
84 return nil
|
|
85 }
|
|
86
|
|
87 curChild++
|
|
88 }
|
|
89
|
|
90 if s.fcIsEmpty() {
|
|
91 return nil
|
|
92 }
|
|
93
|
|
94 return s.popFC()
|
|
95 }
|
|
96
|
|
97 // To avoid recursion, we use a simple integer stack.
|
|
98 // This is the push.
|
|
99 func (s *regexFcd) pushInt(I int) {
|
|
100 if s.intDepth >= len(s.intStack) {
|
|
101 expanded := make([]int, s.intDepth*2)
|
|
102 copy(expanded, s.intStack)
|
|
103 s.intStack = expanded
|
|
104 }
|
|
105
|
|
106 s.intStack[s.intDepth] = I
|
|
107 s.intDepth++
|
|
108 }
|
|
109
|
|
110 // True if the stack is empty.
|
|
111 func (s *regexFcd) intIsEmpty() bool {
|
|
112 return s.intDepth == 0
|
|
113 }
|
|
114
|
|
115 // This is the pop.
|
|
116 func (s *regexFcd) popInt() int {
|
|
117 s.intDepth--
|
|
118 return s.intStack[s.intDepth]
|
|
119 }
|
|
120
|
|
121 // We also use a stack of RegexFC objects.
|
|
122 // This is the push.
|
|
123 func (s *regexFcd) pushFC(fc regexFc) {
|
|
124 if s.fcDepth >= len(s.fcStack) {
|
|
125 expanded := make([]regexFc, s.fcDepth*2)
|
|
126 copy(expanded, s.fcStack)
|
|
127 s.fcStack = expanded
|
|
128 }
|
|
129
|
|
130 s.fcStack[s.fcDepth] = fc
|
|
131 s.fcDepth++
|
|
132 }
|
|
133
|
|
134 // True if the stack is empty.
|
|
135 func (s *regexFcd) fcIsEmpty() bool {
|
|
136 return s.fcDepth == 0
|
|
137 }
|
|
138
|
|
139 // This is the pop.
|
|
140 func (s *regexFcd) popFC() *regexFc {
|
|
141 s.fcDepth--
|
|
142 return &s.fcStack[s.fcDepth]
|
|
143 }
|
|
144
|
|
145 // This is the top.
|
|
146 func (s *regexFcd) topFC() *regexFc {
|
|
147 return &s.fcStack[s.fcDepth-1]
|
|
148 }
|
|
149
|
|
150 // Called in Beforechild to prevent further processing of the current child
|
|
151 func (s *regexFcd) skipChild() {
|
|
152 s.skipchild = true
|
|
153 }
|
|
154
|
|
155 // FC computation and shortcut cases for each node type
|
|
156 func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
|
|
157 //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
|
|
158 ci := false
|
|
159 rtl := false
|
|
160
|
|
161 if nt <= ntRef {
|
|
162 if (node.options & IgnoreCase) != 0 {
|
|
163 ci = true
|
|
164 }
|
|
165 if (node.options & RightToLeft) != 0 {
|
|
166 rtl = true
|
|
167 }
|
|
168 }
|
|
169
|
|
170 switch nt {
|
|
171 case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
|
|
172 break
|
|
173
|
|
174 case ntTestgroup | beforeChild:
|
|
175 if CurIndex == 0 {
|
|
176 s.skipChild()
|
|
177 }
|
|
178 break
|
|
179
|
|
180 case ntEmpty:
|
|
181 s.pushFC(regexFc{nullable: true})
|
|
182 break
|
|
183
|
|
184 case ntConcatenate | afterChild:
|
|
185 if CurIndex != 0 {
|
|
186 child := s.popFC()
|
|
187 cumul := s.topFC()
|
|
188
|
|
189 s.failed = !cumul.addFC(*child, true)
|
|
190 }
|
|
191
|
|
192 fc := s.topFC()
|
|
193 if !fc.nullable {
|
|
194 s.skipAllChildren = true
|
|
195 }
|
|
196 break
|
|
197
|
|
198 case ntTestgroup | afterChild:
|
|
199 if CurIndex > 1 {
|
|
200 child := s.popFC()
|
|
201 cumul := s.topFC()
|
|
202
|
|
203 s.failed = !cumul.addFC(*child, false)
|
|
204 }
|
|
205 break
|
|
206
|
|
207 case ntAlternate | afterChild, ntTestref | afterChild:
|
|
208 if CurIndex != 0 {
|
|
209 child := s.popFC()
|
|
210 cumul := s.topFC()
|
|
211
|
|
212 s.failed = !cumul.addFC(*child, false)
|
|
213 }
|
|
214 break
|
|
215
|
|
216 case ntLoop | afterChild, ntLazyloop | afterChild:
|
|
217 if node.m == 0 {
|
|
218 fc := s.topFC()
|
|
219 fc.nullable = true
|
|
220 }
|
|
221 break
|
|
222
|
|
223 case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
|
|
224 break
|
|
225
|
|
226 case ntRequire | beforeChild, ntPrevent | beforeChild:
|
|
227 s.skipChild()
|
|
228 s.pushFC(regexFc{nullable: true})
|
|
229 break
|
|
230
|
|
231 case ntRequire | afterChild, ntPrevent | afterChild:
|
|
232 break
|
|
233
|
|
234 case ntOne, ntNotone:
|
|
235 s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
|
|
236 break
|
|
237
|
|
238 case ntOneloop, ntOnelazy:
|
|
239 s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
|
|
240 break
|
|
241
|
|
242 case ntNotoneloop, ntNotonelazy:
|
|
243 s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
|
|
244 break
|
|
245
|
|
246 case ntMulti:
|
|
247 if len(node.str) == 0 {
|
|
248 s.pushFC(regexFc{nullable: true})
|
|
249 } else if !rtl {
|
|
250 s.pushFC(newRegexFc(node.str[0], false, false, ci))
|
|
251 } else {
|
|
252 s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
|
|
253 }
|
|
254 break
|
|
255
|
|
256 case ntSet:
|
|
257 s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
|
|
258 break
|
|
259
|
|
260 case ntSetloop, ntSetlazy:
|
|
261 s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
|
|
262 break
|
|
263
|
|
264 case ntRef:
|
|
265 s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
|
|
266 break
|
|
267
|
|
268 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
|
269 s.pushFC(regexFc{nullable: true})
|
|
270 break
|
|
271
|
|
272 default:
|
|
273 panic(fmt.Sprintf("unexpected op code: %v", nt))
|
|
274 }
|
|
275 }
|
|
276
|
|
277 type regexFc struct {
|
|
278 cc CharSet
|
|
279 nullable bool
|
|
280 caseInsensitive bool
|
|
281 }
|
|
282
|
|
283 func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
|
|
284 r := regexFc{
|
|
285 caseInsensitive: caseInsensitive,
|
|
286 nullable: nullable,
|
|
287 }
|
|
288 if not {
|
|
289 if ch > 0 {
|
|
290 r.cc.addRange('\x00', ch-1)
|
|
291 }
|
|
292 if ch < 0xFFFF {
|
|
293 r.cc.addRange(ch+1, utf8.MaxRune)
|
|
294 }
|
|
295 } else {
|
|
296 r.cc.addRange(ch, ch)
|
|
297 }
|
|
298 return r
|
|
299 }
|
|
300
|
|
301 func (r *regexFc) getFirstChars() CharSet {
|
|
302 if r.caseInsensitive {
|
|
303 r.cc.addLowercase()
|
|
304 }
|
|
305
|
|
306 return r.cc
|
|
307 }
|
|
308
|
|
309 func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
|
|
310 if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
|
|
311 return false
|
|
312 }
|
|
313
|
|
314 if concatenate {
|
|
315 if !r.nullable {
|
|
316 return true
|
|
317 }
|
|
318
|
|
319 if !fc.nullable {
|
|
320 r.nullable = false
|
|
321 }
|
|
322 } else {
|
|
323 if fc.nullable {
|
|
324 r.nullable = true
|
|
325 }
|
|
326 }
|
|
327
|
|
328 r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
|
|
329 r.cc.addSet(fc.cc)
|
|
330
|
|
331 return true
|
|
332 }
|
|
333
|
|
334 // This is a related computation: it takes a RegexTree and computes the
|
|
335 // leading substring if it sees one. It's quite trivial and gives up easily.
|
|
336 func getPrefix(tree *RegexTree) *Prefix {
|
|
337 var concatNode *regexNode
|
|
338 nextChild := 0
|
|
339
|
|
340 curNode := tree.root
|
|
341
|
|
342 for {
|
|
343 switch curNode.t {
|
|
344 case ntConcatenate:
|
|
345 if len(curNode.children) > 0 {
|
|
346 concatNode = curNode
|
|
347 nextChild = 0
|
|
348 }
|
|
349
|
|
350 case ntGreedy, ntCapture:
|
|
351 curNode = curNode.children[0]
|
|
352 concatNode = nil
|
|
353 continue
|
|
354
|
|
355 case ntOneloop, ntOnelazy:
|
|
356 if curNode.m > 0 {
|
|
357 return &Prefix{
|
|
358 PrefixStr: repeat(curNode.ch, curNode.m),
|
|
359 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
|
360 }
|
|
361 }
|
|
362 return nil
|
|
363
|
|
364 case ntOne:
|
|
365 return &Prefix{
|
|
366 PrefixStr: []rune{curNode.ch},
|
|
367 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
|
368 }
|
|
369
|
|
370 case ntMulti:
|
|
371 return &Prefix{
|
|
372 PrefixStr: curNode.str,
|
|
373 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
|
|
374 }
|
|
375
|
|
376 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
|
|
377 ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
|
|
378
|
|
379 default:
|
|
380 return nil
|
|
381 }
|
|
382
|
|
383 if concatNode == nil || nextChild >= len(concatNode.children) {
|
|
384 return nil
|
|
385 }
|
|
386
|
|
387 curNode = concatNode.children[nextChild]
|
|
388 nextChild++
|
|
389 }
|
|
390 }
|
|
391
|
|
392 // repeat the rune r, c times... up to the max of MaxPrefixSize
|
|
393 func repeat(r rune, c int) []rune {
|
|
394 if c > MaxPrefixSize {
|
|
395 c = MaxPrefixSize
|
|
396 }
|
|
397
|
|
398 ret := make([]rune, c)
|
|
399
|
|
400 // binary growth using copy for speed
|
|
401 ret[0] = r
|
|
402 bp := 1
|
|
403 for bp < len(ret) {
|
|
404 copy(ret[bp:], ret[:bp])
|
|
405 bp *= 2
|
|
406 }
|
|
407
|
|
408 return ret
|
|
409 }
|
|
410
|
|
411 // BmPrefix precomputes the Boyer-Moore
|
|
412 // tables for fast string scanning. These tables allow
|
|
413 // you to scan for the first occurrence of a string within
|
|
414 // a large body of text without examining every character.
|
|
415 // The performance of the heuristic depends on the actual
|
|
416 // string and the text being searched, but usually, the longer
|
|
417 // the string that is being searched for, the fewer characters
|
|
418 // need to be examined.
|
|
419 type BmPrefix struct {
|
|
420 positive []int
|
|
421 negativeASCII []int
|
|
422 negativeUnicode [][]int
|
|
423 pattern []rune
|
|
424 lowASCII rune
|
|
425 highASCII rune
|
|
426 rightToLeft bool
|
|
427 caseInsensitive bool
|
|
428 }
|
|
429
|
|
430 func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
|
|
431
|
|
432 b := &BmPrefix{
|
|
433 rightToLeft: rightToLeft,
|
|
434 caseInsensitive: caseInsensitive,
|
|
435 pattern: pattern,
|
|
436 }
|
|
437
|
|
438 if caseInsensitive {
|
|
439 for i := 0; i < len(b.pattern); i++ {
|
|
440 // We do the ToLower character by character for consistency. With surrogate chars, doing
|
|
441 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
|
|
442 // linguistically, but since Regex doesn't support surrogates, it's more important to be
|
|
443 // consistent.
|
|
444
|
|
445 b.pattern[i] = unicode.ToLower(b.pattern[i])
|
|
446 }
|
|
447 }
|
|
448
|
|
449 var beforefirst, last, bump int
|
|
450 var scan, match int
|
|
451
|
|
452 if !rightToLeft {
|
|
453 beforefirst = -1
|
|
454 last = len(b.pattern) - 1
|
|
455 bump = 1
|
|
456 } else {
|
|
457 beforefirst = len(b.pattern)
|
|
458 last = 0
|
|
459 bump = -1
|
|
460 }
|
|
461
|
|
462 // PART I - the good-suffix shift table
|
|
463 //
|
|
464 // compute the positive requirement:
|
|
465 // if char "i" is the first one from the right that doesn't match,
|
|
466 // then we know the matcher can advance by _positive[i].
|
|
467 //
|
|
468 // This algorithm is a simplified variant of the standard
|
|
469 // Boyer-Moore good suffix calculation.
|
|
470
|
|
471 b.positive = make([]int, len(b.pattern))
|
|
472
|
|
473 examine := last
|
|
474 ch := b.pattern[examine]
|
|
475 b.positive[examine] = bump
|
|
476 examine -= bump
|
|
477
|
|
478 Outerloop:
|
|
479 for {
|
|
480 // find an internal char (examine) that matches the tail
|
|
481
|
|
482 for {
|
|
483 if examine == beforefirst {
|
|
484 break Outerloop
|
|
485 }
|
|
486 if b.pattern[examine] == ch {
|
|
487 break
|
|
488 }
|
|
489 examine -= bump
|
|
490 }
|
|
491
|
|
492 match = last
|
|
493 scan = examine
|
|
494
|
|
495 // find the length of the match
|
|
496 for {
|
|
497 if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
|
|
498 // at the end of the match, note the difference in _positive
|
|
499 // this is not the length of the match, but the distance from the internal match
|
|
500 // to the tail suffix.
|
|
501 if b.positive[match] == 0 {
|
|
502 b.positive[match] = match - scan
|
|
503 }
|
|
504
|
|
505 // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
|
|
506
|
|
507 break
|
|
508 }
|
|
509
|
|
510 scan -= bump
|
|
511 match -= bump
|
|
512 }
|
|
513
|
|
514 examine -= bump
|
|
515 }
|
|
516
|
|
517 match = last - bump
|
|
518
|
|
519 // scan for the chars for which there are no shifts that yield a different candidate
|
|
520
|
|
521 // The inside of the if statement used to say
|
|
522 // "_positive[match] = last - beforefirst;"
|
|
523 // This is slightly less aggressive in how much we skip, but at worst it
|
|
524 // should mean a little more work rather than skipping a potential match.
|
|
525 for match != beforefirst {
|
|
526 if b.positive[match] == 0 {
|
|
527 b.positive[match] = bump
|
|
528 }
|
|
529
|
|
530 match -= bump
|
|
531 }
|
|
532
|
|
533 // PART II - the bad-character shift table
|
|
534 //
|
|
535 // compute the negative requirement:
|
|
536 // if char "ch" is the reject character when testing position "i",
|
|
537 // we can slide up by _negative[ch];
|
|
538 // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
|
|
539 //
|
|
540 // the lookup table is divided into ASCII and Unicode portions;
|
|
541 // only those parts of the Unicode 16-bit code set that actually
|
|
542 // appear in the string are in the table. (Maximum size with
|
|
543 // Unicode is 65K; ASCII only case is 512 bytes.)
|
|
544
|
|
545 b.negativeASCII = make([]int, 128)
|
|
546
|
|
547 for i := 0; i < len(b.negativeASCII); i++ {
|
|
548 b.negativeASCII[i] = last - beforefirst
|
|
549 }
|
|
550
|
|
551 b.lowASCII = 127
|
|
552 b.highASCII = 0
|
|
553
|
|
554 for examine = last; examine != beforefirst; examine -= bump {
|
|
555 ch = b.pattern[examine]
|
|
556
|
|
557 switch {
|
|
558 case ch < 128:
|
|
559 if b.lowASCII > ch {
|
|
560 b.lowASCII = ch
|
|
561 }
|
|
562
|
|
563 if b.highASCII < ch {
|
|
564 b.highASCII = ch
|
|
565 }
|
|
566
|
|
567 if b.negativeASCII[ch] == last-beforefirst {
|
|
568 b.negativeASCII[ch] = last - examine
|
|
569 }
|
|
570 case ch <= 0xffff:
|
|
571 i, j := ch>>8, ch&0xFF
|
|
572
|
|
573 if b.negativeUnicode == nil {
|
|
574 b.negativeUnicode = make([][]int, 256)
|
|
575 }
|
|
576
|
|
577 if b.negativeUnicode[i] == nil {
|
|
578 newarray := make([]int, 256)
|
|
579
|
|
580 for k := 0; k < len(newarray); k++ {
|
|
581 newarray[k] = last - beforefirst
|
|
582 }
|
|
583
|
|
584 if i == 0 {
|
|
585 copy(newarray, b.negativeASCII)
|
|
586 //TODO: this line needed?
|
|
587 b.negativeASCII = newarray
|
|
588 }
|
|
589
|
|
590 b.negativeUnicode[i] = newarray
|
|
591 }
|
|
592
|
|
593 if b.negativeUnicode[i][j] == last-beforefirst {
|
|
594 b.negativeUnicode[i][j] = last - examine
|
|
595 }
|
|
596 default:
|
|
597 // we can't do the filter because this algo doesn't support
|
|
598 // unicode chars >0xffff
|
|
599 return nil
|
|
600 }
|
|
601 }
|
|
602
|
|
603 return b
|
|
604 }
|
|
605
|
|
606 func (b *BmPrefix) String() string {
|
|
607 return string(b.pattern)
|
|
608 }
|
|
609
|
|
610 // Dump returns the contents of the filter as a human readable string
|
|
611 func (b *BmPrefix) Dump(indent string) string {
|
|
612 buf := &bytes.Buffer{}
|
|
613
|
|
614 fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
|
|
615 for i := 0; i < len(b.positive); i++ {
|
|
616 buf.WriteString(strconv.Itoa(b.positive[i]))
|
|
617 buf.WriteRune(' ')
|
|
618 }
|
|
619 buf.WriteRune('\n')
|
|
620
|
|
621 if b.negativeASCII != nil {
|
|
622 buf.WriteString(indent)
|
|
623 buf.WriteString("Negative table\n")
|
|
624 for i := 0; i < len(b.negativeASCII); i++ {
|
|
625 if b.negativeASCII[i] != len(b.pattern) {
|
|
626 fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
|
|
627 }
|
|
628 }
|
|
629 }
|
|
630
|
|
631 return buf.String()
|
|
632 }
|
|
633
|
|
634 // Scan uses the Boyer-Moore algorithm to find the first occurrence
|
|
635 // of the specified string within text, beginning at index, and
|
|
636 // constrained within beglimit and endlimit.
|
|
637 //
|
|
638 // The direction and case-sensitivity of the match is determined
|
|
639 // by the arguments to the RegexBoyerMoore constructor.
|
|
640 func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
|
|
641 var (
|
|
642 defadv, test, test2 int
|
|
643 match, startmatch, endmatch int
|
|
644 bump, advance int
|
|
645 chTest rune
|
|
646 unicodeLookup []int
|
|
647 )
|
|
648
|
|
649 if !b.rightToLeft {
|
|
650 defadv = len(b.pattern)
|
|
651 startmatch = len(b.pattern) - 1
|
|
652 endmatch = 0
|
|
653 test = index + defadv - 1
|
|
654 bump = 1
|
|
655 } else {
|
|
656 defadv = -len(b.pattern)
|
|
657 startmatch = 0
|
|
658 endmatch = -defadv - 1
|
|
659 test = index + defadv
|
|
660 bump = -1
|
|
661 }
|
|
662
|
|
663 chMatch := b.pattern[startmatch]
|
|
664
|
|
665 for {
|
|
666 if test >= endlimit || test < beglimit {
|
|
667 return -1
|
|
668 }
|
|
669
|
|
670 chTest = text[test]
|
|
671
|
|
672 if b.caseInsensitive {
|
|
673 chTest = unicode.ToLower(chTest)
|
|
674 }
|
|
675
|
|
676 if chTest != chMatch {
|
|
677 if chTest < 128 {
|
|
678 advance = b.negativeASCII[chTest]
|
|
679 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
|
680 unicodeLookup = b.negativeUnicode[chTest>>8]
|
|
681 if len(unicodeLookup) > 0 {
|
|
682 advance = unicodeLookup[chTest&0xFF]
|
|
683 } else {
|
|
684 advance = defadv
|
|
685 }
|
|
686 } else {
|
|
687 advance = defadv
|
|
688 }
|
|
689
|
|
690 test += advance
|
|
691 } else { // if (chTest == chMatch)
|
|
692 test2 = test
|
|
693 match = startmatch
|
|
694
|
|
695 for {
|
|
696 if match == endmatch {
|
|
697 if b.rightToLeft {
|
|
698 return test2 + 1
|
|
699 } else {
|
|
700 return test2
|
|
701 }
|
|
702 }
|
|
703
|
|
704 match -= bump
|
|
705 test2 -= bump
|
|
706
|
|
707 chTest = text[test2]
|
|
708
|
|
709 if b.caseInsensitive {
|
|
710 chTest = unicode.ToLower(chTest)
|
|
711 }
|
|
712
|
|
713 if chTest != b.pattern[match] {
|
|
714 advance = b.positive[match]
|
|
715 if (chTest & 0xFF80) == 0 {
|
|
716 test2 = (match - startmatch) + b.negativeASCII[chTest]
|
|
717 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
|
|
718 unicodeLookup = b.negativeUnicode[chTest>>8]
|
|
719 if len(unicodeLookup) > 0 {
|
|
720 test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
|
|
721 } else {
|
|
722 test += advance
|
|
723 break
|
|
724 }
|
|
725 } else {
|
|
726 test += advance
|
|
727 break
|
|
728 }
|
|
729
|
|
730 if b.rightToLeft {
|
|
731 if test2 < advance {
|
|
732 advance = test2
|
|
733 }
|
|
734 } else if test2 > advance {
|
|
735 advance = test2
|
|
736 }
|
|
737
|
|
738 test += advance
|
|
739 break
|
|
740 }
|
|
741 }
|
|
742 }
|
|
743 }
|
|
744 }
|
|
745
|
|
746 // When a regex is anchored, we can do a quick IsMatch test instead of a Scan
|
|
747 func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
|
|
748 if !b.rightToLeft {
|
|
749 if index < beglimit || endlimit-index < len(b.pattern) {
|
|
750 return false
|
|
751 }
|
|
752
|
|
753 return b.matchPattern(text, index)
|
|
754 } else {
|
|
755 if index > endlimit || index-beglimit < len(b.pattern) {
|
|
756 return false
|
|
757 }
|
|
758
|
|
759 return b.matchPattern(text, index-len(b.pattern))
|
|
760 }
|
|
761 }
|
|
762
|
|
763 func (b *BmPrefix) matchPattern(text []rune, index int) bool {
|
|
764 if len(text)-index < len(b.pattern) {
|
|
765 return false
|
|
766 }
|
|
767
|
|
768 if b.caseInsensitive {
|
|
769 for i := 0; i < len(b.pattern); i++ {
|
|
770 //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
|
|
771 if unicode.ToLower(text[index+i]) != b.pattern[i] {
|
|
772 return false
|
|
773 }
|
|
774 }
|
|
775 return true
|
|
776 } else {
|
|
777 for i := 0; i < len(b.pattern); i++ {
|
|
778 if text[index+i] != b.pattern[i] {
|
|
779 return false
|
|
780 }
|
|
781 }
|
|
782 return true
|
|
783 }
|
|
784 }
|
|
785
|
|
786 type AnchorLoc int16
|
|
787
|
|
788 // where the regex can be pegged
|
|
789 const (
|
|
790 AnchorBeginning AnchorLoc = 0x0001
|
|
791 AnchorBol = 0x0002
|
|
792 AnchorStart = 0x0004
|
|
793 AnchorEol = 0x0008
|
|
794 AnchorEndZ = 0x0010
|
|
795 AnchorEnd = 0x0020
|
|
796 AnchorBoundary = 0x0040
|
|
797 AnchorECMABoundary = 0x0080
|
|
798 )
|
|
799
|
|
800 func getAnchors(tree *RegexTree) AnchorLoc {
|
|
801
|
|
802 var concatNode *regexNode
|
|
803 nextChild, result := 0, AnchorLoc(0)
|
|
804
|
|
805 curNode := tree.root
|
|
806
|
|
807 for {
|
|
808 switch curNode.t {
|
|
809 case ntConcatenate:
|
|
810 if len(curNode.children) > 0 {
|
|
811 concatNode = curNode
|
|
812 nextChild = 0
|
|
813 }
|
|
814
|
|
815 case ntGreedy, ntCapture:
|
|
816 curNode = curNode.children[0]
|
|
817 concatNode = nil
|
|
818 continue
|
|
819
|
|
820 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
|
|
821 ntStart, ntEndZ, ntEnd:
|
|
822 return result | anchorFromType(curNode.t)
|
|
823
|
|
824 case ntEmpty, ntRequire, ntPrevent:
|
|
825
|
|
826 default:
|
|
827 return result
|
|
828 }
|
|
829
|
|
830 if concatNode == nil || nextChild >= len(concatNode.children) {
|
|
831 return result
|
|
832 }
|
|
833
|
|
834 curNode = concatNode.children[nextChild]
|
|
835 nextChild++
|
|
836 }
|
|
837 }
|
|
838
|
|
839 func anchorFromType(t nodeType) AnchorLoc {
|
|
840 switch t {
|
|
841 case ntBol:
|
|
842 return AnchorBol
|
|
843 case ntEol:
|
|
844 return AnchorEol
|
|
845 case ntBoundary:
|
|
846 return AnchorBoundary
|
|
847 case ntECMABoundary:
|
|
848 return AnchorECMABoundary
|
|
849 case ntBeginning:
|
|
850 return AnchorBeginning
|
|
851 case ntStart:
|
|
852 return AnchorStart
|
|
853 case ntEndZ:
|
|
854 return AnchorEndZ
|
|
855 case ntEnd:
|
|
856 return AnchorEnd
|
|
857 default:
|
|
858 return 0
|
|
859 }
|
|
860 }
|
|
861
|
|
862 // anchorDescription returns a human-readable description of the anchors
|
|
863 func (anchors AnchorLoc) String() string {
|
|
864 buf := &bytes.Buffer{}
|
|
865
|
|
866 if 0 != (anchors & AnchorBeginning) {
|
|
867 buf.WriteString(", Beginning")
|
|
868 }
|
|
869 if 0 != (anchors & AnchorStart) {
|
|
870 buf.WriteString(", Start")
|
|
871 }
|
|
872 if 0 != (anchors & AnchorBol) {
|
|
873 buf.WriteString(", Bol")
|
|
874 }
|
|
875 if 0 != (anchors & AnchorBoundary) {
|
|
876 buf.WriteString(", Boundary")
|
|
877 }
|
|
878 if 0 != (anchors & AnchorECMABoundary) {
|
|
879 buf.WriteString(", ECMABoundary")
|
|
880 }
|
|
881 if 0 != (anchors & AnchorEol) {
|
|
882 buf.WriteString(", Eol")
|
|
883 }
|
|
884 if 0 != (anchors & AnchorEnd) {
|
|
885 buf.WriteString(", End")
|
|
886 }
|
|
887 if 0 != (anchors & AnchorEndZ) {
|
|
888 buf.WriteString(", EndZ")
|
|
889 }
|
|
890
|
|
891 // trim off comma
|
|
892 if buf.Len() >= 2 {
|
|
893 return buf.String()[2:]
|
|
894 }
|
|
895 return "None"
|
|
896 }
|