Mercurial > yakumo_izuru > aya
comparison vendor/github.com/dlclark/regexp2/syntax/parser.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author | yakumo.izuru |
---|---|
date | Sun, 23 Jul 2023 13:18:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
65:6d985efa0f7a | 66:787b5ee0289d |
---|---|
1 package syntax | |
2 | |
3 import ( | |
4 "fmt" | |
5 "math" | |
6 "os" | |
7 "sort" | |
8 "strconv" | |
9 "unicode" | |
10 ) | |
11 | |
// RegexOptions is a bit set of flags that control how a pattern is parsed.
type RegexOptions int32

// Option flags; the trailing comment on each is its single-letter code.
//
// Only IgnoreCase is declared with the RegexOptions type. The remaining
// flags have no explicit type and are therefore untyped integer constants
// that convert implicitly wherever a RegexOptions is expected.
const (
	IgnoreCase              RegexOptions = 1 << 0 // "i"
	Multiline                            = 1 << 1 // "m"
	ExplicitCapture                      = 1 << 2 // "n"
	Compiled                             = 1 << 3 // "c"
	Singleline                           = 1 << 4 // "s"
	IgnorePatternWhitespace              = 1 << 5 // "x"
	RightToLeft                          = 1 << 6 // "r"
	Debug                                = 1 << 7 // "d"
	ECMAScript                           = 1 << 8 // "e"
	RE2                                  = 1 << 9 // RE2 compat mode
)
26 | |
27 func optionFromCode(ch rune) RegexOptions { | |
28 // case-insensitive | |
29 switch ch { | |
30 case 'i', 'I': | |
31 return IgnoreCase | |
32 case 'r', 'R': | |
33 return RightToLeft | |
34 case 'm', 'M': | |
35 return Multiline | |
36 case 'n', 'N': | |
37 return ExplicitCapture | |
38 case 's', 'S': | |
39 return Singleline | |
40 case 'x', 'X': | |
41 return IgnorePatternWhitespace | |
42 case 'd', 'D': | |
43 return Debug | |
44 case 'e', 'E': | |
45 return ECMAScript | |
46 default: | |
47 return 0 | |
48 } | |
49 } | |
50 | |
51 // An Error describes a failure to parse a regular expression | |
52 // and gives the offending expression. | |
53 type Error struct { | |
54 Code ErrorCode | |
55 Expr string | |
56 Args []interface{} | |
57 } | |
58 | |
59 func (e *Error) Error() string { | |
60 if len(e.Args) == 0 { | |
61 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`" | |
62 } | |
63 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`" | |
64 } | |
65 | |
// ErrorCode is the message (or fmt format string) describing why a regular
// expression failed to parse.
type ErrorCode string

// String returns the code's message text.
func (e ErrorCode) String() string {
	return string(e)
}

// Parse failure messages. Entries containing %v are format strings that are
// filled in from Error.Args.
//
// Only ErrInternalError is declared with the ErrorCode type; the rest are
// untyped string constants that convert implicitly where an ErrorCode is
// expected.
const (
	// internal issue
	ErrInternalError ErrorCode = "regexp/syntax: internal error"

	// parser errors
	ErrUnterminatedComment        = "unterminated comment"
	ErrInvalidCharRange           = "invalid character class range"
	ErrInvalidRepeatSize          = "invalid repeat count"
	ErrInvalidUTF8                = "invalid UTF-8"
	ErrCaptureGroupOutOfRange     = "capture group number out of range"
	ErrUnexpectedParen            = "unexpected )"
	ErrMissingParen               = "missing closing )"
	ErrMissingBrace               = "missing closing }"
	ErrInvalidRepeatOp            = "invalid nested repetition operator"
	ErrMissingRepeatArgument      = "missing argument to repetition operator"
	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
	ErrTooManyAlternates          = "too many | in (?()|)"
	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
	ErrCapNumNotZero              = "capture number cannot be zero"
	ErrUndefinedBackRef           = "reference to undefined group number %v"
	ErrUndefinedNameRef           = "reference to undefined group name %v"
	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
	ErrMalformedReference         = "(?(%v) ) malformed"
	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
	ErrMalformedSlashP            = "malformed \\p{X} character escape"
	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
	ErrMissingControl             = "missing control character"
	ErrUnrecognizedControl        = "unrecognized control character"
	ErrTooFewHex                  = "insufficient hexadecimal digits"
	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
	ErrUnterminatedBracket        = "unterminated [] set"
	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
	ErrReversedCharRange          = "[x-y] range in reverse order"
)
113 | |
114 type parser struct { | |
115 stack *regexNode | |
116 group *regexNode | |
117 alternation *regexNode | |
118 concatenation *regexNode | |
119 unit *regexNode | |
120 | |
121 patternRaw string | |
122 pattern []rune | |
123 | |
124 currentPos int | |
125 specialCase *unicode.SpecialCase | |
126 | |
127 autocap int | |
128 capcount int | |
129 captop int | |
130 capsize int | |
131 | |
132 caps map[int]int | |
133 capnames map[string]int | |
134 | |
135 capnumlist []int | |
136 capnamelist []string | |
137 | |
138 options RegexOptions | |
139 optionsStack []RegexOptions | |
140 ignoreNextParen bool | |
141 } | |
142 | |
// Overflow guards for accumulating a decimal number one digit at a time:
// a value may take another digit d only if it is below maxValueDiv10, or
// equal to it with d no larger than maxValueMod10 (see scanDollar).
const (
	maxValueDiv10 int = math.MaxInt32 / 10
	maxValueMod10     = math.MaxInt32 % 10
)
147 | |
148 // Parse converts a regex string into a parse tree | |
149 func Parse(re string, op RegexOptions) (*RegexTree, error) { | |
150 p := parser{ | |
151 options: op, | |
152 caps: make(map[int]int), | |
153 } | |
154 p.setPattern(re) | |
155 | |
156 if err := p.countCaptures(); err != nil { | |
157 return nil, err | |
158 } | |
159 | |
160 p.reset(op) | |
161 root, err := p.scanRegex() | |
162 | |
163 if err != nil { | |
164 return nil, err | |
165 } | |
166 tree := &RegexTree{ | |
167 root: root, | |
168 caps: p.caps, | |
169 capnumlist: p.capnumlist, | |
170 captop: p.captop, | |
171 Capnames: p.capnames, | |
172 Caplist: p.capnamelist, | |
173 options: op, | |
174 } | |
175 | |
176 if tree.options&Debug > 0 { | |
177 os.Stdout.WriteString(tree.Dump()) | |
178 } | |
179 | |
180 return tree, nil | |
181 } | |
182 | |
183 func (p *parser) setPattern(pattern string) { | |
184 p.patternRaw = pattern | |
185 p.pattern = make([]rune, 0, len(pattern)) | |
186 | |
187 //populate our rune array to handle utf8 encoding | |
188 for _, r := range pattern { | |
189 p.pattern = append(p.pattern, r) | |
190 } | |
191 } | |
192 func (p *parser) getErr(code ErrorCode, args ...interface{}) error { | |
193 return &Error{Code: code, Expr: p.patternRaw, Args: args} | |
194 } | |
195 | |
196 func (p *parser) noteCaptureSlot(i, pos int) { | |
197 if _, ok := p.caps[i]; !ok { | |
198 // the rhs of the hashtable isn't used in the parser | |
199 p.caps[i] = pos | |
200 p.capcount++ | |
201 | |
202 if p.captop <= i { | |
203 if i == math.MaxInt32 { | |
204 p.captop = i | |
205 } else { | |
206 p.captop = i + 1 | |
207 } | |
208 } | |
209 } | |
210 } | |
211 | |
212 func (p *parser) noteCaptureName(name string, pos int) { | |
213 if p.capnames == nil { | |
214 p.capnames = make(map[string]int) | |
215 } | |
216 | |
217 if _, ok := p.capnames[name]; !ok { | |
218 p.capnames[name] = pos | |
219 p.capnamelist = append(p.capnamelist, name) | |
220 } | |
221 } | |
222 | |
223 func (p *parser) assignNameSlots() { | |
224 if p.capnames != nil { | |
225 for _, name := range p.capnamelist { | |
226 for p.isCaptureSlot(p.autocap) { | |
227 p.autocap++ | |
228 } | |
229 pos := p.capnames[name] | |
230 p.capnames[name] = p.autocap | |
231 p.noteCaptureSlot(p.autocap, pos) | |
232 | |
233 p.autocap++ | |
234 } | |
235 } | |
236 | |
237 // if the caps array has at least one gap, construct the list of used slots | |
238 if p.capcount < p.captop { | |
239 p.capnumlist = make([]int, p.capcount) | |
240 i := 0 | |
241 | |
242 for k := range p.caps { | |
243 p.capnumlist[i] = k | |
244 i++ | |
245 } | |
246 | |
247 sort.Ints(p.capnumlist) | |
248 } | |
249 | |
250 // merge capsnumlist into capnamelist | |
251 if p.capnames != nil || p.capnumlist != nil { | |
252 var oldcapnamelist []string | |
253 var next int | |
254 var k int | |
255 | |
256 if p.capnames == nil { | |
257 oldcapnamelist = nil | |
258 p.capnames = make(map[string]int) | |
259 p.capnamelist = []string{} | |
260 next = -1 | |
261 } else { | |
262 oldcapnamelist = p.capnamelist | |
263 p.capnamelist = []string{} | |
264 next = p.capnames[oldcapnamelist[0]] | |
265 } | |
266 | |
267 for i := 0; i < p.capcount; i++ { | |
268 j := i | |
269 if p.capnumlist != nil { | |
270 j = p.capnumlist[i] | |
271 } | |
272 | |
273 if next == j { | |
274 p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) | |
275 k++ | |
276 | |
277 if k == len(oldcapnamelist) { | |
278 next = -1 | |
279 } else { | |
280 next = p.capnames[oldcapnamelist[k]] | |
281 } | |
282 | |
283 } else { | |
284 //feature: culture? | |
285 str := strconv.Itoa(j) | |
286 p.capnamelist = append(p.capnamelist, str) | |
287 p.capnames[str] = j | |
288 } | |
289 } | |
290 } | |
291 } | |
292 | |
293 func (p *parser) consumeAutocap() int { | |
294 r := p.autocap | |
295 p.autocap++ | |
296 return r | |
297 } | |
298 | |
299 // CountCaptures is a prescanner for deducing the slots used for | |
300 // captures by doing a partial tokenization of the pattern. | |
301 func (p *parser) countCaptures() error { | |
302 var ch rune | |
303 | |
304 p.noteCaptureSlot(0, 0) | |
305 | |
306 p.autocap = 1 | |
307 | |
308 for p.charsRight() > 0 { | |
309 pos := p.textpos() | |
310 ch = p.moveRightGetChar() | |
311 switch ch { | |
312 case '\\': | |
313 if p.charsRight() > 0 { | |
314 p.scanBackslash(true) | |
315 } | |
316 | |
317 case '#': | |
318 if p.useOptionX() { | |
319 p.moveLeft() | |
320 p.scanBlank() | |
321 } | |
322 | |
323 case '[': | |
324 p.scanCharSet(false, true) | |
325 | |
326 case ')': | |
327 if !p.emptyOptionsStack() { | |
328 p.popOptions() | |
329 } | |
330 | |
331 case '(': | |
332 if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' { | |
333 p.moveLeft() | |
334 p.scanBlank() | |
335 } else { | |
336 p.pushOptions() | |
337 if p.charsRight() > 0 && p.rightChar(0) == '?' { | |
338 // we have (?... | |
339 p.moveRight(1) | |
340 | |
341 if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') { | |
342 // named group: (?<... or (?'... | |
343 | |
344 p.moveRight(1) | |
345 ch = p.rightChar(0) | |
346 | |
347 if ch != '0' && IsWordChar(ch) { | |
348 if ch >= '1' && ch <= '9' { | |
349 dec, err := p.scanDecimal() | |
350 if err != nil { | |
351 return err | |
352 } | |
353 p.noteCaptureSlot(dec, pos) | |
354 } else { | |
355 p.noteCaptureName(p.scanCapname(), pos) | |
356 } | |
357 } | |
358 } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') { | |
359 // RE2-compat (?P<) | |
360 p.moveRight(2) | |
361 ch = p.rightChar(0) | |
362 if IsWordChar(ch) { | |
363 p.noteCaptureName(p.scanCapname(), pos) | |
364 } | |
365 | |
366 } else { | |
367 // (?... | |
368 | |
369 // get the options if it's an option construct (?cimsx-cimsx...) | |
370 p.scanOptions() | |
371 | |
372 if p.charsRight() > 0 { | |
373 if p.rightChar(0) == ')' { | |
374 // (?cimsx-cimsx) | |
375 p.moveRight(1) | |
376 p.popKeepOptions() | |
377 } else if p.rightChar(0) == '(' { | |
378 // alternation construct: (?(foo)yes|no) | |
379 // ignore the next paren so we don't capture the condition | |
380 p.ignoreNextParen = true | |
381 | |
382 // break from here so we don't reset ignoreNextParen | |
383 continue | |
384 } | |
385 } | |
386 } | |
387 } else { | |
388 if !p.useOptionN() && !p.ignoreNextParen { | |
389 p.noteCaptureSlot(p.consumeAutocap(), pos) | |
390 } | |
391 } | |
392 } | |
393 | |
394 p.ignoreNextParen = false | |
395 | |
396 } | |
397 } | |
398 | |
399 p.assignNameSlots() | |
400 return nil | |
401 } | |
402 | |
403 func (p *parser) reset(topopts RegexOptions) { | |
404 p.currentPos = 0 | |
405 p.autocap = 1 | |
406 p.ignoreNextParen = false | |
407 | |
408 if len(p.optionsStack) > 0 { | |
409 p.optionsStack = p.optionsStack[:0] | |
410 } | |
411 | |
412 p.options = topopts | |
413 p.stack = nil | |
414 } | |
415 | |
416 func (p *parser) scanRegex() (*regexNode, error) { | |
417 ch := '@' // nonspecial ch, means at beginning | |
418 isQuant := false | |
419 | |
420 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1)) | |
421 | |
422 for p.charsRight() > 0 { | |
423 wasPrevQuantifier := isQuant | |
424 isQuant = false | |
425 | |
426 if err := p.scanBlank(); err != nil { | |
427 return nil, err | |
428 } | |
429 | |
430 startpos := p.textpos() | |
431 | |
432 // move past all of the normal characters. We'll stop when we hit some kind of control character, | |
433 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace. | |
434 if p.useOptionX() { | |
435 for p.charsRight() > 0 { | |
436 ch = p.rightChar(0) | |
437 //UGLY: clean up, this is ugly | |
438 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) { | |
439 break | |
440 } | |
441 p.moveRight(1) | |
442 } | |
443 } else { | |
444 for p.charsRight() > 0 { | |
445 ch = p.rightChar(0) | |
446 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) { | |
447 break | |
448 } | |
449 p.moveRight(1) | |
450 } | |
451 } | |
452 | |
453 endpos := p.textpos() | |
454 | |
455 p.scanBlank() | |
456 | |
457 if p.charsRight() == 0 { | |
458 ch = '!' // nonspecial, means at end | |
459 } else if ch = p.rightChar(0); isSpecial(ch) { | |
460 isQuant = isQuantifier(ch) | |
461 p.moveRight(1) | |
462 } else { | |
463 ch = ' ' // nonspecial, means at ordinary char | |
464 } | |
465 | |
466 if startpos < endpos { | |
467 cchUnquantified := endpos - startpos | |
468 if isQuant { | |
469 cchUnquantified-- | |
470 } | |
471 wasPrevQuantifier = false | |
472 | |
473 if cchUnquantified > 0 { | |
474 p.addToConcatenate(startpos, cchUnquantified, false) | |
475 } | |
476 | |
477 if isQuant { | |
478 p.addUnitOne(p.charAt(endpos - 1)) | |
479 } | |
480 } | |
481 | |
482 switch ch { | |
483 case '!': | |
484 goto BreakOuterScan | |
485 | |
486 case ' ': | |
487 goto ContinueOuterScan | |
488 | |
489 case '[': | |
490 cc, err := p.scanCharSet(p.useOptionI(), false) | |
491 if err != nil { | |
492 return nil, err | |
493 } | |
494 p.addUnitSet(cc) | |
495 | |
496 case '(': | |
497 p.pushOptions() | |
498 | |
499 if grouper, err := p.scanGroupOpen(); err != nil { | |
500 return nil, err | |
501 } else if grouper == nil { | |
502 p.popKeepOptions() | |
503 } else { | |
504 p.pushGroup() | |
505 p.startGroup(grouper) | |
506 } | |
507 | |
508 continue | |
509 | |
510 case '|': | |
511 p.addAlternate() | |
512 goto ContinueOuterScan | |
513 | |
514 case ')': | |
515 if p.emptyStack() { | |
516 return nil, p.getErr(ErrUnexpectedParen) | |
517 } | |
518 | |
519 if err := p.addGroup(); err != nil { | |
520 return nil, err | |
521 } | |
522 if err := p.popGroup(); err != nil { | |
523 return nil, err | |
524 } | |
525 p.popOptions() | |
526 | |
527 if p.unit == nil { | |
528 goto ContinueOuterScan | |
529 } | |
530 | |
531 case '\\': | |
532 n, err := p.scanBackslash(false) | |
533 if err != nil { | |
534 return nil, err | |
535 } | |
536 p.addUnitNode(n) | |
537 | |
538 case '^': | |
539 if p.useOptionM() { | |
540 p.addUnitType(ntBol) | |
541 } else { | |
542 p.addUnitType(ntBeginning) | |
543 } | |
544 | |
545 case '$': | |
546 if p.useOptionM() { | |
547 p.addUnitType(ntEol) | |
548 } else { | |
549 p.addUnitType(ntEndZ) | |
550 } | |
551 | |
552 case '.': | |
553 if p.useOptionE() { | |
554 p.addUnitSet(ECMAAnyClass()) | |
555 } else if p.useOptionS() { | |
556 p.addUnitSet(AnyClass()) | |
557 } else { | |
558 p.addUnitNotone('\n') | |
559 } | |
560 | |
561 case '{', '*', '+', '?': | |
562 if p.unit == nil { | |
563 if wasPrevQuantifier { | |
564 return nil, p.getErr(ErrInvalidRepeatOp) | |
565 } else { | |
566 return nil, p.getErr(ErrMissingRepeatArgument) | |
567 } | |
568 } | |
569 p.moveLeft() | |
570 | |
571 default: | |
572 return nil, p.getErr(ErrInternalError) | |
573 } | |
574 | |
575 if err := p.scanBlank(); err != nil { | |
576 return nil, err | |
577 } | |
578 | |
579 if p.charsRight() > 0 { | |
580 isQuant = p.isTrueQuantifier() | |
581 } | |
582 if p.charsRight() == 0 || !isQuant { | |
583 //maintain odd C# assignment order -- not sure if required, could clean up? | |
584 p.addConcatenate() | |
585 goto ContinueOuterScan | |
586 } | |
587 | |
588 ch = p.moveRightGetChar() | |
589 | |
590 // Handle quantifiers | |
591 for p.unit != nil { | |
592 var min, max int | |
593 var lazy bool | |
594 | |
595 switch ch { | |
596 case '*': | |
597 min = 0 | |
598 max = math.MaxInt32 | |
599 | |
600 case '?': | |
601 min = 0 | |
602 max = 1 | |
603 | |
604 case '+': | |
605 min = 1 | |
606 max = math.MaxInt32 | |
607 | |
608 case '{': | |
609 { | |
610 var err error | |
611 startpos = p.textpos() | |
612 if min, err = p.scanDecimal(); err != nil { | |
613 return nil, err | |
614 } | |
615 max = min | |
616 if startpos < p.textpos() { | |
617 if p.charsRight() > 0 && p.rightChar(0) == ',' { | |
618 p.moveRight(1) | |
619 if p.charsRight() == 0 || p.rightChar(0) == '}' { | |
620 max = math.MaxInt32 | |
621 } else { | |
622 if max, err = p.scanDecimal(); err != nil { | |
623 return nil, err | |
624 } | |
625 } | |
626 } | |
627 } | |
628 | |
629 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' { | |
630 p.addConcatenate() | |
631 p.textto(startpos - 1) | |
632 goto ContinueOuterScan | |
633 } | |
634 } | |
635 | |
636 default: | |
637 return nil, p.getErr(ErrInternalError) | |
638 } | |
639 | |
640 if err := p.scanBlank(); err != nil { | |
641 return nil, err | |
642 } | |
643 | |
644 if p.charsRight() == 0 || p.rightChar(0) != '?' { | |
645 lazy = false | |
646 } else { | |
647 p.moveRight(1) | |
648 lazy = true | |
649 } | |
650 | |
651 if min > max { | |
652 return nil, p.getErr(ErrInvalidRepeatSize) | |
653 } | |
654 | |
655 p.addConcatenate3(lazy, min, max) | |
656 } | |
657 | |
658 ContinueOuterScan: | |
659 } | |
660 | |
661 BreakOuterScan: | |
662 ; | |
663 | |
664 if !p.emptyStack() { | |
665 return nil, p.getErr(ErrMissingParen) | |
666 } | |
667 | |
668 if err := p.addGroup(); err != nil { | |
669 return nil, err | |
670 } | |
671 | |
672 return p.unit, nil | |
673 | |
674 } | |
675 | |
676 /* | |
677 * Simple parsing for replacement patterns | |
678 */ | |
679 func (p *parser) scanReplacement() (*regexNode, error) { | |
680 var c, startpos int | |
681 | |
682 p.concatenation = newRegexNode(ntConcatenate, p.options) | |
683 | |
684 for { | |
685 c = p.charsRight() | |
686 if c == 0 { | |
687 break | |
688 } | |
689 | |
690 startpos = p.textpos() | |
691 | |
692 for c > 0 && p.rightChar(0) != '$' { | |
693 p.moveRight(1) | |
694 c-- | |
695 } | |
696 | |
697 p.addToConcatenate(startpos, p.textpos()-startpos, true) | |
698 | |
699 if c > 0 { | |
700 if p.moveRightGetChar() == '$' { | |
701 n, err := p.scanDollar() | |
702 if err != nil { | |
703 return nil, err | |
704 } | |
705 p.addUnitNode(n) | |
706 } | |
707 p.addConcatenate() | |
708 } | |
709 } | |
710 | |
711 return p.concatenation, nil | |
712 } | |
713 | |
714 /* | |
715 * Scans $ patterns recognized within replacement patterns | |
716 */ | |
717 func (p *parser) scanDollar() (*regexNode, error) { | |
718 if p.charsRight() == 0 { | |
719 return newRegexNodeCh(ntOne, p.options, '$'), nil | |
720 } | |
721 | |
722 ch := p.rightChar(0) | |
723 angled := false | |
724 backpos := p.textpos() | |
725 lastEndPos := backpos | |
726 | |
727 // Note angle | |
728 | |
729 if ch == '{' && p.charsRight() > 1 { | |
730 angled = true | |
731 p.moveRight(1) | |
732 ch = p.rightChar(0) | |
733 } | |
734 | |
735 // Try to parse backreference: \1 or \{1} or \{cap} | |
736 | |
737 if ch >= '0' && ch <= '9' { | |
738 if !angled && p.useOptionE() { | |
739 capnum := -1 | |
740 newcapnum := int(ch - '0') | |
741 p.moveRight(1) | |
742 if p.isCaptureSlot(newcapnum) { | |
743 capnum = newcapnum | |
744 lastEndPos = p.textpos() | |
745 } | |
746 | |
747 for p.charsRight() > 0 { | |
748 ch = p.rightChar(0) | |
749 if ch < '0' || ch > '9' { | |
750 break | |
751 } | |
752 digit := int(ch - '0') | |
753 if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) { | |
754 return nil, p.getErr(ErrCaptureGroupOutOfRange) | |
755 } | |
756 | |
757 newcapnum = newcapnum*10 + digit | |
758 | |
759 p.moveRight(1) | |
760 if p.isCaptureSlot(newcapnum) { | |
761 capnum = newcapnum | |
762 lastEndPos = p.textpos() | |
763 } | |
764 } | |
765 p.textto(lastEndPos) | |
766 if capnum >= 0 { | |
767 return newRegexNodeM(ntRef, p.options, capnum), nil | |
768 } | |
769 } else { | |
770 capnum, err := p.scanDecimal() | |
771 if err != nil { | |
772 return nil, err | |
773 } | |
774 if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' { | |
775 if p.isCaptureSlot(capnum) { | |
776 return newRegexNodeM(ntRef, p.options, capnum), nil | |
777 } | |
778 } | |
779 } | |
780 } else if angled && IsWordChar(ch) { | |
781 capname := p.scanCapname() | |
782 | |
783 if p.charsRight() > 0 && p.moveRightGetChar() == '}' { | |
784 if p.isCaptureName(capname) { | |
785 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil | |
786 } | |
787 } | |
788 } else if !angled { | |
789 capnum := 1 | |
790 | |
791 switch ch { | |
792 case '$': | |
793 p.moveRight(1) | |
794 return newRegexNodeCh(ntOne, p.options, '$'), nil | |
795 case '&': | |
796 capnum = 0 | |
797 case '`': | |
798 capnum = replaceLeftPortion | |
799 case '\'': | |
800 capnum = replaceRightPortion | |
801 case '+': | |
802 capnum = replaceLastGroup | |
803 case '_': | |
804 capnum = replaceWholeString | |
805 } | |
806 | |
807 if capnum != 1 { | |
808 p.moveRight(1) | |
809 return newRegexNodeM(ntRef, p.options, capnum), nil | |
810 } | |
811 } | |
812 | |
813 // unrecognized $: literalize | |
814 | |
815 p.textto(backpos) | |
816 return newRegexNodeCh(ntOne, p.options, '$'), nil | |
817 } | |
818 | |
819 // scanGroupOpen scans chars following a '(' (not counting the '('), and returns | |
820 // a RegexNode for the type of group scanned, or nil if the group | |
821 // simply changed options (?cimsx-cimsx) or was a comment (#...). | |
822 func (p *parser) scanGroupOpen() (*regexNode, error) { | |
823 var ch rune | |
824 var nt nodeType | |
825 var err error | |
826 close := '>' | |
827 start := p.textpos() | |
828 | |
829 // just return a RegexNode if we have: | |
830 // 1. "(" followed by nothing | |
831 // 2. "(x" where x != ? | |
832 // 3. "(?)" | |
833 if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) { | |
834 if p.useOptionN() || p.ignoreNextParen { | |
835 p.ignoreNextParen = false | |
836 return newRegexNode(ntGroup, p.options), nil | |
837 } | |
838 return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil | |
839 } | |
840 | |
841 p.moveRight(1) | |
842 | |
843 for { | |
844 if p.charsRight() == 0 { | |
845 break | |
846 } | |
847 | |
848 switch ch = p.moveRightGetChar(); ch { | |
849 case ':': | |
850 nt = ntGroup | |
851 | |
852 case '=': | |
853 p.options &= ^RightToLeft | |
854 nt = ntRequire | |
855 | |
856 case '!': | |
857 p.options &= ^RightToLeft | |
858 nt = ntPrevent | |
859 | |
860 case '>': | |
861 nt = ntGreedy | |
862 | |
863 case '\'': | |
864 close = '\'' | |
865 fallthrough | |
866 | |
867 case '<': | |
868 if p.charsRight() == 0 { | |
869 goto BreakRecognize | |
870 } | |
871 | |
872 switch ch = p.moveRightGetChar(); ch { | |
873 case '=': | |
874 if close == '\'' { | |
875 goto BreakRecognize | |
876 } | |
877 | |
878 p.options |= RightToLeft | |
879 nt = ntRequire | |
880 | |
881 case '!': | |
882 if close == '\'' { | |
883 goto BreakRecognize | |
884 } | |
885 | |
886 p.options |= RightToLeft | |
887 nt = ntPrevent | |
888 | |
889 default: | |
890 p.moveLeft() | |
891 capnum := -1 | |
892 uncapnum := -1 | |
893 proceed := false | |
894 | |
895 // grab part before - | |
896 | |
897 if ch >= '0' && ch <= '9' { | |
898 if capnum, err = p.scanDecimal(); err != nil { | |
899 return nil, err | |
900 } | |
901 | |
902 if !p.isCaptureSlot(capnum) { | |
903 capnum = -1 | |
904 } | |
905 | |
906 // check if we have bogus characters after the number | |
907 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { | |
908 return nil, p.getErr(ErrInvalidGroupName) | |
909 } | |
910 if capnum == 0 { | |
911 return nil, p.getErr(ErrCapNumNotZero) | |
912 } | |
913 } else if IsWordChar(ch) { | |
914 capname := p.scanCapname() | |
915 | |
916 if p.isCaptureName(capname) { | |
917 capnum = p.captureSlotFromName(capname) | |
918 } | |
919 | |
920 // check if we have bogus character after the name | |
921 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { | |
922 return nil, p.getErr(ErrInvalidGroupName) | |
923 } | |
924 } else if ch == '-' { | |
925 proceed = true | |
926 } else { | |
927 // bad group name - starts with something other than a word character and isn't a number | |
928 return nil, p.getErr(ErrInvalidGroupName) | |
929 } | |
930 | |
931 // grab part after - if any | |
932 | |
933 if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { | |
934 p.moveRight(1) | |
935 | |
936 //no more chars left, no closing char, etc | |
937 if p.charsRight() == 0 { | |
938 return nil, p.getErr(ErrInvalidGroupName) | |
939 } | |
940 | |
941 ch = p.rightChar(0) | |
942 if ch >= '0' && ch <= '9' { | |
943 if uncapnum, err = p.scanDecimal(); err != nil { | |
944 return nil, err | |
945 } | |
946 | |
947 if !p.isCaptureSlot(uncapnum) { | |
948 return nil, p.getErr(ErrUndefinedBackRef, uncapnum) | |
949 } | |
950 | |
951 // check if we have bogus characters after the number | |
952 if p.charsRight() > 0 && p.rightChar(0) != close { | |
953 return nil, p.getErr(ErrInvalidGroupName) | |
954 } | |
955 } else if IsWordChar(ch) { | |
956 uncapname := p.scanCapname() | |
957 | |
958 if !p.isCaptureName(uncapname) { | |
959 return nil, p.getErr(ErrUndefinedNameRef, uncapname) | |
960 } | |
961 uncapnum = p.captureSlotFromName(uncapname) | |
962 | |
963 // check if we have bogus character after the name | |
964 if p.charsRight() > 0 && p.rightChar(0) != close { | |
965 return nil, p.getErr(ErrInvalidGroupName) | |
966 } | |
967 } else { | |
968 // bad group name - starts with something other than a word character and isn't a number | |
969 return nil, p.getErr(ErrInvalidGroupName) | |
970 } | |
971 } | |
972 | |
973 // actually make the node | |
974 | |
975 if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close { | |
976 return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil | |
977 } | |
978 goto BreakRecognize | |
979 } | |
980 | |
981 case '(': | |
982 // alternation construct (?(...) | ) | |
983 | |
984 parenPos := p.textpos() | |
985 if p.charsRight() > 0 { | |
986 ch = p.rightChar(0) | |
987 | |
988 // check if the alternation condition is a backref | |
989 if ch >= '0' && ch <= '9' { | |
990 var capnum int | |
991 if capnum, err = p.scanDecimal(); err != nil { | |
992 return nil, err | |
993 } | |
994 if p.charsRight() > 0 && p.moveRightGetChar() == ')' { | |
995 if p.isCaptureSlot(capnum) { | |
996 return newRegexNodeM(ntTestref, p.options, capnum), nil | |
997 } | |
998 return nil, p.getErr(ErrUndefinedReference, capnum) | |
999 } | |
1000 | |
1001 return nil, p.getErr(ErrMalformedReference, capnum) | |
1002 | |
1003 } else if IsWordChar(ch) { | |
1004 capname := p.scanCapname() | |
1005 | |
1006 if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' { | |
1007 return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil | |
1008 } | |
1009 } | |
1010 } | |
1011 // not a backref | |
1012 nt = ntTestgroup | |
1013 p.textto(parenPos - 1) // jump to the start of the parentheses | |
1014 p.ignoreNextParen = true // but make sure we don't try to capture the insides | |
1015 | |
1016 charsRight := p.charsRight() | |
1017 if charsRight >= 3 && p.rightChar(1) == '?' { | |
1018 rightchar2 := p.rightChar(2) | |
1019 // disallow comments in the condition | |
1020 if rightchar2 == '#' { | |
1021 return nil, p.getErr(ErrAlternationCantHaveComment) | |
1022 } | |
1023 | |
1024 // disallow named capture group (?<..>..) in the condition | |
1025 if rightchar2 == '\'' { | |
1026 return nil, p.getErr(ErrAlternationCantCapture) | |
1027 } | |
1028 | |
1029 if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') { | |
1030 return nil, p.getErr(ErrAlternationCantCapture) | |
1031 } | |
1032 } | |
1033 | |
1034 case 'P': | |
1035 if p.useRE2() { | |
1036 // support for P<name> syntax | |
1037 if p.charsRight() < 3 { | |
1038 goto BreakRecognize | |
1039 } | |
1040 | |
1041 ch = p.moveRightGetChar() | |
1042 if ch != '<' { | |
1043 goto BreakRecognize | |
1044 } | |
1045 | |
1046 ch = p.moveRightGetChar() | |
1047 p.moveLeft() | |
1048 | |
1049 if IsWordChar(ch) { | |
1050 capnum := -1 | |
1051 capname := p.scanCapname() | |
1052 | |
1053 if p.isCaptureName(capname) { | |
1054 capnum = p.captureSlotFromName(capname) | |
1055 } | |
1056 | |
1057 // check if we have bogus character after the name | |
1058 if p.charsRight() > 0 && p.rightChar(0) != '>' { | |
1059 return nil, p.getErr(ErrInvalidGroupName) | |
1060 } | |
1061 | |
1062 // actually make the node | |
1063 | |
1064 if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' { | |
1065 return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil | |
1066 } | |
1067 goto BreakRecognize | |
1068 | |
1069 } else { | |
1070 // bad group name - starts with something other than a word character and isn't a number | |
1071 return nil, p.getErr(ErrInvalidGroupName) | |
1072 } | |
1073 } | |
1074 // if we're not using RE2 compat mode then | |
1075 // we just behave like normal | |
1076 fallthrough | |
1077 | |
1078 default: | |
1079 p.moveLeft() | |
1080 | |
1081 nt = ntGroup | |
1082 // disallow options in the children of a testgroup node | |
1083 if p.group.t != ntTestgroup { | |
1084 p.scanOptions() | |
1085 } | |
1086 if p.charsRight() == 0 { | |
1087 goto BreakRecognize | |
1088 } | |
1089 | |
1090 if ch = p.moveRightGetChar(); ch == ')' { | |
1091 return nil, nil | |
1092 } | |
1093 | |
1094 if ch != ':' { | |
1095 goto BreakRecognize | |
1096 } | |
1097 | |
1098 } | |
1099 | |
1100 return newRegexNode(nt, p.options), nil | |
1101 } | |
1102 | |
1103 BreakRecognize: | |
1104 | |
1105 // break Recognize comes here | |
1106 | |
1107 return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()])) | |
1108 } | |
1109 | |
1110 // scans backslash specials and basics | |
1111 func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) { | |
1112 | |
1113 if p.charsRight() == 0 { | |
1114 return nil, p.getErr(ErrIllegalEndEscape) | |
1115 } | |
1116 | |
1117 switch ch := p.rightChar(0); ch { | |
1118 case 'b', 'B', 'A', 'G', 'Z', 'z': | |
1119 p.moveRight(1) | |
1120 return newRegexNode(p.typeFromCode(ch), p.options), nil | |
1121 | |
1122 case 'w': | |
1123 p.moveRight(1) | |
1124 if p.useOptionE() { | |
1125 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil | |
1126 } | |
1127 return newRegexNodeSet(ntSet, p.options, WordClass()), nil | |
1128 | |
1129 case 'W': | |
1130 p.moveRight(1) | |
1131 if p.useOptionE() { | |
1132 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil | |
1133 } | |
1134 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil | |
1135 | |
1136 case 's': | |
1137 p.moveRight(1) | |
1138 if p.useOptionE() { | |
1139 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil | |
1140 } | |
1141 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil | |
1142 | |
1143 case 'S': | |
1144 p.moveRight(1) | |
1145 if p.useOptionE() { | |
1146 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil | |
1147 } | |
1148 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil | |
1149 | |
1150 case 'd': | |
1151 p.moveRight(1) | |
1152 if p.useOptionE() { | |
1153 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil | |
1154 } | |
1155 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil | |
1156 | |
1157 case 'D': | |
1158 p.moveRight(1) | |
1159 if p.useOptionE() { | |
1160 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil | |
1161 } | |
1162 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil | |
1163 | |
1164 case 'p', 'P': | |
1165 p.moveRight(1) | |
1166 prop, err := p.parseProperty() | |
1167 if err != nil { | |
1168 return nil, err | |
1169 } | |
1170 cc := &CharSet{} | |
1171 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) | |
1172 if p.useOptionI() { | |
1173 cc.addLowercase() | |
1174 } | |
1175 | |
1176 return newRegexNodeSet(ntSet, p.options, cc), nil | |
1177 | |
1178 default: | |
1179 return p.scanBasicBackslash(scanOnly) | |
1180 } | |
1181 } | |
1182 | |
1183 // Scans \-style backreferences and character escapes | |
1184 func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { | |
1185 if p.charsRight() == 0 { | |
1186 return nil, p.getErr(ErrIllegalEndEscape) | |
1187 } | |
1188 angled := false | |
1189 close := '\x00' | |
1190 | |
1191 backpos := p.textpos() | |
1192 ch := p.rightChar(0) | |
1193 | |
1194 // allow \k<foo> instead of \<foo>, which is now deprecated | |
1195 | |
1196 if ch == 'k' { | |
1197 if p.charsRight() >= 2 { | |
1198 p.moveRight(1) | |
1199 ch = p.moveRightGetChar() | |
1200 | |
1201 if ch == '<' || ch == '\'' { | |
1202 angled = true | |
1203 if ch == '\'' { | |
1204 close = '\'' | |
1205 } else { | |
1206 close = '>' | |
1207 } | |
1208 } | |
1209 } | |
1210 | |
1211 if !angled || p.charsRight() <= 0 { | |
1212 return nil, p.getErr(ErrMalformedNameRef) | |
1213 } | |
1214 | |
1215 ch = p.rightChar(0) | |
1216 | |
1217 } else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g | |
1218 angled = true | |
1219 if ch == '\'' { | |
1220 close = '\'' | |
1221 } else { | |
1222 close = '>' | |
1223 } | |
1224 | |
1225 p.moveRight(1) | |
1226 ch = p.rightChar(0) | |
1227 } | |
1228 | |
1229 // Try to parse backreference: \<1> or \<cap> | |
1230 | |
1231 if angled && ch >= '0' && ch <= '9' { | |
1232 capnum, err := p.scanDecimal() | |
1233 if err != nil { | |
1234 return nil, err | |
1235 } | |
1236 | |
1237 if p.charsRight() > 0 && p.moveRightGetChar() == close { | |
1238 if p.isCaptureSlot(capnum) { | |
1239 return newRegexNodeM(ntRef, p.options, capnum), nil | |
1240 } | |
1241 return nil, p.getErr(ErrUndefinedBackRef, capnum) | |
1242 } | |
1243 } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1 | |
1244 capnum, err := p.scanDecimal() | |
1245 if err != nil { | |
1246 return nil, err | |
1247 } | |
1248 | |
1249 if scanOnly { | |
1250 return nil, nil | |
1251 } | |
1252 | |
1253 if p.isCaptureSlot(capnum) { | |
1254 return newRegexNodeM(ntRef, p.options, capnum), nil | |
1255 } | |
1256 if capnum <= 9 && !p.useOptionE() { | |
1257 return nil, p.getErr(ErrUndefinedBackRef, capnum) | |
1258 } | |
1259 | |
1260 } else if angled && IsWordChar(ch) { | |
1261 capname := p.scanCapname() | |
1262 | |
1263 if p.charsRight() > 0 && p.moveRightGetChar() == close { | |
1264 if p.isCaptureName(capname) { | |
1265 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil | |
1266 } | |
1267 return nil, p.getErr(ErrUndefinedNameRef, capname) | |
1268 } | |
1269 } | |
1270 | |
1271 // Not backreference: must be char code | |
1272 | |
1273 p.textto(backpos) | |
1274 ch, err := p.scanCharEscape() | |
1275 if err != nil { | |
1276 return nil, err | |
1277 } | |
1278 | |
1279 if p.useOptionI() { | |
1280 ch = unicode.ToLower(ch) | |
1281 } | |
1282 | |
1283 return newRegexNodeCh(ntOne, p.options, ch), nil | |
1284 } | |
1285 | |
1286 // Scans X for \p{X} or \P{X} | |
1287 func (p *parser) parseProperty() (string, error) { | |
1288 if p.charsRight() < 3 { | |
1289 return "", p.getErr(ErrIncompleteSlashP) | |
1290 } | |
1291 ch := p.moveRightGetChar() | |
1292 if ch != '{' { | |
1293 return "", p.getErr(ErrMalformedSlashP) | |
1294 } | |
1295 | |
1296 startpos := p.textpos() | |
1297 for p.charsRight() > 0 { | |
1298 ch = p.moveRightGetChar() | |
1299 if !(IsWordChar(ch) || ch == '-') { | |
1300 p.moveLeft() | |
1301 break | |
1302 } | |
1303 } | |
1304 capname := string(p.pattern[startpos:p.textpos()]) | |
1305 | |
1306 if p.charsRight() == 0 || p.moveRightGetChar() != '}' { | |
1307 return "", p.getErr(ErrIncompleteSlashP) | |
1308 } | |
1309 | |
1310 if !isValidUnicodeCat(capname) { | |
1311 return "", p.getErr(ErrUnknownSlashP, capname) | |
1312 } | |
1313 | |
1314 return capname, nil | |
1315 } | |
1316 | |
1317 // Returns ReNode type for zero-length assertions with a \ code. | |
1318 func (p *parser) typeFromCode(ch rune) nodeType { | |
1319 switch ch { | |
1320 case 'b': | |
1321 if p.useOptionE() { | |
1322 return ntECMABoundary | |
1323 } | |
1324 return ntBoundary | |
1325 case 'B': | |
1326 if p.useOptionE() { | |
1327 return ntNonECMABoundary | |
1328 } | |
1329 return ntNonboundary | |
1330 case 'A': | |
1331 return ntBeginning | |
1332 case 'G': | |
1333 return ntStart | |
1334 case 'Z': | |
1335 return ntEndZ | |
1336 case 'z': | |
1337 return ntEnd | |
1338 default: | |
1339 return ntNothing | |
1340 } | |
1341 } | |
1342 | |
1343 // Scans whitespace or x-mode comments. | |
1344 func (p *parser) scanBlank() error { | |
1345 if p.useOptionX() { | |
1346 for { | |
1347 for p.charsRight() > 0 && isSpace(p.rightChar(0)) { | |
1348 p.moveRight(1) | |
1349 } | |
1350 | |
1351 if p.charsRight() == 0 { | |
1352 break | |
1353 } | |
1354 | |
1355 if p.rightChar(0) == '#' { | |
1356 for p.charsRight() > 0 && p.rightChar(0) != '\n' { | |
1357 p.moveRight(1) | |
1358 } | |
1359 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' && | |
1360 p.rightChar(1) == '?' && p.rightChar(0) == '(' { | |
1361 for p.charsRight() > 0 && p.rightChar(0) != ')' { | |
1362 p.moveRight(1) | |
1363 } | |
1364 if p.charsRight() == 0 { | |
1365 return p.getErr(ErrUnterminatedComment) | |
1366 } | |
1367 p.moveRight(1) | |
1368 } else { | |
1369 break | |
1370 } | |
1371 } | |
1372 } else { | |
1373 for { | |
1374 if p.charsRight() < 3 || p.rightChar(2) != '#' || | |
1375 p.rightChar(1) != '?' || p.rightChar(0) != '(' { | |
1376 return nil | |
1377 } | |
1378 | |
1379 for p.charsRight() > 0 && p.rightChar(0) != ')' { | |
1380 p.moveRight(1) | |
1381 } | |
1382 if p.charsRight() == 0 { | |
1383 return p.getErr(ErrUnterminatedComment) | |
1384 } | |
1385 p.moveRight(1) | |
1386 } | |
1387 } | |
1388 return nil | |
1389 } | |
1390 | |
1391 func (p *parser) scanCapname() string { | |
1392 startpos := p.textpos() | |
1393 | |
1394 for p.charsRight() > 0 { | |
1395 if !IsWordChar(p.moveRightGetChar()) { | |
1396 p.moveLeft() | |
1397 break | |
1398 } | |
1399 } | |
1400 | |
1401 return string(p.pattern[startpos:p.textpos()]) | |
1402 } | |
1403 | |
1404 //Scans contents of [] (not including []'s), and converts to a set. | |
1405 func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { | |
1406 ch := '\x00' | |
1407 chPrev := '\x00' | |
1408 inRange := false | |
1409 firstChar := true | |
1410 closed := false | |
1411 | |
1412 var cc *CharSet | |
1413 if !scanOnly { | |
1414 cc = &CharSet{} | |
1415 } | |
1416 | |
1417 if p.charsRight() > 0 && p.rightChar(0) == '^' { | |
1418 p.moveRight(1) | |
1419 if !scanOnly { | |
1420 cc.negate = true | |
1421 } | |
1422 } | |
1423 | |
1424 for ; p.charsRight() > 0; firstChar = false { | |
1425 fTranslatedChar := false | |
1426 ch = p.moveRightGetChar() | |
1427 if ch == ']' { | |
1428 if !firstChar { | |
1429 closed = true | |
1430 break | |
1431 } else if p.useOptionE() { | |
1432 if !scanOnly { | |
1433 cc.addRanges(NoneClass().ranges) | |
1434 } | |
1435 closed = true | |
1436 break | |
1437 } | |
1438 | |
1439 } else if ch == '\\' && p.charsRight() > 0 { | |
1440 switch ch = p.moveRightGetChar(); ch { | |
1441 case 'D', 'd': | |
1442 if !scanOnly { | |
1443 if inRange { | |
1444 return nil, p.getErr(ErrBadClassInCharRange, ch) | |
1445 } | |
1446 cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw) | |
1447 } | |
1448 continue | |
1449 | |
1450 case 'S', 's': | |
1451 if !scanOnly { | |
1452 if inRange { | |
1453 return nil, p.getErr(ErrBadClassInCharRange, ch) | |
1454 } | |
1455 cc.addSpace(p.useOptionE(), ch == 'S') | |
1456 } | |
1457 continue | |
1458 | |
1459 case 'W', 'w': | |
1460 if !scanOnly { | |
1461 if inRange { | |
1462 return nil, p.getErr(ErrBadClassInCharRange, ch) | |
1463 } | |
1464 | |
1465 cc.addWord(p.useOptionE(), ch == 'W') | |
1466 } | |
1467 continue | |
1468 | |
1469 case 'p', 'P': | |
1470 if !scanOnly { | |
1471 if inRange { | |
1472 return nil, p.getErr(ErrBadClassInCharRange, ch) | |
1473 } | |
1474 prop, err := p.parseProperty() | |
1475 if err != nil { | |
1476 return nil, err | |
1477 } | |
1478 cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) | |
1479 } else { | |
1480 p.parseProperty() | |
1481 } | |
1482 | |
1483 continue | |
1484 | |
1485 case '-': | |
1486 if !scanOnly { | |
1487 cc.addRange(ch, ch) | |
1488 } | |
1489 continue | |
1490 | |
1491 default: | |
1492 p.moveLeft() | |
1493 var err error | |
1494 ch, err = p.scanCharEscape() // non-literal character | |
1495 if err != nil { | |
1496 return nil, err | |
1497 } | |
1498 fTranslatedChar = true | |
1499 break // this break will only break out of the switch | |
1500 } | |
1501 } else if ch == '[' { | |
1502 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. | |
1503 // It currently doesn't do anything other than skip the whole thing! | |
1504 if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange { | |
1505 savePos := p.textpos() | |
1506 | |
1507 p.moveRight(1) | |
1508 negate := false | |
1509 if p.charsRight() > 1 && p.rightChar(0) == '^' { | |
1510 negate = true | |
1511 p.moveRight(1) | |
1512 } | |
1513 | |
1514 nm := p.scanCapname() // snag the name | |
1515 if !scanOnly && p.useRE2() { | |
1516 // look up the name since these are valid for RE2 | |
1517 // add the group based on the name | |
1518 if ok := cc.addNamedASCII(nm, negate); !ok { | |
1519 return nil, p.getErr(ErrInvalidCharRange) | |
1520 } | |
1521 } | |
1522 if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' { | |
1523 p.textto(savePos) | |
1524 } else if p.useRE2() { | |
1525 // move on | |
1526 continue | |
1527 } | |
1528 } | |
1529 } | |
1530 | |
1531 if inRange { | |
1532 inRange = false | |
1533 if !scanOnly { | |
1534 if ch == '[' && !fTranslatedChar && !firstChar { | |
1535 // We thought we were in a range, but we're actually starting a subtraction. | |
1536 // In that case, we'll add chPrev to our char class, skip the opening [, and | |
1537 // scan the new character class recursively. | |
1538 cc.addChar(chPrev) | |
1539 sub, err := p.scanCharSet(caseInsensitive, false) | |
1540 if err != nil { | |
1541 return nil, err | |
1542 } | |
1543 cc.addSubtraction(sub) | |
1544 | |
1545 if p.charsRight() > 0 && p.rightChar(0) != ']' { | |
1546 return nil, p.getErr(ErrSubtractionMustBeLast) | |
1547 } | |
1548 } else { | |
1549 // a regular range, like a-z | |
1550 if chPrev > ch { | |
1551 return nil, p.getErr(ErrReversedCharRange) | |
1552 } | |
1553 cc.addRange(chPrev, ch) | |
1554 } | |
1555 } | |
1556 } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' { | |
1557 // this could be the start of a range | |
1558 chPrev = ch | |
1559 inRange = true | |
1560 p.moveRight(1) | |
1561 } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar { | |
1562 // we aren't in a range, and now there is a subtraction. Usually this happens | |
1563 // only when a subtraction follows a range, like [a-z-[b]] | |
1564 if !scanOnly { | |
1565 p.moveRight(1) | |
1566 sub, err := p.scanCharSet(caseInsensitive, false) | |
1567 if err != nil { | |
1568 return nil, err | |
1569 } | |
1570 cc.addSubtraction(sub) | |
1571 | |
1572 if p.charsRight() > 0 && p.rightChar(0) != ']' { | |
1573 return nil, p.getErr(ErrSubtractionMustBeLast) | |
1574 } | |
1575 } else { | |
1576 p.moveRight(1) | |
1577 p.scanCharSet(caseInsensitive, true) | |
1578 } | |
1579 } else { | |
1580 if !scanOnly { | |
1581 cc.addRange(ch, ch) | |
1582 } | |
1583 } | |
1584 } | |
1585 | |
1586 if !closed { | |
1587 return nil, p.getErr(ErrUnterminatedBracket) | |
1588 } | |
1589 | |
1590 if !scanOnly && caseInsensitive { | |
1591 cc.addLowercase() | |
1592 } | |
1593 | |
1594 return cc, nil | |
1595 } | |
1596 | |
1597 // Scans any number of decimal digits (pegs value at 2^31-1 if too large) | |
1598 func (p *parser) scanDecimal() (int, error) { | |
1599 i := 0 | |
1600 var d int | |
1601 | |
1602 for p.charsRight() > 0 { | |
1603 d = int(p.rightChar(0) - '0') | |
1604 if d < 0 || d > 9 { | |
1605 break | |
1606 } | |
1607 p.moveRight(1) | |
1608 | |
1609 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) { | |
1610 return 0, p.getErr(ErrCaptureGroupOutOfRange) | |
1611 } | |
1612 | |
1613 i *= 10 | |
1614 i += d | |
1615 } | |
1616 | |
1617 return int(i), nil | |
1618 } | |
1619 | |
1620 // Returns true for options allowed only at the top level | |
1621 func isOnlyTopOption(option RegexOptions) bool { | |
1622 return option == RightToLeft || option == ECMAScript || option == RE2 | |
1623 } | |
1624 | |
1625 // Scans cimsx-cimsx option string, stops at the first unrecognized char. | |
1626 func (p *parser) scanOptions() { | |
1627 | |
1628 for off := false; p.charsRight() > 0; p.moveRight(1) { | |
1629 ch := p.rightChar(0) | |
1630 | |
1631 if ch == '-' { | |
1632 off = true | |
1633 } else if ch == '+' { | |
1634 off = false | |
1635 } else { | |
1636 option := optionFromCode(ch) | |
1637 if option == 0 || isOnlyTopOption(option) { | |
1638 return | |
1639 } | |
1640 | |
1641 if off { | |
1642 p.options &= ^option | |
1643 } else { | |
1644 p.options |= option | |
1645 } | |
1646 } | |
1647 } | |
1648 } | |
1649 | |
1650 // Scans \ code for escape codes that map to single unicode chars. | |
1651 func (p *parser) scanCharEscape() (r rune, err error) { | |
1652 | |
1653 ch := p.moveRightGetChar() | |
1654 | |
1655 if ch >= '0' && ch <= '7' { | |
1656 p.moveLeft() | |
1657 return p.scanOctal(), nil | |
1658 } | |
1659 | |
1660 pos := p.textpos() | |
1661 | |
1662 switch ch { | |
1663 case 'x': | |
1664 // support for \x{HEX} syntax from Perl and PCRE | |
1665 if p.charsRight() > 0 && p.rightChar(0) == '{' { | |
1666 if p.useOptionE() { | |
1667 return ch, nil | |
1668 } | |
1669 p.moveRight(1) | |
1670 return p.scanHexUntilBrace() | |
1671 } else { | |
1672 r, err = p.scanHex(2) | |
1673 } | |
1674 case 'u': | |
1675 r, err = p.scanHex(4) | |
1676 case 'a': | |
1677 return '\u0007', nil | |
1678 case 'b': | |
1679 return '\b', nil | |
1680 case 'e': | |
1681 return '\u001B', nil | |
1682 case 'f': | |
1683 return '\f', nil | |
1684 case 'n': | |
1685 return '\n', nil | |
1686 case 'r': | |
1687 return '\r', nil | |
1688 case 't': | |
1689 return '\t', nil | |
1690 case 'v': | |
1691 return '\u000B', nil | |
1692 case 'c': | |
1693 r, err = p.scanControl() | |
1694 default: | |
1695 if !p.useOptionE() && IsWordChar(ch) { | |
1696 return 0, p.getErr(ErrUnrecognizedEscape, string(ch)) | |
1697 } | |
1698 return ch, nil | |
1699 } | |
1700 if err != nil && p.useOptionE() { | |
1701 p.textto(pos) | |
1702 return ch, nil | |
1703 } | |
1704 return | |
1705 } | |
1706 | |
1707 // Grabs and converts an ascii control character | |
1708 func (p *parser) scanControl() (rune, error) { | |
1709 if p.charsRight() <= 0 { | |
1710 return 0, p.getErr(ErrMissingControl) | |
1711 } | |
1712 | |
1713 ch := p.moveRightGetChar() | |
1714 | |
1715 // \ca interpreted as \cA | |
1716 | |
1717 if ch >= 'a' && ch <= 'z' { | |
1718 ch = (ch - ('a' - 'A')) | |
1719 } | |
1720 ch = (ch - '@') | |
1721 if ch >= 0 && ch < ' ' { | |
1722 return ch, nil | |
1723 } | |
1724 | |
1725 return 0, p.getErr(ErrUnrecognizedControl) | |
1726 | |
1727 } | |
1728 | |
1729 // Scan hex digits until we hit a closing brace. | |
1730 // Non-hex digits, hex value too large for UTF-8, or running out of chars are errors | |
1731 func (p *parser) scanHexUntilBrace() (rune, error) { | |
1732 // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit | |
1733 // so we can enforce that | |
1734 i := 0 | |
1735 hasContent := false | |
1736 | |
1737 for p.charsRight() > 0 { | |
1738 ch := p.moveRightGetChar() | |
1739 if ch == '}' { | |
1740 // hit our close brace, we're done here | |
1741 // prevent \x{} | |
1742 if !hasContent { | |
1743 return 0, p.getErr(ErrTooFewHex) | |
1744 } | |
1745 return rune(i), nil | |
1746 } | |
1747 hasContent = true | |
1748 // no brace needs to be hex digit | |
1749 d := hexDigit(ch) | |
1750 if d < 0 { | |
1751 return 0, p.getErr(ErrMissingBrace) | |
1752 } | |
1753 | |
1754 i *= 0x10 | |
1755 i += d | |
1756 | |
1757 if i > unicode.MaxRune { | |
1758 return 0, p.getErr(ErrInvalidHex) | |
1759 } | |
1760 } | |
1761 | |
1762 // we only make it here if we run out of digits without finding the brace | |
1763 return 0, p.getErr(ErrMissingBrace) | |
1764 } | |
1765 | |
1766 // Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) | |
1767 func (p *parser) scanHex(c int) (rune, error) { | |
1768 | |
1769 i := 0 | |
1770 | |
1771 if p.charsRight() >= c { | |
1772 for c > 0 { | |
1773 d := hexDigit(p.moveRightGetChar()) | |
1774 if d < 0 { | |
1775 break | |
1776 } | |
1777 i *= 0x10 | |
1778 i += d | |
1779 c-- | |
1780 } | |
1781 } | |
1782 | |
1783 if c > 0 { | |
1784 return 0, p.getErr(ErrTooFewHex) | |
1785 } | |
1786 | |
1787 return rune(i), nil | |
1788 } | |
1789 | |
// hexDigit returns the numeric value (0..15) of the hex digit ch, accepting
// both upper- and lower-case letters, or -1 when ch is not a hex digit.
func hexDigit(ch rune) int {
	switch {
	case ch >= '0' && ch <= '9':
		return int(ch - '0')
	case ch >= 'a' && ch <= 'f':
		return int(ch-'a') + 0xa
	case ch >= 'A' && ch <= 'F':
		return int(ch-'A') + 0xa
	}
	return -1
}
1807 | |
1808 // Scans up to three octal digits (stops before exceeding 0377). | |
1809 func (p *parser) scanOctal() rune { | |
1810 // Consume octal chars only up to 3 digits and value 0377 | |
1811 | |
1812 c := 3 | |
1813 | |
1814 if c > p.charsRight() { | |
1815 c = p.charsRight() | |
1816 } | |
1817 | |
1818 //we know the first char is good because the caller had to check | |
1819 i := 0 | |
1820 d := int(p.rightChar(0) - '0') | |
1821 for c > 0 && d <= 7 && d >= 0 { | |
1822 if i >= 0x20 && p.useOptionE() { | |
1823 break | |
1824 } | |
1825 i *= 8 | |
1826 i += d | |
1827 c-- | |
1828 | |
1829 p.moveRight(1) | |
1830 if !p.rightMost() { | |
1831 d = int(p.rightChar(0) - '0') | |
1832 } | |
1833 } | |
1834 | |
1835 // Octal codes only go up to 255. Any larger and the behavior that Perl follows | |
1836 // is simply to truncate the high bits. | |
1837 i &= 0xFF | |
1838 | |
1839 return rune(i) | |
1840 } | |
1841 | |
1842 // Returns the current parsing position. | |
1843 func (p *parser) textpos() int { | |
1844 return p.currentPos | |
1845 } | |
1846 | |
1847 // Zaps to a specific parsing position. | |
1848 func (p *parser) textto(pos int) { | |
1849 p.currentPos = pos | |
1850 } | |
1851 | |
1852 // Returns the char at the right of the current parsing position and advances to the right. | |
1853 func (p *parser) moveRightGetChar() rune { | |
1854 ch := p.pattern[p.currentPos] | |
1855 p.currentPos++ | |
1856 return ch | |
1857 } | |
1858 | |
1859 // Moves the current position to the right. | |
1860 func (p *parser) moveRight(i int) { | |
1861 // default would be 1 | |
1862 p.currentPos += i | |
1863 } | |
1864 | |
1865 // Moves the current parsing position one to the left. | |
1866 func (p *parser) moveLeft() { | |
1867 p.currentPos-- | |
1868 } | |
1869 | |
1870 // Returns the char left of the current parsing position. | |
1871 func (p *parser) charAt(i int) rune { | |
1872 return p.pattern[i] | |
1873 } | |
1874 | |
1875 // Returns the char i chars right of the current parsing position. | |
1876 func (p *parser) rightChar(i int) rune { | |
1877 // default would be 0 | |
1878 return p.pattern[p.currentPos+i] | |
1879 } | |
1880 | |
1881 // Number of characters to the right of the current parsing position. | |
1882 func (p *parser) charsRight() int { | |
1883 return len(p.pattern) - p.currentPos | |
1884 } | |
1885 | |
1886 func (p *parser) rightMost() bool { | |
1887 return p.currentPos == len(p.pattern) | |
1888 } | |
1889 | |
1890 // Looks up the slot number for a given name | |
1891 func (p *parser) captureSlotFromName(capname string) int { | |
1892 return p.capnames[capname] | |
1893 } | |
1894 | |
1895 // True if the capture slot was noted | |
1896 func (p *parser) isCaptureSlot(i int) bool { | |
1897 if p.caps != nil { | |
1898 _, ok := p.caps[i] | |
1899 return ok | |
1900 } | |
1901 | |
1902 return (i >= 0 && i < p.capsize) | |
1903 } | |
1904 | |
1905 // Looks up the slot number for a given name | |
1906 func (p *parser) isCaptureName(capname string) bool { | |
1907 if p.capnames == nil { | |
1908 return false | |
1909 } | |
1910 | |
1911 _, ok := p.capnames[capname] | |
1912 return ok | |
1913 } | |
1914 | |
1915 // option shortcuts | |
1916 | |
1917 // True if N option disabling '(' autocapture is on. | |
1918 func (p *parser) useOptionN() bool { | |
1919 return (p.options & ExplicitCapture) != 0 | |
1920 } | |
1921 | |
1922 // True if I option enabling case-insensitivity is on. | |
1923 func (p *parser) useOptionI() bool { | |
1924 return (p.options & IgnoreCase) != 0 | |
1925 } | |
1926 | |
1927 // True if M option altering meaning of $ and ^ is on. | |
1928 func (p *parser) useOptionM() bool { | |
1929 return (p.options & Multiline) != 0 | |
1930 } | |
1931 | |
1932 // True if S option altering meaning of . is on. | |
1933 func (p *parser) useOptionS() bool { | |
1934 return (p.options & Singleline) != 0 | |
1935 } | |
1936 | |
1937 // True if X option enabling whitespace/comment mode is on. | |
1938 func (p *parser) useOptionX() bool { | |
1939 return (p.options & IgnorePatternWhitespace) != 0 | |
1940 } | |
1941 | |
1942 // True if E option enabling ECMAScript behavior on. | |
1943 func (p *parser) useOptionE() bool { | |
1944 return (p.options & ECMAScript) != 0 | |
1945 } | |
1946 | |
1947 // true to use RE2 compatibility parsing behavior. | |
1948 func (p *parser) useRE2() bool { | |
1949 return (p.options & RE2) != 0 | |
1950 } | |
1951 | |
1952 // True if options stack is empty. | |
1953 func (p *parser) emptyOptionsStack() bool { | |
1954 return len(p.optionsStack) == 0 | |
1955 } | |
1956 | |
1957 // Finish the current quantifiable (when a quantifier is not found or is not possible) | |
1958 func (p *parser) addConcatenate() { | |
1959 // The first (| inside a Testgroup group goes directly to the group | |
1960 p.concatenation.addChild(p.unit) | |
1961 p.unit = nil | |
1962 } | |
1963 | |
1964 // Finish the current quantifiable (when a quantifier is found) | |
1965 func (p *parser) addConcatenate3(lazy bool, min, max int) { | |
1966 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max)) | |
1967 p.unit = nil | |
1968 } | |
1969 | |
1970 // Sets the current unit to a single char node | |
1971 func (p *parser) addUnitOne(ch rune) { | |
1972 if p.useOptionI() { | |
1973 ch = unicode.ToLower(ch) | |
1974 } | |
1975 | |
1976 p.unit = newRegexNodeCh(ntOne, p.options, ch) | |
1977 } | |
1978 | |
1979 // Sets the current unit to a single inverse-char node | |
1980 func (p *parser) addUnitNotone(ch rune) { | |
1981 if p.useOptionI() { | |
1982 ch = unicode.ToLower(ch) | |
1983 } | |
1984 | |
1985 p.unit = newRegexNodeCh(ntNotone, p.options, ch) | |
1986 } | |
1987 | |
1988 // Sets the current unit to a single set node | |
1989 func (p *parser) addUnitSet(set *CharSet) { | |
1990 p.unit = newRegexNodeSet(ntSet, p.options, set) | |
1991 } | |
1992 | |
1993 // Sets the current unit to a subtree | |
1994 func (p *parser) addUnitNode(node *regexNode) { | |
1995 p.unit = node | |
1996 } | |
1997 | |
1998 // Sets the current unit to an assertion of the specified type | |
1999 func (p *parser) addUnitType(t nodeType) { | |
2000 p.unit = newRegexNode(t, p.options) | |
2001 } | |
2002 | |
2003 // Finish the current group (in response to a ')' or end) | |
2004 func (p *parser) addGroup() error { | |
2005 if p.group.t == ntTestgroup || p.group.t == ntTestref { | |
2006 p.group.addChild(p.concatenation.reverseLeft()) | |
2007 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 { | |
2008 return p.getErr(ErrTooManyAlternates) | |
2009 } | |
2010 } else { | |
2011 p.alternation.addChild(p.concatenation.reverseLeft()) | |
2012 p.group.addChild(p.alternation) | |
2013 } | |
2014 | |
2015 p.unit = p.group | |
2016 return nil | |
2017 } | |
2018 | |
2019 // Pops the option stack, but keeps the current options unchanged. | |
2020 func (p *parser) popKeepOptions() { | |
2021 lastIdx := len(p.optionsStack) - 1 | |
2022 p.optionsStack = p.optionsStack[:lastIdx] | |
2023 } | |
2024 | |
2025 // Recalls options from the stack. | |
2026 func (p *parser) popOptions() { | |
2027 lastIdx := len(p.optionsStack) - 1 | |
2028 // get the last item on the stack and then remove it by reslicing | |
2029 p.options = p.optionsStack[lastIdx] | |
2030 p.optionsStack = p.optionsStack[:lastIdx] | |
2031 } | |
2032 | |
2033 // Saves options on a stack. | |
2034 func (p *parser) pushOptions() { | |
2035 p.optionsStack = append(p.optionsStack, p.options) | |
2036 } | |
2037 | |
2038 // Add a string to the last concatenate. | |
2039 func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) { | |
2040 var node *regexNode | |
2041 | |
2042 if cch == 0 { | |
2043 return | |
2044 } | |
2045 | |
2046 if cch > 1 { | |
2047 str := p.pattern[pos : pos+cch] | |
2048 | |
2049 if p.useOptionI() && !isReplacement { | |
2050 // We do the ToLower character by character for consistency. With surrogate chars, doing | |
2051 // a ToLower on the entire string could actually change the surrogate pair. This is more correct | |
2052 // linguistically, but since Regex doesn't support surrogates, it's more important to be | |
2053 // consistent. | |
2054 for i := 0; i < len(str); i++ { | |
2055 str[i] = unicode.ToLower(str[i]) | |
2056 } | |
2057 } | |
2058 | |
2059 node = newRegexNodeStr(ntMulti, p.options, str) | |
2060 } else { | |
2061 ch := p.charAt(pos) | |
2062 | |
2063 if p.useOptionI() && !isReplacement { | |
2064 ch = unicode.ToLower(ch) | |
2065 } | |
2066 | |
2067 node = newRegexNodeCh(ntOne, p.options, ch) | |
2068 } | |
2069 | |
2070 p.concatenation.addChild(node) | |
2071 } | |
2072 | |
2073 // Push the parser state (in response to an open paren) | |
2074 func (p *parser) pushGroup() { | |
2075 p.group.next = p.stack | |
2076 p.alternation.next = p.group | |
2077 p.concatenation.next = p.alternation | |
2078 p.stack = p.concatenation | |
2079 } | |
2080 | |
2081 // Remember the pushed state (in response to a ')') | |
2082 func (p *parser) popGroup() error { | |
2083 p.concatenation = p.stack | |
2084 p.alternation = p.concatenation.next | |
2085 p.group = p.alternation.next | |
2086 p.stack = p.group.next | |
2087 | |
2088 // The first () inside a Testgroup group goes directly to the group | |
2089 if p.group.t == ntTestgroup && len(p.group.children) == 0 { | |
2090 if p.unit == nil { | |
2091 return p.getErr(ErrConditionalExpression) | |
2092 } | |
2093 | |
2094 p.group.addChild(p.unit) | |
2095 p.unit = nil | |
2096 } | |
2097 return nil | |
2098 } | |
2099 | |
2100 // True if the group stack is empty. | |
2101 func (p *parser) emptyStack() bool { | |
2102 return p.stack == nil | |
2103 } | |
2104 | |
2105 // Start a new round for the parser state (in response to an open paren or string start) | |
2106 func (p *parser) startGroup(openGroup *regexNode) { | |
2107 p.group = openGroup | |
2108 p.alternation = newRegexNode(ntAlternate, p.options) | |
2109 p.concatenation = newRegexNode(ntConcatenate, p.options) | |
2110 } | |
2111 | |
2112 // Finish the current concatenation (in response to a |) | |
2113 func (p *parser) addAlternate() { | |
2114 // The | parts inside a Testgroup group go directly to the group | |
2115 | |
2116 if p.group.t == ntTestgroup || p.group.t == ntTestref { | |
2117 p.group.addChild(p.concatenation.reverseLeft()) | |
2118 } else { | |
2119 p.alternation.addChild(p.concatenation.reverseLeft()) | |
2120 } | |
2121 | |
2122 p.concatenation = newRegexNode(ntConcatenate, p.options) | |
2123 } | |
2124 | |
// Categories assigned to ASCII characters in _category. The numeric order
// matters: the lookups in this file compare with >= against these values.
const (
	Q byte = 5 // quantifier
	S      = 4 // ordinary stopper
	Z      = 3 // ScanBlank stopper
	X      = 2 // whitespace
	E      = 1 // should be escaped
)

// _category holds one category per ASCII code point, 32 entries per row.
// Entries of 0 mark characters without any special category.
var _category = []byte{
	// 0x00 - 0x1F: control characters (tab, LF, VT, FF and CR are whitespace)
	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	// 0x20 - 0x3F: space ! " # $ % & ' ( ) * + , - . / 0-9 : ; < = > ?
	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
	// 0x40 - 0x5F: @ A-Z [ \ ] ^ _
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
	// 0x60 - 0x7F: ` a-z { | } ~ DEL
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
}
2145 | |
2146 func isSpace(ch rune) bool { | |
2147 return (ch <= ' ' && _category[ch] == X) | |
2148 } | |
2149 | |
2150 // Returns true for those characters that terminate a string of ordinary chars. | |
2151 func isSpecial(ch rune) bool { | |
2152 return (ch <= '|' && _category[ch] >= S) | |
2153 } | |
2154 | |
2155 // Returns true for those characters that terminate a string of ordinary chars. | |
2156 func isStopperX(ch rune) bool { | |
2157 return (ch <= '|' && _category[ch] >= X) | |
2158 } | |
2159 | |
2160 // Returns true for those characters that begin a quantifier. | |
2161 func isQuantifier(ch rune) bool { | |
2162 return (ch <= '{' && _category[ch] >= Q) | |
2163 } | |
2164 | |
2165 func (p *parser) isTrueQuantifier() bool { | |
2166 nChars := p.charsRight() | |
2167 if nChars == 0 { | |
2168 return false | |
2169 } | |
2170 | |
2171 startpos := p.textpos() | |
2172 ch := p.charAt(startpos) | |
2173 if ch != '{' { | |
2174 return ch <= '{' && _category[ch] >= Q | |
2175 } | |
2176 | |
2177 //UGLY: this is ugly -- the original code was ugly too | |
2178 pos := startpos | |
2179 for { | |
2180 nChars-- | |
2181 if nChars <= 0 { | |
2182 break | |
2183 } | |
2184 pos++ | |
2185 ch = p.charAt(pos) | |
2186 if ch < '0' || ch > '9' { | |
2187 break | |
2188 } | |
2189 } | |
2190 | |
2191 if nChars == 0 || pos-startpos == 1 { | |
2192 return false | |
2193 } | |
2194 if ch == '}' { | |
2195 return true | |
2196 } | |
2197 if ch != ',' { | |
2198 return false | |
2199 } | |
2200 for { | |
2201 nChars-- | |
2202 if nChars <= 0 { | |
2203 break | |
2204 } | |
2205 pos++ | |
2206 ch = p.charAt(pos) | |
2207 if ch < '0' || ch > '9' { | |
2208 break | |
2209 } | |
2210 } | |
2211 | |
2212 return nChars > 0 && ch == '}' | |
2213 } |