Mercurial > yakumo_izuru > aya

diff vendor/github.com/dlclark/regexp2/syntax/parser.go @ 66:787b5ee0289d draft
Use vendored modules Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author: yakumo.izuru
date: Sun, 23 Jul 2023 13:18:53 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vendor/github.com/dlclark/regexp2/syntax/parser.go	Sun Jul 23 13:18:53 2023 +0000
@@ -0,0 +1,2213 @@
+package syntax
+
+import (
+	"fmt"
+	"math"
+	"os"
+	"sort"
+	"strconv"
+	"unicode"
+)
+
+type RegexOptions int32
+
+const (
+	IgnoreCase              RegexOptions = 0x0001 // "i"
+	Multiline                            = 0x0002 // "m"
+	ExplicitCapture                      = 0x0004 // "n"
+	Compiled                             = 0x0008 // "c"
+	Singleline                           = 0x0010 // "s"
+	IgnorePatternWhitespace              = 0x0020 // "x"
+	RightToLeft                          = 0x0040 // "r"
+	Debug                                = 0x0080 // "d"
+	ECMAScript                           = 0x0100 // "e"
+	RE2                                  = 0x0200 // RE2 compat mode
+)
+
+func optionFromCode(ch rune) RegexOptions {
+	// case-insensitive
+	switch ch {
+	case 'i', 'I':
+		return IgnoreCase
+	case 'r', 'R':
+		return RightToLeft
+	case 'm', 'M':
+		return Multiline
+	case 'n', 'N':
+		return ExplicitCapture
+	case 's', 'S':
+		return Singleline
+	case 'x', 'X':
+		return IgnorePatternWhitespace
+	case 'd', 'D':
+		return Debug
+	case 'e', 'E':
+		return ECMAScript
+	default:
+		return 0
+	}
+}
+
+// An Error describes a failure to parse a regular expression
+// and gives the offending expression.
+type Error struct {
+	Code ErrorCode
+	Expr string
+	Args []interface{}
+}
+
+func (e *Error) Error() string {
+	if len(e.Args) == 0 {
+		return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
+	}
+	return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
+}
+
+// An ErrorCode describes a failure to parse a regular expression.
+type ErrorCode string
+
+const (
+	// internal issue
+	ErrInternalError ErrorCode = "regexp/syntax: internal error"
+	// Parser errors
+	ErrUnterminatedComment        = "unterminated comment"
+	ErrInvalidCharRange           = "invalid character class range"
+	ErrInvalidRepeatSize          = "invalid repeat count"
+	ErrInvalidUTF8                = "invalid UTF-8"
+	ErrCaptureGroupOutOfRange     = "capture group number out of range"
+	ErrUnexpectedParen            = "unexpected )"
+	ErrMissingParen               = "missing closing )"
+	ErrMissingBrace               = "missing closing }"
+	ErrInvalidRepeatOp            = "invalid nested repetition operator"
+	ErrMissingRepeatArgument      = "missing argument to repetition operator"
+	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
+	ErrTooManyAlternates          = "too many | in (?()|)"
+	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
+	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
+	ErrCapNumNotZero              = "capture number cannot be zero"
+	ErrUndefinedBackRef           = "reference to undefined group number %v"
+	ErrUndefinedNameRef           = "reference to undefined group name %v"
+	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
+	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
+	ErrMalformedReference         = "(?(%v) ) malformed"
+	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
+	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
+	ErrMalformedSlashP            = "malformed \\p{X} character escape"
+	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
+	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
+	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
+	ErrMissingControl             = "missing control character"
+	ErrUnrecognizedControl        = "unrecognized control character"
+	ErrTooFewHex                  = "insufficient hexadecimal digits"
+	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
+	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
+	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
+	ErrUnterminatedBracket        = "unterminated [] set"
+	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
+	ErrReversedCharRange          = "[x-y] range in reverse order"
+)
+
+func (e ErrorCode) String() string {
+	return string(e)
+}
+
+type parser struct {
+	stack         *regexNode
+	group         *regexNode
+	alternation   *regexNode
+	concatenation *regexNode
+	unit          *regexNode
+
+	patternRaw string
+	pattern    []rune
+
+	currentPos  int
+	specialCase *unicode.SpecialCase
+
+	autocap  int
+	capcount int
+	captop   int
+	capsize  int
+
+	caps     map[int]int
+	capnames map[string]int
+
+	capnumlist  []int
+	capnamelist []string
+
+	options         RegexOptions
+	optionsStack    []RegexOptions
+	ignoreNextParen bool
+}
+
+const (
+	maxValueDiv10 int = math.MaxInt32 / 10
+	maxValueMod10     = math.MaxInt32 % 10
+)
+
+// Parse converts a regex string into a parse tree
+func Parse(re string, op RegexOptions) (*RegexTree, error) {
+	p := parser{
+		options: op,
+		caps:    make(map[int]int),
+	}
+	p.setPattern(re)
+
+	if err := p.countCaptures(); err != nil {
+		return nil, err
+	}
+
+	p.reset(op)
+	root, err := p.scanRegex()
+
+	if err != nil {
+		return nil, err
+	}
+	tree := &RegexTree{
+		root:       root,
+		caps:       p.caps,
+		capnumlist: p.capnumlist,
+		captop:     p.captop,
+		Capnames:   p.capnames,
+		Caplist:    p.capnamelist,
+		options:    op,
+	}
+
+	if tree.options&Debug > 0 {
+		os.Stdout.WriteString(tree.Dump())
+	}
+
+	return tree, nil
+}
+
+func (p *parser) setPattern(pattern string) {
+	p.patternRaw = pattern
+	p.pattern = make([]rune, 0, len(pattern))
+
+	//populate our rune array to handle utf8 encoding
+	for _, r := range pattern {
+		p.pattern = append(p.pattern, r)
+	}
+}
+func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
+	return &Error{Code: code, Expr: p.patternRaw, Args: args}
+}
+
+func (p *parser) noteCaptureSlot(i, pos int) {
+	if _, ok := p.caps[i]; !ok {
+		// the rhs of the hashtable isn't used in the parser
+		p.caps[i] = pos
+		p.capcount++
+
+		if p.captop <= i {
+			if i == math.MaxInt32 {
+				p.captop = i
+			} else {
+				p.captop = i + 1
+			}
+		}
+	}
+}
+
+func (p *parser) noteCaptureName(name string, pos int) {
+	if p.capnames == nil {
+		p.capnames = make(map[string]int)
+	}
+
+	if _, ok := p.capnames[name]; !ok {
+		p.capnames[name] = pos
+		p.capnamelist = append(p.capnamelist, name)
+	}
+}
+
+func (p *parser) assignNameSlots() {
+	if p.capnames != nil {
+		for _, name := range p.capnamelist {
+			for p.isCaptureSlot(p.autocap) {
+				p.autocap++
+			}
+			pos := p.capnames[name]
+			p.capnames[name] = p.autocap
+			p.noteCaptureSlot(p.autocap, pos)
+
+			p.autocap++
+		}
+	}
+
+	// if the caps array has at least one gap, construct the list of used slots
+	if p.capcount < p.captop {
+		p.capnumlist = make([]int, p.capcount)
+		i := 0
+
+		for k := range p.caps {
+			p.capnumlist[i] = k
+			i++
+		}
+
+		sort.Ints(p.capnumlist)
+	}
+
+	// merge capsnumlist into capnamelist
+	if p.capnames != nil || p.capnumlist != nil {
+		var oldcapnamelist []string
+		var next int
+		var k int
+
+		if p.capnames == nil {
+			oldcapnamelist = nil
+			p.capnames = make(map[string]int)
+			p.capnamelist = []string{}
+			next = -1
+		} else {
+			oldcapnamelist = p.capnamelist
+			p.capnamelist = []string{}
+			next = p.capnames[oldcapnamelist[0]]
+		}
+
+		for i := 0; i < p.capcount; i++ {
+			j := i
+			if p.capnumlist != nil {
+				j = p.capnumlist[i]
+			}
+
+			if next == j {
+				p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
+				k++
+
+				if k == len(oldcapnamelist) {
+					next = -1
+				} else {
+					next = p.capnames[oldcapnamelist[k]]
+				}
+
+			} else {
+				//feature: culture?
+				str := strconv.Itoa(j)
+				p.capnamelist = append(p.capnamelist, str)
+				p.capnames[str] = j
+			}
+		}
+	}
+}
+
+func (p *parser) consumeAutocap() int {
+	r := p.autocap
+	p.autocap++
+	return r
+}
+
+// CountCaptures is a prescanner for deducing the slots used for
+// captures by doing a partial tokenization of the pattern.
+func (p *parser) countCaptures() error {
+	var ch rune
+
+	p.noteCaptureSlot(0, 0)
+
+	p.autocap = 1
+
+	for p.charsRight() > 0 {
+		pos := p.textpos()
+		ch = p.moveRightGetChar()
+		switch ch {
+		case '\\':
+			if p.charsRight() > 0 {
+				p.scanBackslash(true)
+			}
+
+		case '#':
+			if p.useOptionX() {
+				p.moveLeft()
+				p.scanBlank()
+			}
+
+		case '[':
+			p.scanCharSet(false, true)
+
+		case ')':
+			if !p.emptyOptionsStack() {
+				p.popOptions()
+			}
+
+		case '(':
+			if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
+				p.moveLeft()
+				p.scanBlank()
+			} else {
+				p.pushOptions()
+				if p.charsRight() > 0 && p.rightChar(0) == '?' {
+					// we have (?...
+					p.moveRight(1)
+
+					if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
+						// named group: (?<... or (?'...
+
+						p.moveRight(1)
+						ch = p.rightChar(0)
+
+						if ch != '0' && IsWordChar(ch) {
+							if ch >= '1' && ch <= '9' {
+								dec, err := p.scanDecimal()
+								if err != nil {
+									return err
+								}
+								p.noteCaptureSlot(dec, pos)
+							} else {
+								p.noteCaptureName(p.scanCapname(), pos)
+							}
+						}
+					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
+						// RE2-compat (?P<)
+						p.moveRight(2)
+						ch = p.rightChar(0)
+						if IsWordChar(ch) {
+							p.noteCaptureName(p.scanCapname(), pos)
+						}
+
+					} else {
+						// (?...
+
+						// get the options if it's an option construct (?cimsx-cimsx...)
+						p.scanOptions()
+
+						if p.charsRight() > 0 {
+							if p.rightChar(0) == ')' {
+								// (?cimsx-cimsx)
+								p.moveRight(1)
+								p.popKeepOptions()
+							} else if p.rightChar(0) == '(' {
+								// alternation construct: (?(foo)yes|no)
+								// ignore the next paren so we don't capture the condition
+								p.ignoreNextParen = true
+
+								// break from here so we don't reset ignoreNextParen
+								continue
+							}
+						}
+					}
+				} else {
+					if !p.useOptionN() && !p.ignoreNextParen {
+						p.noteCaptureSlot(p.consumeAutocap(), pos)
+					}
+				}
+			}
+
+			p.ignoreNextParen = false
+
+		}
+	}
+
+	p.assignNameSlots()
+	return nil
+}
+
+func (p *parser) reset(topopts RegexOptions) {
+	p.currentPos = 0
+	p.autocap = 1
+	p.ignoreNextParen = false
+
+	if len(p.optionsStack) > 0 {
+		p.optionsStack = p.optionsStack[:0]
+	}
+
+	p.options = topopts
+	p.stack = nil
+}
+
+func (p *parser) scanRegex() (*regexNode, error) {
+	ch := '@' // nonspecial ch, means at beginning
+	isQuant := false
+
+	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
+
+	for p.charsRight() > 0 {
+		wasPrevQuantifier := isQuant
+		isQuant = false
+
+		if err := p.scanBlank(); err != nil {
+			return nil, err
+		}
+
+		startpos := p.textpos()
+
+		// move past all of the normal characters.  We'll stop when we hit some kind of control character,
+		// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
+		if p.useOptionX() {
+			for p.charsRight() > 0 {
+				ch = p.rightChar(0)
+				//UGLY: clean up, this is ugly
+				if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
+					break
+				}
+				p.moveRight(1)
+			}
+		} else {
+			for p.charsRight() > 0 {
+				ch = p.rightChar(0)
+				if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
+					break
+				}
+				p.moveRight(1)
+			}
+		}
+
+		endpos := p.textpos()
+
+		p.scanBlank()
+
+		if p.charsRight() == 0 {
+			ch = '!' // nonspecial, means at end
+		} else if ch = p.rightChar(0); isSpecial(ch) {
+			isQuant = isQuantifier(ch)
+			p.moveRight(1)
+		} else {
+			ch = ' ' // nonspecial, means at ordinary char
+		}
+
+		if startpos < endpos {
+			cchUnquantified := endpos - startpos
+			if isQuant {
+				cchUnquantified--
+			}
+			wasPrevQuantifier = false
+
+			if cchUnquantified > 0 {
+				p.addToConcatenate(startpos, cchUnquantified, false)
+			}
+
+			if isQuant {
+				p.addUnitOne(p.charAt(endpos - 1))
+			}
+		}
+
+		switch ch {
+		case '!':
+			goto BreakOuterScan
+
+		case ' ':
+			goto ContinueOuterScan
+
+		case '[':
+			cc, err := p.scanCharSet(p.useOptionI(), false)
+			if err != nil {
+				return nil, err
+			}
+			p.addUnitSet(cc)
+
+		case '(':
+			p.pushOptions()
+
+			if grouper, err := p.scanGroupOpen(); err != nil {
+				return nil, err
+			} else if grouper == nil {
+				p.popKeepOptions()
+			} else {
+				p.pushGroup()
+				p.startGroup(grouper)
+			}
+
+			continue
+
+		case '|':
+			p.addAlternate()
+			goto ContinueOuterScan
+
+		case ')':
+			if p.emptyStack() {
+				return nil, p.getErr(ErrUnexpectedParen)
+			}
+
+			if err := p.addGroup(); err != nil {
+				return nil, err
+			}
+			if err := p.popGroup(); err != nil {
+				return nil, err
+			}
+			p.popOptions()
+
+			if p.unit == nil {
+				goto ContinueOuterScan
+			}
+
+		case '\\':
+			n, err := p.scanBackslash(false)
+			if err != nil {
+				return nil, err
+			}
+			p.addUnitNode(n)
+
+		case '^':
+			if p.useOptionM() {
+				p.addUnitType(ntBol)
+			} else {
+				p.addUnitType(ntBeginning)
+			}
+
+		case '$':
+			if p.useOptionM() {
+				p.addUnitType(ntEol)
+			} else {
+				p.addUnitType(ntEndZ)
+			}
+
+		case '.':
+			if p.useOptionE() {
+				p.addUnitSet(ECMAAnyClass())
+			} else if p.useOptionS() {
+				p.addUnitSet(AnyClass())
+			} else {
+				p.addUnitNotone('\n')
+			}
+
+		case '{', '*', '+', '?':
+			if p.unit == nil {
+				if wasPrevQuantifier {
+					return nil, p.getErr(ErrInvalidRepeatOp)
+				} else {
+					return nil, p.getErr(ErrMissingRepeatArgument)
+				}
+			}
+			p.moveLeft()
+
+		default:
+			return nil, p.getErr(ErrInternalError)
+		}
+
+		if err := p.scanBlank(); err != nil {
+			return nil, err
+		}
+
+		if p.charsRight() > 0 {
+			isQuant = p.isTrueQuantifier()
+		}
+		if p.charsRight() == 0 || !isQuant {
+			//maintain odd C# assignment order -- not sure if required, could clean up?
+			p.addConcatenate()
+			goto ContinueOuterScan
+		}
+
+		ch = p.moveRightGetChar()
+
+		// Handle quantifiers
+		for p.unit != nil {
+			var min, max int
+			var lazy bool
+
+			switch ch {
+			case '*':
+				min = 0
+				max = math.MaxInt32
+
+			case '?':
+				min = 0
+				max = 1
+
+			case '+':
+				min = 1
+				max = math.MaxInt32
+
+			case '{':
+				{
+					var err error
+					startpos = p.textpos()
+					if min, err = p.scanDecimal(); err != nil {
+						return nil, err
+					}
+					max = min
+					if startpos < p.textpos() {
+						if p.charsRight() > 0 && p.rightChar(0) == ',' {
+							p.moveRight(1)
+							if p.charsRight() == 0 || p.rightChar(0) == '}' {
+								max = math.MaxInt32
+							} else {
+								if max, err = p.scanDecimal(); err != nil {
+									return nil, err
+								}
+							}
+						}
+					}
+
+					if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
+						p.addConcatenate()
+						p.textto(startpos - 1)
+						goto ContinueOuterScan
+					}
+				}
+
+			default:
+				return nil, p.getErr(ErrInternalError)
+			}
+
+			if err := p.scanBlank(); err != nil {
+				return nil, err
+			}
+
+			if p.charsRight() == 0 || p.rightChar(0) != '?' {
+				lazy = false
+			} else {
+				p.moveRight(1)
+				lazy = true
+			}
+
+			if min > max {
+				return nil, p.getErr(ErrInvalidRepeatSize)
+			}
+
+			p.addConcatenate3(lazy, min, max)
+		}
+
+	ContinueOuterScan:
+	}
+
+BreakOuterScan:
+	;
+
+	if !p.emptyStack() {
+		return nil, p.getErr(ErrMissingParen)
+	}
+
+	if err := p.addGroup(); err != nil {
+		return nil, err
+	}
+
+	return p.unit, nil
+
+}
+
+/*
+ * Simple parsing for replacement patterns
+ */
+func (p *parser) scanReplacement() (*regexNode, error) {
+	var c, startpos int
+
+	p.concatenation = newRegexNode(ntConcatenate, p.options)
+
+	for {
+		c = p.charsRight()
+		if c == 0 {
+			break
+		}
+
+		startpos = p.textpos()
+
+		for c > 0 && p.rightChar(0) != '$' {
+			p.moveRight(1)
+			c--
+		}
+
+		p.addToConcatenate(startpos, p.textpos()-startpos, true)
+
+		if c > 0 {
+			if p.moveRightGetChar() == '$' {
+				n, err := p.scanDollar()
+				if err != nil {
+					return nil, err
+				}
+				p.addUnitNode(n)
+			}
+			p.addConcatenate()
+		}
+	}
+
+	return p.concatenation, nil
+}
+
+/*
+ * Scans $ patterns recognized within replacement patterns
+ */
+func (p *parser) scanDollar() (*regexNode, error) {
+	if p.charsRight() == 0 {
+		return newRegexNodeCh(ntOne, p.options, '$'), nil
+	}
+
+	ch := p.rightChar(0)
+	angled := false
+	backpos := p.textpos()
+	lastEndPos := backpos
+
+	// Note angle
+
+	if ch == '{' && p.charsRight() > 1 {
+		angled = true
+		p.moveRight(1)
+		ch = p.rightChar(0)
+	}
+
+	// Try to parse backreference: \1 or \{1} or \{cap}
+
+	if ch >= '0' && ch <= '9' {
+		if !angled && p.useOptionE() {
+			capnum := -1
+			newcapnum := int(ch - '0')
+			p.moveRight(1)
+			if p.isCaptureSlot(newcapnum) {
+				capnum = newcapnum
+				lastEndPos = p.textpos()
+			}
+
+			for p.charsRight() > 0 {
+				ch = p.rightChar(0)
+				if ch < '0' || ch > '9' {
+					break
+				}
+				digit := int(ch - '0')
+				if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
+					return nil, p.getErr(ErrCaptureGroupOutOfRange)
+				}
+
+				newcapnum = newcapnum*10 + digit
+
+				p.moveRight(1)
+				if p.isCaptureSlot(newcapnum) {
+					capnum = newcapnum
+					lastEndPos = p.textpos()
+				}
+			}
+			p.textto(lastEndPos)
+			if capnum >= 0 {
+				return newRegexNodeM(ntRef, p.options, capnum), nil
+			}
+		} else {
+			capnum, err := p.scanDecimal()
+			if err != nil {
+				return nil, err
+			}
+			if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
+				if p.isCaptureSlot(capnum) {
+					return newRegexNodeM(ntRef, p.options, capnum), nil
+				}
+			}
+		}
+	} else if angled && IsWordChar(ch) {
+		capname := p.scanCapname()
+
+		if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
+			if p.isCaptureName(capname) {
+				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
+			}
+		}
+	} else if !angled {
+		capnum := 1
+
+		switch ch {
+		case '$':
+			p.moveRight(1)
+			return newRegexNodeCh(ntOne, p.options, '$'), nil
+		case '&':
+			capnum = 0
+		case '`':
+			capnum = replaceLeftPortion
+		case '\'':
+			capnum = replaceRightPortion
+		case '+':
+			capnum = replaceLastGroup
+		case '_':
+			capnum = replaceWholeString
+		}
+
+		if capnum != 1 {
+			p.moveRight(1)
+			return newRegexNodeM(ntRef, p.options, capnum), nil
+		}
+	}
+
+	// unrecognized $: literalize
+
+	p.textto(backpos)
+	return newRegexNodeCh(ntOne, p.options, '$'), nil
+}
+
+// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
+// a RegexNode for the type of group scanned, or nil if the group
+// simply changed options (?cimsx-cimsx) or was a comment (#...).
+func (p *parser) scanGroupOpen() (*regexNode, error) {
+	var ch rune
+	var nt nodeType
+	var err error
+	close := '>'
+	start := p.textpos()
+
+	// just return a RegexNode if we have:
+	// 1. "(" followed by nothing
+	// 2. "(x" where x != ?
+	// 3. "(?)"
+	if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
+		if p.useOptionN() || p.ignoreNextParen {
+			p.ignoreNextParen = false
+			return newRegexNode(ntGroup, p.options), nil
+		}
+		return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
+	}
+
+	p.moveRight(1)
+
+	for {
+		if p.charsRight() == 0 {
+			break
+		}
+
+		switch ch = p.moveRightGetChar(); ch {
+		case ':':
+			nt = ntGroup
+
+		case '=':
+			p.options &= ^RightToLeft
+			nt = ntRequire
+
+		case '!':
+			p.options &= ^RightToLeft
+			nt = ntPrevent
+
+		case '>':
+			nt = ntGreedy
+
+		case '\'':
+			close = '\''
+			fallthrough
+
+		case '<':
+			if p.charsRight() == 0 {
+				goto BreakRecognize
+			}
+
+			switch ch = p.moveRightGetChar(); ch {
+			case '=':
+				if close == '\'' {
+					goto BreakRecognize
+				}
+
+				p.options |= RightToLeft
+				nt = ntRequire
+
+			case '!':
+				if close == '\'' {
+					goto BreakRecognize
+				}
+
+				p.options |= RightToLeft
+				nt = ntPrevent
+
+			default:
+				p.moveLeft()
+				capnum := -1
+				uncapnum := -1
+				proceed := false
+
+				// grab part before -
+
+				if ch >= '0' && ch <= '9' {
+					if capnum, err = p.scanDecimal(); err != nil {
+						return nil, err
+					}
+
+					if !p.isCaptureSlot(capnum) {
+						capnum = -1
+					}
+
+					// check if we have bogus characters after the number
+					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
+						return nil, p.getErr(ErrInvalidGroupName)
+					}
+					if capnum == 0 {
+						return nil, p.getErr(ErrCapNumNotZero)
+					}
+				} else if IsWordChar(ch) {
+					capname := p.scanCapname()
+
+					if p.isCaptureName(capname) {
+						capnum = p.captureSlotFromName(capname)
+					}
+
+					// check if we have bogus character after the name
+					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
+						return nil, p.getErr(ErrInvalidGroupName)
+					}
+				} else if ch == '-' {
+					proceed = true
+				} else {
+					// bad group name - starts with something other than a word character and isn't a number
+					return nil, p.getErr(ErrInvalidGroupName)
+				}
+
+				// grab part after - if any
+
+				if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
+					p.moveRight(1)
+
+					//no more chars left, no closing char, etc
+					if p.charsRight() == 0 {
+						return nil, p.getErr(ErrInvalidGroupName)
+					}
+
+					ch = p.rightChar(0)
+					if ch >= '0' && ch <= '9' {
+						if uncapnum, err = p.scanDecimal(); err != nil {
+							return nil, err
+						}
+
+						if !p.isCaptureSlot(uncapnum) {
+							return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
+						}
+
+						// check if we have bogus characters after the number
+						if p.charsRight() > 0 && p.rightChar(0) != close {
+							return nil, p.getErr(ErrInvalidGroupName)
+						}
+					} else if IsWordChar(ch) {
+						uncapname := p.scanCapname()
+
+						if !p.isCaptureName(uncapname) {
+							return nil, p.getErr(ErrUndefinedNameRef, uncapname)
+						}
+						uncapnum = p.captureSlotFromName(uncapname)
+
+						// check if we have bogus character after the name
+						if p.charsRight() > 0 && p.rightChar(0) != close {
+							return nil, p.getErr(ErrInvalidGroupName)
+						}
+					} else {
+						// bad group name - starts with something other than a word character and isn't a number
+						return nil, p.getErr(ErrInvalidGroupName)
+					}
+				}
+
+				// actually make the node
+
+				if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
+					return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
+				}
+				goto BreakRecognize
+			}
+
+		case '(':
+			// alternation construct (?(...) | )
+
+			parenPos := p.textpos()
+			if p.charsRight() > 0 {
+				ch = p.rightChar(0)
+
+				// check if the alternation condition is a backref
+				if ch >= '0' && ch <= '9' {
+					var capnum int
+					if capnum, err = p.scanDecimal(); err != nil {
+						return nil, err
+					}
+					if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
+						if p.isCaptureSlot(capnum) {
+							return newRegexNodeM(ntTestref, p.options, capnum), nil
+						}
+						return nil, p.getErr(ErrUndefinedReference, capnum)
+					}
+
+					return nil, p.getErr(ErrMalformedReference, capnum)
+
+				} else if IsWordChar(ch) {
+					capname := p.scanCapname()
+
+					if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
+						return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
+					}
+				}
+			}
+			// not a backref
+			nt = ntTestgroup
+			p.textto(parenPos - 1)   // jump to the start of the parentheses
+			p.ignoreNextParen = true // but make sure we don't try to capture the insides
+
+			charsRight := p.charsRight()
+			if charsRight >= 3 && p.rightChar(1) == '?' {
+				rightchar2 := p.rightChar(2)
+				// disallow comments in the condition
+				if rightchar2 == '#' {
+					return nil, p.getErr(ErrAlternationCantHaveComment)
+				}
+
+				// disallow named capture group (?<..>..) in the condition
+				if rightchar2 == '\'' {
+					return nil, p.getErr(ErrAlternationCantCapture)
+				}
+
+				if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
+					return nil, p.getErr(ErrAlternationCantCapture)
+				}
+			}
+
+		case 'P':
+			if p.useRE2() {
+				// support for P<name> syntax
+				if p.charsRight() < 3 {
+					goto BreakRecognize
+				}
+
+				ch = p.moveRightGetChar()
+				if ch != '<' {
+					goto BreakRecognize
+				}
+
+				ch = p.moveRightGetChar()
+				p.moveLeft()
+
+				if IsWordChar(ch) {
+					capnum := -1
+					capname := p.scanCapname()
+
+					if p.isCaptureName(capname) {
+						capnum = p.captureSlotFromName(capname)
+					}
+
+					// check if we have bogus character after the name
+					if p.charsRight() > 0 && p.rightChar(0) != '>' {
+						return nil, p.getErr(ErrInvalidGroupName)
+					}
+
+					// actually make the node
+
+					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
+						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
+					}
+					goto BreakRecognize
+
+				} else {
+					// bad group name - starts with something other than a word character and isn't a number
+					return nil, p.getErr(ErrInvalidGroupName)
+				}
+			}
+			// if we're not using RE2 compat mode then
+			// we just behave like normal
+			fallthrough
+
+		default:
+			p.moveLeft()
+
+			nt = ntGroup
+			// disallow options in the children of a testgroup node
+			if p.group.t != ntTestgroup {
+				p.scanOptions()
+			}
+			if p.charsRight() == 0 {
+				goto BreakRecognize
+			}
+
+			if ch = p.moveRightGetChar(); ch == ')' {
+				return nil, nil
+			}
+
+			if ch != ':' {
+				goto BreakRecognize
+			}
+
+		}
+
+		return newRegexNode(nt, p.options), nil
+	}
+
+BreakRecognize:
+
+	// break Recognize comes here
+
+	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
+}
+
+// scans backslash specials and basics
+func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
+
+	if p.charsRight() == 0 {
+		return nil, p.getErr(ErrIllegalEndEscape)
+	}
+
+	switch ch := p.rightChar(0); ch {
+	case 'b', 'B', 'A', 'G', 'Z', 'z':
+		p.moveRight(1)
+		return newRegexNode(p.typeFromCode(ch), p.options), nil
+
+	case 'w':
+		p.moveRight(1)
+		if p.useOptionE() {
+			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
+		}
+		return newRegexNodeSet(ntSet, p.options, WordClass()), nil
+
+	case 'W':
+		p.moveRight(1)
+		if p.useOptionE() {
+			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
+		}
+		return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
+
+	case 's':
+		p.moveRight(1)
+		if p.useOptionE() {
+			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
+		}
+		return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
+
+	case 'S':
+		p.moveRight(1)
+		if p.useOptionE() {
+			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
+		}
+		return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
+
+	case 'd':
+		p.moveRight(1)
+		if p.useOptionE() {
+			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
+		}
+		return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
+
+	case 'D':
+		p.moveRight(1)
+		if p.useOptionE() {
+			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
+		}
+		return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
+
+	case 'p', 'P':
+		p.moveRight(1)
+		prop, err := p.parseProperty()
+		if err != nil {
+			return nil, err
+		}
+		cc := &CharSet{}
+		cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
+		if p.useOptionI() {
+			cc.addLowercase()
+		}
+
+		return newRegexNodeSet(ntSet, p.options, cc), nil
+
+	default:
+		return p.scanBasicBackslash(scanOnly)
+	}
+}
+
+// Scans \-style backreferences and character escapes
+func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
+	if p.charsRight() == 0 {
+		return nil, p.getErr(ErrIllegalEndEscape)
+	}
+	angled := false
+	close := '\x00'
+
+	backpos := p.textpos()
+	ch := p.rightChar(0)
+
+	// allow \k<foo> instead of \<foo>, which is now deprecated
+
+	if ch == 'k' {
+		if p.charsRight() >= 2 {
+			p.moveRight(1)
+			ch = p.moveRightGetChar()
+
+			if ch == '<' || ch == '\'' {
+				angled = true
+				if ch == '\'' {
+					close = '\''
+				} else {
+					close = '>'
+				}
+			}
+		}
+
+		if !angled || p.charsRight() <= 0 {
+			return nil, p.getErr(ErrMalformedNameRef)
+		}
+
+		ch = p.rightChar(0)
+
+	} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
+		angled = true
+		if ch == '\'' {
+			close = '\''
+		} else {
+			close = '>'
+		}
+
+		p.moveRight(1)
+		ch = p.rightChar(0)
+	}
+
+	// Try to parse backreference: \<1> or \<cap>
+
+	if angled && ch >= '0' && ch <= '9' {
+		capnum, err := p.scanDecimal()
+		if err != nil {
+			return nil, err
+		}
+
+		if p.charsRight() > 0 && p.moveRightGetChar() == close {
+			if p.isCaptureSlot(capnum) {
+				return newRegexNodeM(ntRef, p.options, capnum), nil
+			}
+			return nil, p.getErr(ErrUndefinedBackRef, capnum)
+		}
+	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
+		capnum, err := p.scanDecimal()
+		if err != nil {
+			return nil, err
+		}
+
+		if scanOnly {
+			return nil, nil
+		}
+
+		if p.isCaptureSlot(capnum) {
+			return newRegexNodeM(ntRef, p.options, capnum), nil
+		}
+		if capnum <= 9 && !p.useOptionE() {
+			return nil, p.getErr(ErrUndefinedBackRef, capnum)
+		}
+
+	} else if angled && IsWordChar(ch) {
+		capname := p.scanCapname()
+
+		if p.charsRight() > 0 && p.moveRightGetChar() == close {
+			if p.isCaptureName(capname) {
+				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
+			}
+			return nil, p.getErr(ErrUndefinedNameRef, capname)
+		}
+	}
+
+	// Not backreference: must be char code
+
+	p.textto(backpos)
+	ch, err := p.scanCharEscape()
+	if err != nil {
+		return nil, err
+	}
+
+	if p.useOptionI() {
+		ch = unicode.ToLower(ch)
+	}
+
+	return newRegexNodeCh(ntOne, p.options, ch), nil
+}
+
+// Scans X for \p{X} or \P{X}
+func (p *parser) parseProperty() (string, error) {
+	if p.charsRight() < 3 {
+		return "", p.getErr(ErrIncompleteSlashP)
+	}
+	ch := p.moveRightGetChar()
+	if ch != '{' {
+		return "", p.getErr(ErrMalformedSlashP)
+	}
+
+	startpos := p.textpos()
+	for p.charsRight() > 0 {
+		ch = p.moveRightGetChar()
+		if !(IsWordChar(ch) || ch == '-') {
+			p.moveLeft()
+			break
+		}
+	}
+	capname := string(p.pattern[startpos:p.textpos()])
+
+	if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
+		return "", p.getErr(ErrIncompleteSlashP)
+	}
+
+	if !isValidUnicodeCat(capname) {
+		return "", p.getErr(ErrUnknownSlashP, capname)
+	}
+
+	return capname, nil
+}
+
+// Returns ReNode type for zero-length assertions with a \ code.
+func (p *parser) typeFromCode(ch rune) nodeType {
+	switch ch {
+	case 'b':
+		if p.useOptionE() {
+			return ntECMABoundary
+		}
+		return ntBoundary
+	case 'B':
+		if p.useOptionE() {
+			return ntNonECMABoundary
+		}
+		return ntNonboundary
+	case 'A':
+		return ntBeginning
+	case 'G':
+		return ntStart
+	case 'Z':
+		return ntEndZ
+	case 'z':
+		return ntEnd
+	default:
+		return ntNothing
+	}
+}
+
+// Scans whitespace or x-mode comments.
+func (p *parser) scanBlank() error {
+	if p.useOptionX() {
+		for {
+			for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
+				p.moveRight(1)
+			}
+
+			if p.charsRight() == 0 {
+				break
+			}
+
+			if p.rightChar(0) == '#' {
+				for p.charsRight() > 0 && p.rightChar(0) != '\n' {
+					p.moveRight(1)
+				}
+			} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
+				p.rightChar(1) == '?' && p.rightChar(0) == '(' {
+				for p.charsRight() > 0 && p.rightChar(0) != ')' {
+					p.moveRight(1)
+				}
+				if p.charsRight() == 0 {
+					return p.getErr(ErrUnterminatedComment)
+				}
+				p.moveRight(1)
+			} else {
+				break
+			}
+		}
+	} else {
+		for {
+			if p.charsRight() < 3 || p.rightChar(2) != '#' ||
+				p.rightChar(1) != '?' || p.rightChar(0) != '(' {
+				return nil
+			}
+
+			for p.charsRight() > 0 && p.rightChar(0) != ')' {
+				p.moveRight(1)
+			}
+			if p.charsRight() == 0 {
+				return p.getErr(ErrUnterminatedComment)
+			}
+			p.moveRight(1)
+		}
+	}
+	return nil
+}
+
+func (p *parser) scanCapname() string {
+	startpos := p.textpos()
+
+	for p.charsRight() > 0 {
+		if !IsWordChar(p.moveRightGetChar()) {
+			p.moveLeft()
+			break
+		}
+	}
+
+	return string(p.pattern[startpos:p.textpos()])
+}
+
+//Scans contents of [] (not including []'s), and converts to a set.
+func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
+	ch := '\x00'
+	chPrev := '\x00'
+	inRange := false
+	firstChar := true
+	closed := false
+
+	var cc *CharSet
+	if !scanOnly {
+		cc = &CharSet{}
+	}
+
+	if p.charsRight() > 0 && p.rightChar(0) == '^' {
+		p.moveRight(1)
+		if !scanOnly {
+			cc.negate = true
+		}
+	}
+
+	for ; p.charsRight() > 0; firstChar = false {
+		fTranslatedChar := false
+		ch = p.moveRightGetChar()
+		if ch == ']' {
+			if !firstChar {
+				closed = true
+				break
+			} else if p.useOptionE() {
+				if !scanOnly {
+					cc.addRanges(NoneClass().ranges)
+				}
+				closed = true
+				break
+			}
+
+		} else if ch == '\\' && p.charsRight() > 0 {
+			switch ch = p.moveRightGetChar(); ch {
+			case 'D', 'd':
+				if !scanOnly {
+					if inRange {
+						return nil, p.getErr(ErrBadClassInCharRange, ch)
+					}
+					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
+				}
+				continue
+
+			case 'S', 's':
+				if !scanOnly {
+					if inRange {
+						return nil, p.getErr(ErrBadClassInCharRange, ch)
+					}
+					cc.addSpace(p.useOptionE(), ch == 'S')
+				}
+				continue
+
+			case 'W', 'w':
+				if !scanOnly {
+					if inRange {
+						return nil, p.getErr(ErrBadClassInCharRange, ch)
+					}
+
+					cc.addWord(p.useOptionE(), ch == 'W')
+				}
+				continue
+
+			case 'p', 'P':
+				if !scanOnly {
+					if inRange {
+						return nil, p.getErr(ErrBadClassInCharRange, ch)
+					}
+					prop, err := p.parseProperty()
+					if err != nil {
+						return nil, err
+					}
+					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
+				} else {
+					p.parseProperty()
+				}
+
+				continue
+
+			case '-':
+				if !scanOnly {
+					cc.addRange(ch, ch)
+				}
+				continue
+
+			default:
+				p.moveLeft()
+				var err error
+				ch, err = p.scanCharEscape() // non-literal character
+				if err != nil {
+					return nil, err
+				}
+				fTranslatedChar = true
+				break // this break will only break out of the switch
+			}
+		} else if ch == '[' {
+			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
+			// It currently doesn't do anything other than skip the whole thing!
+			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
+				savePos := p.textpos()
+
+				p.moveRight(1)
+				negate := false
+				if p.charsRight() > 1 && p.rightChar(0) == '^' {
+					negate = true
+					p.moveRight(1)
+				}
+
+				nm := p.scanCapname() // snag the name
+				if !scanOnly && p.useRE2() {
+					// look up the name since these are valid for RE2
+					// add the group based on the name
+					if ok := cc.addNamedASCII(nm, negate); !ok {
+						return nil, p.getErr(ErrInvalidCharRange)
+					}
+				}
+				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
+					p.textto(savePos)
+				} else if p.useRE2() {
+					// move on
+					continue
+				}
+			}
+		}
+
+		if inRange {
+			inRange = false
+			if !scanOnly {
+				if ch == '[' && !fTranslatedChar && !firstChar {
+					// We thought we were in a range, but we're actually starting a subtraction.
+					// In that case, we'll add chPrev to our char class, skip the opening [, and
+					// scan the new character class recursively.
+					cc.addChar(chPrev)
+					sub, err := p.scanCharSet(caseInsensitive, false)
+					if err != nil {
+						return nil, err
+					}
+					cc.addSubtraction(sub)
+
+					if p.charsRight() > 0 && p.rightChar(0) != ']' {
+						return nil, p.getErr(ErrSubtractionMustBeLast)
+					}
+				} else {
+					// a regular range, like a-z
+					if chPrev > ch {
+						return nil, p.getErr(ErrReversedCharRange)
+					}
+					cc.addRange(chPrev, ch)
+				}
+			}
+		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
+			// this could be the start of a range
+			chPrev = ch
+			inRange = true
+			p.moveRight(1)
+		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
+			// we aren't in a range, and now there is a subtraction.  Usually this happens
+			// only when a subtraction follows a range, like [a-z-[b]]
+			if !scanOnly {
+				p.moveRight(1)
+				sub, err := p.scanCharSet(caseInsensitive, false)
+				if err != nil {
+					return nil, err
+				}
+				cc.addSubtraction(sub)
+
+				if p.charsRight() > 0 && p.rightChar(0) != ']' {
+					return nil, p.getErr(ErrSubtractionMustBeLast)
+				}
+			} else {
+				p.moveRight(1)
+				p.scanCharSet(caseInsensitive, true)
+			}
+		} else {
+			if !scanOnly {
+				cc.addRange(ch, ch)
+			}
+		}
+	}
+
+	if !closed {
+		return nil, p.getErr(ErrUnterminatedBracket)
+	}
+
+	if !scanOnly && caseInsensitive {
+		cc.addLowercase()
+	}
+
+	return cc, nil
+}
+
+// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
+func (p *parser) scanDecimal() (int, error) {
+	i := 0
+	var d int
+
+	for p.charsRight() > 0 {
+		d = int(p.rightChar(0) - '0')
+		if d < 0 || d > 9 {
+			break
+		}
+		p.moveRight(1)
+
+		if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
+			return 0, p.getErr(ErrCaptureGroupOutOfRange)
+		}
+
+		i *= 10
+		i += d
+	}
+
+	return int(i), nil
+}
+
+// Returns true for options allowed only at the top level
+func isOnlyTopOption(option RegexOptions) bool {
+	return option == RightToLeft || option == ECMAScript || option == RE2
+}
+
+// Scans cimsx-cimsx option string, stops at the first unrecognized char.
+func (p *parser) scanOptions() {
+
+	for off := false; p.charsRight() > 0; p.moveRight(1) {
+		ch := p.rightChar(0)
+
+		if ch == '-' {
+			off = true
+		} else if ch == '+' {
+			off = false
+		} else {
+			option := optionFromCode(ch)
+			if option == 0 || isOnlyTopOption(option) {
+				return
+			}
+
+			if off {
+				p.options &= ^option
+			} else {
+				p.options |= option
+			}
+		}
+	}
+}
+
+// Scans \ code for escape codes that map to single unicode chars.
+func (p *parser) scanCharEscape() (r rune, err error) {
+
+	ch := p.moveRightGetChar()
+
+	if ch >= '0' && ch <= '7' {
+		p.moveLeft()
+		return p.scanOctal(), nil
+	}
+
+	pos := p.textpos()
+
+	switch ch {
+	case 'x':
+		// support for \x{HEX} syntax from Perl and PCRE
+		if p.charsRight() > 0 && p.rightChar(0) == '{' {
+			if p.useOptionE() {
+				return ch, nil
+			}
+			p.moveRight(1)
+			return p.scanHexUntilBrace()
+		} else {
+			r, err = p.scanHex(2)
+		}
+	case 'u':
+		r, err = p.scanHex(4)
+	case 'a':
+		return '\u0007', nil
+	case 'b':
+		return '\b', nil
+	case 'e':
+		return '\u001B', nil
+	case 'f':
+		return '\f', nil
+	case 'n':
+		return '\n', nil
+	case 'r':
+		return '\r', nil
+	case 't':
+		return '\t', nil
+	case 'v':
+		return '\u000B', nil
+	case 'c':
+		r, err = p.scanControl()
+	default:
+		if !p.useOptionE() && IsWordChar(ch) {
+			return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
+		}
+		return ch, nil
+	}
+	if err != nil && p.useOptionE() {
+		p.textto(pos)
+		return ch, nil
+	}
+	return
+}
+
+// Grabs and converts an ascii control character
+func (p *parser) scanControl() (rune, error) {
+	if p.charsRight() <= 0 {
+		return 0, p.getErr(ErrMissingControl)
+	}
+
+	ch := p.moveRightGetChar()
+
+	// \ca interpreted as \cA
+
+	if ch >= 'a' && ch <= 'z' {
+		ch = (ch - ('a' - 'A'))
+	}
+	ch = (ch - '@')
+	if ch >= 0 && ch < ' ' {
+		return ch, nil
+	}
+
+	return 0, p.getErr(ErrUnrecognizedControl)
+
+}
+
+// Scan hex digits until we hit a closing brace.
+// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
+func (p *parser) scanHexUntilBrace() (rune, error) {
+	// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
+	// so we can enforce that
+	i := 0
+	hasContent := false
+
+	for p.charsRight() > 0 {
+		ch := p.moveRightGetChar()
+		if ch == '}' {
+			// hit our close brace, we're done here
+			// prevent \x{}
+			if !hasContent {
+				return 0, p.getErr(ErrTooFewHex)
+			}
+			return rune(i), nil
+		}
+		hasContent = true
+		// no brace needs to be hex digit
+		d := hexDigit(ch)
+		if d < 0 {
+			return 0, p.getErr(ErrMissingBrace)
+		}
+
+		i *= 0x10
+		i += d
+
+		if i > unicode.MaxRune {
+			return 0, p.getErr(ErrInvalidHex)
+		}
+	}
+
+	// we only make it here if we run out of digits without finding the brace
+	return 0, p.getErr(ErrMissingBrace)
+}
+
+// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
+func (p *parser) scanHex(c int) (rune, error) {
+
+	i := 0
+
+	if p.charsRight() >= c {
+		for c > 0 {
+			d := hexDigit(p.moveRightGetChar())
+			if d < 0 {
+				break
+			}
+			i *= 0x10
+			i += d
+			c--
+		}
+	}
+
+	if c > 0 {
+		return 0, p.getErr(ErrTooFewHex)
+	}
+
+	return rune(i), nil
+}
+
+// Returns n <= 0xF for a hex digit.
+func hexDigit(ch rune) int {
+
+	if d := uint(ch - '0'); d <= 9 {
+		return int(d)
+	}
+
+	if d := uint(ch - 'a'); d <= 5 {
+		return int(d + 0xa)
+	}
+
+	if d := uint(ch - 'A'); d <= 5 {
+		return int(d + 0xa)
+	}
+
+	return -1
+}
+
+// Scans up to three octal digits (stops before exceeding 0377).
+func (p *parser) scanOctal() rune {
+	// Consume octal chars only up to 3 digits and value 0377
+
+	c := 3
+
+	if c > p.charsRight() {
+		c = p.charsRight()
+	}
+
+	//we know the first char is good because the caller had to check
+	i := 0
+	d := int(p.rightChar(0) - '0')
+	for c > 0 && d <= 7 && d >= 0 {
+		if i >= 0x20 && p.useOptionE() {
+			break
+		}
+		i *= 8
+		i += d
+		c--
+
+		p.moveRight(1)
+		if !p.rightMost() {
+			d = int(p.rightChar(0) - '0')
+		}
+	}
+
+	// Octal codes only go up to 255.  Any larger and the behavior that Perl follows
+	// is simply to truncate the high bits.
+	i &= 0xFF
+
+	return rune(i)
+}
+
+// Returns the current parsing position.
+func (p *parser) textpos() int {
+	return p.currentPos
+}
+
+// Zaps to a specific parsing position.
+func (p *parser) textto(pos int) {
+	p.currentPos = pos
+}
+
+// Returns the char at the right of the current parsing position and advances to the right.
+func (p *parser) moveRightGetChar() rune {
+	ch := p.pattern[p.currentPos]
+	p.currentPos++
+	return ch
+}
+
+// Moves the current position to the right.
+func (p *parser) moveRight(i int) {
+	// default would be 1
+	p.currentPos += i
+}
+
+// Moves the current parsing position one to the left.
+func (p *parser) moveLeft() {
+	p.currentPos--
+}
+
+// Returns the char left of the current parsing position.
+func (p *parser) charAt(i int) rune {
+	return p.pattern[i]
+}
+
+// Returns the char i chars right of the current parsing position.
+func (p *parser) rightChar(i int) rune {
+	// default would be 0
+	return p.pattern[p.currentPos+i]
+}
+
+// Number of characters to the right of the current parsing position.
+func (p *parser) charsRight() int {
+	return len(p.pattern) - p.currentPos
+}
+
+func (p *parser) rightMost() bool {
+	return p.currentPos == len(p.pattern)
+}
+
+// Looks up the slot number for a given name
+func (p *parser) captureSlotFromName(capname string) int {
+	return p.capnames[capname]
+}
+
+// True if the capture slot was noted
+func (p *parser) isCaptureSlot(i int) bool {
+	if p.caps != nil {
+		_, ok := p.caps[i]
+		return ok
+	}
+
+	return (i >= 0 && i < p.capsize)
+}
+
+// Looks up the slot number for a given name
+func (p *parser) isCaptureName(capname string) bool {
+	if p.capnames == nil {
+		return false
+	}
+
+	_, ok := p.capnames[capname]
+	return ok
+}
+
+// option shortcuts
+
+// True if N option disabling '(' autocapture is on.
+func (p *parser) useOptionN() bool {
+	return (p.options & ExplicitCapture) != 0
+}
+
+// True if I option enabling case-insensitivity is on.
+func (p *parser) useOptionI() bool {
+	return (p.options & IgnoreCase) != 0
+}
+
+// True if M option altering meaning of $ and ^ is on.
+func (p *parser) useOptionM() bool {
+	return (p.options & Multiline) != 0
+}
+
+// True if S option altering meaning of . is on.
+func (p *parser) useOptionS() bool {
+	return (p.options & Singleline) != 0
+}
+
+// True if X option enabling whitespace/comment mode is on.
+func (p *parser) useOptionX() bool {
+	return (p.options & IgnorePatternWhitespace) != 0
+}
+
+// True if E option enabling ECMAScript behavior on.
+func (p *parser) useOptionE() bool {
+	return (p.options & ECMAScript) != 0
+}
+
+// true to use RE2 compatibility parsing behavior.
+func (p *parser) useRE2() bool {
+	return (p.options & RE2) != 0
+}
+
+// True if options stack is empty.
+func (p *parser) emptyOptionsStack() bool {
+	return len(p.optionsStack) == 0
+}
+
+// Finish the current quantifiable (when a quantifier is not found or is not possible)
+func (p *parser) addConcatenate() {
+	// The first (| inside a Testgroup group goes directly to the group
+	p.concatenation.addChild(p.unit)
+	p.unit = nil
+}
+
+// Finish the current quantifiable (when a quantifier is found)
+func (p *parser) addConcatenate3(lazy bool, min, max int) {
+	p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
+	p.unit = nil
+}
+
+// Sets the current unit to a single char node
+func (p *parser) addUnitOne(ch rune) {
+	if p.useOptionI() {
+		ch = unicode.ToLower(ch)
+	}
+
+	p.unit = newRegexNodeCh(ntOne, p.options, ch)
+}
+
+// Sets the current unit to a single inverse-char node
+func (p *parser) addUnitNotone(ch rune) {
+	if p.useOptionI() {
+		ch = unicode.ToLower(ch)
+	}
+
+	p.unit = newRegexNodeCh(ntNotone, p.options, ch)
+}
+
+// Sets the current unit to a single set node
+func (p *parser) addUnitSet(set *CharSet) {
+	p.unit = newRegexNodeSet(ntSet, p.options, set)
+}
+
+// Sets the current unit to a subtree
+func (p *parser) addUnitNode(node *regexNode) {
+	p.unit = node
+}
+
+// Sets the current unit to an assertion of the specified type
+func (p *parser) addUnitType(t nodeType) {
+	p.unit = newRegexNode(t, p.options)
+}
+
+// Finish the current group (in response to a ')' or end)
+func (p *parser) addGroup() error {
+	if p.group.t == ntTestgroup || p.group.t == ntTestref {
+		p.group.addChild(p.concatenation.reverseLeft())
+		if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
+			return p.getErr(ErrTooManyAlternates)
+		}
+	} else {
+		p.alternation.addChild(p.concatenation.reverseLeft())
+		p.group.addChild(p.alternation)
+	}
+
+	p.unit = p.group
+	return nil
+}
+
+// Pops the option stack, but keeps the current options unchanged.
+func (p *parser) popKeepOptions() {
+	lastIdx := len(p.optionsStack) - 1
+	p.optionsStack = p.optionsStack[:lastIdx]
+}
+
+// Recalls options from the stack.
+func (p *parser) popOptions() {
+	lastIdx := len(p.optionsStack) - 1
+	// get the last item on the stack and then remove it by reslicing
+	p.options = p.optionsStack[lastIdx]
+	p.optionsStack = p.optionsStack[:lastIdx]
+}
+
+// Saves options on a stack.
+func (p *parser) pushOptions() {
+	p.optionsStack = append(p.optionsStack, p.options)
+}
+
+// Add a string to the last concatenate.
+func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
+	var node *regexNode
+
+	if cch == 0 {
+		return
+	}
+
+	if cch > 1 {
+		str := p.pattern[pos : pos+cch]
+
+		if p.useOptionI() && !isReplacement {
+			// We do the ToLower character by character for consistency.  With surrogate chars, doing
+			// a ToLower on the entire string could actually change the surrogate pair.  This is more correct
+			// linguistically, but since Regex doesn't support surrogates, it's more important to be
+			// consistent.
+			for i := 0; i < len(str); i++ {
+				str[i] = unicode.ToLower(str[i])
+			}
+		}
+
+		node = newRegexNodeStr(ntMulti, p.options, str)
+	} else {
+		ch := p.charAt(pos)
+
+		if p.useOptionI() && !isReplacement {
+			ch = unicode.ToLower(ch)
+		}
+
+		node = newRegexNodeCh(ntOne, p.options, ch)
+	}
+
+	p.concatenation.addChild(node)
+}
+
+// Push the parser state (in response to an open paren)
+func (p *parser) pushGroup() {
+	p.group.next = p.stack
+	p.alternation.next = p.group
+	p.concatenation.next = p.alternation
+	p.stack = p.concatenation
+}
+
+// Remember the pushed state (in response to a ')')
+func (p *parser) popGroup() error {
+	p.concatenation = p.stack
+	p.alternation = p.concatenation.next
+	p.group = p.alternation.next
+	p.stack = p.group.next
+
+	// The first () inside a Testgroup group goes directly to the group
+	if p.group.t == ntTestgroup && len(p.group.children) == 0 {
+		if p.unit == nil {
+			return p.getErr(ErrConditionalExpression)
+		}
+
+		p.group.addChild(p.unit)
+		p.unit = nil
+	}
+	return nil
+}
+
+// True if the group stack is empty.
+func (p *parser) emptyStack() bool {
+	return p.stack == nil
+}
+
+// Start a new round for the parser state (in response to an open paren or string start)
+func (p *parser) startGroup(openGroup *regexNode) {
+	p.group = openGroup
+	p.alternation = newRegexNode(ntAlternate, p.options)
+	p.concatenation = newRegexNode(ntConcatenate, p.options)
+}
+
+// Finish the current concatenation (in response to a |)
+func (p *parser) addAlternate() {
+	// The | parts inside a Testgroup group go directly to the group
+
+	if p.group.t == ntTestgroup || p.group.t == ntTestref {
+		p.group.addChild(p.concatenation.reverseLeft())
+	} else {
+		p.alternation.addChild(p.concatenation.reverseLeft())
+	}
+
+	p.concatenation = newRegexNode(ntConcatenate, p.options)
+}
+
+// For categorizing ascii characters.
+
+const (
+	Q byte = 5 // quantifier
+	S      = 4 // ordinary stopper
+	Z      = 3 // ScanBlank stopper
+	X      = 2 // whitespace
+	E      = 1 // should be escaped
+)
+
+var _category = []byte{
+	//01  2  3  4  5  6  7  8  9  A  B  C  D  E  F  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
+	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	// !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
+	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
+	//@A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
+	//'a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
+}
+
+func isSpace(ch rune) bool {
+	return (ch <= ' ' && _category[ch] == X)
+}
+
+// Returns true for those characters that terminate a string of ordinary chars.
+func isSpecial(ch rune) bool {
+	return (ch <= '|' && _category[ch] >= S)
+}
+
+// Returns true for those characters that terminate a string of ordinary chars.
+func isStopperX(ch rune) bool {
+	return (ch <= '|' && _category[ch] >= X)
+}
+
+// Returns true for those characters that begin a quantifier.
+func isQuantifier(ch rune) bool {
+	return (ch <= '{' && _category[ch] >= Q)
+}
+
+func (p *parser) isTrueQuantifier() bool {
+	nChars := p.charsRight()
+	if nChars == 0 {
+		return false
+	}
+
+	startpos := p.textpos()
+	ch := p.charAt(startpos)
+	if ch != '{' {
+		return ch <= '{' && _category[ch] >= Q
+	}
+
+	//UGLY: this is ugly -- the original code was ugly too
+	pos := startpos
+	for {
+		nChars--
+		if nChars <= 0 {
+			break
+		}
+		pos++
+		ch = p.charAt(pos)
+		if ch < '0' || ch > '9' {
+			break
+		}
+	}
+
+	if nChars == 0 || pos-startpos == 1 {
+		return false
+	}
+	if ch == '}' {
+		return true
+	}
+	if ch != ',' {
+		return false
+	}
+	for {
+		nChars--
+		if nChars <= 0 {
+			break
+		}
+		pos++
+		ch = p.charAt(pos)
+		if ch < '0' || ch > '9' {
+			break
+		}
+	}
+
+	return nChars > 0 && ch == '}'
+}
author	yakumo.izuru
date	Sun, 23 Jul 2023 13:18:53 +0000
parents
children