Mercurial > yakumo_izuru > aya
diff vendor/github.com/dlclark/regexp2/syntax/writer.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author | yakumo.izuru |
---|---|
date | Sun, 23 Jul 2023 13:18:53 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vendor/github.com/dlclark/regexp2/syntax/writer.go Sun Jul 23 13:18:53 2023 +0000 @@ -0,0 +1,500 @@ +package syntax + +import ( + "bytes" + "fmt" + "math" + "os" +) + +func Write(tree *RegexTree) (*Code, error) { + w := writer{ + intStack: make([]int, 0, 32), + emitted: make([]int, 2), + stringhash: make(map[string]int), + sethash: make(map[string]int), + } + + code, err := w.codeFromTree(tree) + + if tree.options&Debug > 0 && code != nil { + os.Stdout.WriteString(code.Dump()) + os.Stdout.WriteString("\n") + } + + return code, err +} + +type writer struct { + emitted []int + + intStack []int + curpos int + stringhash map[string]int + stringtable [][]rune + sethash map[string]int + settable []*CharSet + counting bool + count int + trackcount int + caps map[int]int +} + +const ( + beforeChild nodeType = 64 + afterChild = 128 + //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix + MaxPrefixSize = 50 +) + +// The top level RegexCode generator. It does a depth-first walk +// through the tree and calls EmitFragment to emits code before +// and after each child of an interior node, and at each leaf. +// +// It runs two passes, first to count the size of the generated +// code, and second to generate the code. +// +// We should time it against the alternative, which is +// to just generate the code and grow the array as we go. +func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) { + var ( + curNode *regexNode + curChild int + capsize int + ) + // construct sparse capnum mapping if some numbers are unused + + if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) { + capsize = tree.captop + w.caps = nil + } else { + capsize = len(tree.capnumlist) + w.caps = tree.caps + for i := 0; i < len(tree.capnumlist); i++ { + w.caps[tree.capnumlist[i]] = i + } + } + + w.counting = true + + for { + if !w.counting { + w.emitted = make([]int, w.count) + } + + curNode = tree.root + curChild = 0 + + w.emit1(Lazybranch, 0) + + for { + if len(curNode.children) == 0 { + w.emitFragment(curNode.t, curNode, 0) + } else if curChild < len(curNode.children) { + w.emitFragment(curNode.t|beforeChild, curNode, curChild) + + curNode = curNode.children[curChild] + + w.pushInt(curChild) + curChild = 0 + continue + } + + if w.emptyStack() { + break + } + + curChild = w.popInt() + curNode = curNode.next + + w.emitFragment(curNode.t|afterChild, curNode, curChild) + curChild++ + } + + w.patchJump(0, w.curPos()) + w.emit(Stop) + + if !w.counting { + break + } + + w.counting = false + } + + fcPrefix := getFirstCharsPrefix(tree) + prefix := getPrefix(tree) + rtl := (tree.options & RightToLeft) != 0 + + var bmPrefix *BmPrefix + //TODO: benchmark string prefixes + if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 { + if len(prefix.PrefixStr) > MaxPrefixSize { + // limit prefix changes to 10k + prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize] + } + bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl) + } else { + bmPrefix = nil + } + + return &Code{ + Codes: w.emitted, + Strings: w.stringtable, + Sets: w.settable, + TrackCount: w.trackcount, + Caps: w.caps, + Capsize: capsize, + FcPrefix: fcPrefix, + BmPrefix: bmPrefix, + Anchors: getAnchors(tree), + RightToLeft: rtl, + }, nil +} + +// The main RegexCode generator. It does a depth-first walk +// through the tree and calls EmitFragment to emits code before +// and after each child of an interior node, and at each leaf. +func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error { + bits := InstOp(0) + + if nodetype <= ntRef { + if (node.options & RightToLeft) != 0 { + bits |= Rtl + } + if (node.options & IgnoreCase) != 0 { + bits |= Ci + } + } + ntBits := nodeType(bits) + + switch nodetype { + case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty: + break + + case ntAlternate | beforeChild: + if curIndex < len(node.children)-1 { + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + } + + case ntAlternate | afterChild: + if curIndex < len(node.children)-1 { + lbPos := w.popInt() + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + w.patchJump(lbPos, w.curPos()) + } else { + for i := 0; i < curIndex; i++ { + w.patchJump(w.popInt(), w.curPos()) + } + } + break + + case ntTestref | beforeChild: + if curIndex == 0 { + w.emit(Setjump) + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + w.emit1(Testref, w.mapCapnum(node.m)) + w.emit(Forejump) + } + + case ntTestref | afterChild: + if curIndex == 0 { + branchpos := w.popInt() + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + w.patchJump(branchpos, w.curPos()) + w.emit(Forejump) + if len(node.children) <= 1 { + w.patchJump(w.popInt(), w.curPos()) + } + } else if curIndex == 1 { + w.patchJump(w.popInt(), w.curPos()) + } + + case ntTestgroup | beforeChild: + if curIndex == 0 { + w.emit(Setjump) + w.emit(Setmark) + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + } + + case ntTestgroup | afterChild: + if curIndex == 0 { + w.emit(Getmark) + w.emit(Forejump) + } else if curIndex == 1 { + Branchpos := w.popInt() + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + w.patchJump(Branchpos, w.curPos()) + w.emit(Getmark) + w.emit(Forejump) + if len(node.children) <= 2 { + w.patchJump(w.popInt(), w.curPos()) + } + } else if curIndex == 2 { + w.patchJump(w.popInt(), w.curPos()) + } + + case ntLoop | beforeChild, ntLazyloop | beforeChild: + + if node.n < math.MaxInt32 || node.m > 1 { + if node.m == 0 { + w.emit1(Nullcount, 0) + } else { + w.emit1(Setcount, 1-node.m) + } + } else if node.m == 0 { + w.emit(Nullmark) + } else { + w.emit(Setmark) + } + + if node.m == 0 { + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + } + w.pushInt(w.curPos()) + + case ntLoop | afterChild, ntLazyloop | afterChild: + + startJumpPos := w.curPos() + lazy := (nodetype - (ntLoop | afterChild)) + + if node.n < math.MaxInt32 || node.m > 1 { + if node.n == math.MaxInt32 { + w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32) + } else { + w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m) + } + } else { + w.emit1(InstOp(Branchmark+lazy), w.popInt()) + } + + if node.m == 0 { + w.patchJump(w.popInt(), startJumpPos) + } + + case ntGroup | beforeChild, ntGroup | afterChild: + + case ntCapture | beforeChild: + w.emit(Setmark) + + case ntCapture | afterChild: + w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n)) + + case ntRequire | beforeChild: + // NOTE: the following line causes lookahead/lookbehind to be + // NON-BACKTRACKING. It can be commented out with (*) + w.emit(Setjump) + + w.emit(Setmark) + + case ntRequire | afterChild: + w.emit(Getmark) + + // NOTE: the following line causes lookahead/lookbehind to be + // NON-BACKTRACKING. It can be commented out with (*) + w.emit(Forejump) + + case ntPrevent | beforeChild: + w.emit(Setjump) + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + + case ntPrevent | afterChild: + w.emit(Backjump) + w.patchJump(w.popInt(), w.curPos()) + w.emit(Forejump) + + case ntGreedy | beforeChild: + w.emit(Setjump) + + case ntGreedy | afterChild: + w.emit(Forejump) + + case ntOne, ntNotone: + w.emit1(InstOp(node.t|ntBits), int(node.ch)) + + case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy: + if node.m > 0 { + if node.t == ntOneloop || node.t == ntOnelazy { + w.emit2(Onerep|bits, int(node.ch), node.m) + } else { + w.emit2(Notonerep|bits, int(node.ch), node.m) + } + } + if node.n > node.m { + if node.n == math.MaxInt32 { + w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32) + } else { + w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m) + } + } + + case ntSetloop, ntSetlazy: + if node.m > 0 { + w.emit2(Setrep|bits, w.setCode(node.set), node.m) + } + if node.n > node.m { + if node.n == math.MaxInt32 { + w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32) + } else { + w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m) + } + } + + case ntMulti: + w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str)) + + case ntSet: + w.emit1(InstOp(node.t|ntBits), w.setCode(node.set)) + + case ntRef: + w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m)) + + case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: + w.emit(InstOp(node.t)) + + default: + return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype) + } + + return nil +} + +// To avoid recursion, we use a simple integer stack. +// This is the push. +func (w *writer) pushInt(i int) { + w.intStack = append(w.intStack, i) +} + +// Returns true if the stack is empty. +func (w *writer) emptyStack() bool { + return len(w.intStack) == 0 +} + +// This is the pop. +func (w *writer) popInt() int { + //get our item + idx := len(w.intStack) - 1 + i := w.intStack[idx] + //trim our slice + w.intStack = w.intStack[:idx] + return i +} + +// Returns the current position in the emitted code. +func (w *writer) curPos() int { + return w.curpos +} + +// Fixes up a jump instruction at the specified offset +// so that it jumps to the specified jumpDest. +func (w *writer) patchJump(offset, jumpDest int) { + w.emitted[offset+1] = jumpDest +} + +// Returns an index in the set table for a charset +// uses a map to eliminate duplicates. +func (w *writer) setCode(set *CharSet) int { + if w.counting { + return 0 + } + + buf := &bytes.Buffer{} + + set.mapHashFill(buf) + hash := buf.String() + i, ok := w.sethash[hash] + if !ok { + i = len(w.sethash) + w.sethash[hash] = i + w.settable = append(w.settable, set) + } + return i +} + +// Returns an index in the string table for a string. +// uses a map to eliminate duplicates. +func (w *writer) stringCode(str []rune) int { + if w.counting { + return 0 + } + + hash := string(str) + i, ok := w.stringhash[hash] + if !ok { + i = len(w.stringhash) + w.stringhash[hash] = i + w.stringtable = append(w.stringtable, str) + } + + return i +} + +// When generating code on a regex that uses a sparse set +// of capture slots, we hash them to a dense set of indices +// for an array of capture slots. Instead of doing the hash +// at match time, it's done at compile time, here. +func (w *writer) mapCapnum(capnum int) int { + if capnum == -1 { + return -1 + } + + if w.caps != nil { + return w.caps[capnum] + } + + return capnum +} + +// Emits a zero-argument operation. Note that the emit +// functions all run in two modes: they can emit code, or +// they can just count the size of the code. +func (w *writer) emit(op InstOp) { + if w.counting { + w.count++ + if opcodeBacktracks(op) { + w.trackcount++ + } + return + } + w.emitted[w.curpos] = int(op) + w.curpos++ +} + +// Emits a one-argument operation. +func (w *writer) emit1(op InstOp, opd1 int) { + if w.counting { + w.count += 2 + if opcodeBacktracks(op) { + w.trackcount++ + } + return + } + w.emitted[w.curpos] = int(op) + w.curpos++ + w.emitted[w.curpos] = opd1 + w.curpos++ +} + +// Emits a two-argument operation. +func (w *writer) emit2(op InstOp, opd1, opd2 int) { + if w.counting { + w.count += 3 + if opcodeBacktracks(op) { + w.trackcount++ + } + return + } + w.emitted[w.curpos] = int(op) + w.curpos++ + w.emitted[w.curpos] = opd1 + w.curpos++ + w.emitted[w.curpos] = opd2 + w.curpos++ +}