diff vendor/github.com/dlclark/regexp2/syntax/writer.go @ 66:787b5ee0289d draft

Use vendored modules Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author yakumo.izuru
date Sun, 23 Jul 2023 13:18:53 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vendor/github.com/dlclark/regexp2/syntax/writer.go	Sun Jul 23 13:18:53 2023 +0000
@@ -0,0 +1,500 @@
+package syntax
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"os"
+)
+
+func Write(tree *RegexTree) (*Code, error) {
+	w := writer{
+		intStack:   make([]int, 0, 32),
+		emitted:    make([]int, 2),
+		stringhash: make(map[string]int),
+		sethash:    make(map[string]int),
+	}
+
+	code, err := w.codeFromTree(tree)
+
+	if tree.options&Debug > 0 && code != nil {
+		os.Stdout.WriteString(code.Dump())
+		os.Stdout.WriteString("\n")
+	}
+
+	return code, err
+}
+
+type writer struct {
+	emitted []int
+
+	intStack    []int
+	curpos      int
+	stringhash  map[string]int
+	stringtable [][]rune
+	sethash     map[string]int
+	settable    []*CharSet
+	counting    bool
+	count       int
+	trackcount  int
+	caps        map[int]int
+}
+
+const (
+	beforeChild nodeType = 64
+	afterChild           = 128
+	//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
+	MaxPrefixSize = 50
+)
+
+// The top level RegexCode generator. It does a depth-first walk
+// through the tree and calls EmitFragment to emits code before
+// and after each child of an interior node, and at each leaf.
+//
+// It runs two passes, first to count the size of the generated
+// code, and second to generate the code.
+//
+// We should time it against the alternative, which is
+// to just generate the code and grow the array as we go.
+func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
+	var (
+		curNode  *regexNode
+		curChild int
+		capsize  int
+	)
+	// construct sparse capnum mapping if some numbers are unused
+
+	if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
+		capsize = tree.captop
+		w.caps = nil
+	} else {
+		capsize = len(tree.capnumlist)
+		w.caps = tree.caps
+		for i := 0; i < len(tree.capnumlist); i++ {
+			w.caps[tree.capnumlist[i]] = i
+		}
+	}
+
+	w.counting = true
+
+	for {
+		if !w.counting {
+			w.emitted = make([]int, w.count)
+		}
+
+		curNode = tree.root
+		curChild = 0
+
+		w.emit1(Lazybranch, 0)
+
+		for {
+			if len(curNode.children) == 0 {
+				w.emitFragment(curNode.t, curNode, 0)
+			} else if curChild < len(curNode.children) {
+				w.emitFragment(curNode.t|beforeChild, curNode, curChild)
+
+				curNode = curNode.children[curChild]
+
+				w.pushInt(curChild)
+				curChild = 0
+				continue
+			}
+
+			if w.emptyStack() {
+				break
+			}
+
+			curChild = w.popInt()
+			curNode = curNode.next
+
+			w.emitFragment(curNode.t|afterChild, curNode, curChild)
+			curChild++
+		}
+
+		w.patchJump(0, w.curPos())
+		w.emit(Stop)
+
+		if !w.counting {
+			break
+		}
+
+		w.counting = false
+	}
+
+	fcPrefix := getFirstCharsPrefix(tree)
+	prefix := getPrefix(tree)
+	rtl := (tree.options & RightToLeft) != 0
+
+	var bmPrefix *BmPrefix
+	//TODO: benchmark string prefixes
+	if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
+		if len(prefix.PrefixStr) > MaxPrefixSize {
+			// limit prefix changes to 10k
+			prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
+		}
+		bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
+	} else {
+		bmPrefix = nil
+	}
+
+	return &Code{
+		Codes:       w.emitted,
+		Strings:     w.stringtable,
+		Sets:        w.settable,
+		TrackCount:  w.trackcount,
+		Caps:        w.caps,
+		Capsize:     capsize,
+		FcPrefix:    fcPrefix,
+		BmPrefix:    bmPrefix,
+		Anchors:     getAnchors(tree),
+		RightToLeft: rtl,
+	}, nil
+}
+
+// The main RegexCode generator. It does a depth-first walk
+// through the tree and calls EmitFragment to emits code before
+// and after each child of an interior node, and at each leaf.
+func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
+	bits := InstOp(0)
+
+	if nodetype <= ntRef {
+		if (node.options & RightToLeft) != 0 {
+			bits |= Rtl
+		}
+		if (node.options & IgnoreCase) != 0 {
+			bits |= Ci
+		}
+	}
+	ntBits := nodeType(bits)
+
+	switch nodetype {
+	case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
+		break
+
+	case ntAlternate | beforeChild:
+		if curIndex < len(node.children)-1 {
+			w.pushInt(w.curPos())
+			w.emit1(Lazybranch, 0)
+		}
+
+	case ntAlternate | afterChild:
+		if curIndex < len(node.children)-1 {
+			lbPos := w.popInt()
+			w.pushInt(w.curPos())
+			w.emit1(Goto, 0)
+			w.patchJump(lbPos, w.curPos())
+		} else {
+			for i := 0; i < curIndex; i++ {
+				w.patchJump(w.popInt(), w.curPos())
+			}
+		}
+		break
+
+	case ntTestref | beforeChild:
+		if curIndex == 0 {
+			w.emit(Setjump)
+			w.pushInt(w.curPos())
+			w.emit1(Lazybranch, 0)
+			w.emit1(Testref, w.mapCapnum(node.m))
+			w.emit(Forejump)
+		}
+
+	case ntTestref | afterChild:
+		if curIndex == 0 {
+			branchpos := w.popInt()
+			w.pushInt(w.curPos())
+			w.emit1(Goto, 0)
+			w.patchJump(branchpos, w.curPos())
+			w.emit(Forejump)
+			if len(node.children) <= 1 {
+				w.patchJump(w.popInt(), w.curPos())
+			}
+		} else if curIndex == 1 {
+			w.patchJump(w.popInt(), w.curPos())
+		}
+
+	case ntTestgroup | beforeChild:
+		if curIndex == 0 {
+			w.emit(Setjump)
+			w.emit(Setmark)
+			w.pushInt(w.curPos())
+			w.emit1(Lazybranch, 0)
+		}
+
+	case ntTestgroup | afterChild:
+		if curIndex == 0 {
+			w.emit(Getmark)
+			w.emit(Forejump)
+		} else if curIndex == 1 {
+			Branchpos := w.popInt()
+			w.pushInt(w.curPos())
+			w.emit1(Goto, 0)
+			w.patchJump(Branchpos, w.curPos())
+			w.emit(Getmark)
+			w.emit(Forejump)
+			if len(node.children) <= 2 {
+				w.patchJump(w.popInt(), w.curPos())
+			}
+		} else if curIndex == 2 {
+			w.patchJump(w.popInt(), w.curPos())
+		}
+
+	case ntLoop | beforeChild, ntLazyloop | beforeChild:
+
+		if node.n < math.MaxInt32 || node.m > 1 {
+			if node.m == 0 {
+				w.emit1(Nullcount, 0)
+			} else {
+				w.emit1(Setcount, 1-node.m)
+			}
+		} else if node.m == 0 {
+			w.emit(Nullmark)
+		} else {
+			w.emit(Setmark)
+		}
+
+		if node.m == 0 {
+			w.pushInt(w.curPos())
+			w.emit1(Goto, 0)
+		}
+		w.pushInt(w.curPos())
+
+	case ntLoop | afterChild, ntLazyloop | afterChild:
+
+		startJumpPos := w.curPos()
+		lazy := (nodetype - (ntLoop | afterChild))
+
+		if node.n < math.MaxInt32 || node.m > 1 {
+			if node.n == math.MaxInt32 {
+				w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
+			} else {
+				w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
+			}
+		} else {
+			w.emit1(InstOp(Branchmark+lazy), w.popInt())
+		}
+
+		if node.m == 0 {
+			w.patchJump(w.popInt(), startJumpPos)
+		}
+
+	case ntGroup | beforeChild, ntGroup | afterChild:
+
+	case ntCapture | beforeChild:
+		w.emit(Setmark)
+
+	case ntCapture | afterChild:
+		w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
+
+	case ntRequire | beforeChild:
+		// NOTE: the following line causes lookahead/lookbehind to be
+		// NON-BACKTRACKING. It can be commented out with (*)
+		w.emit(Setjump)
+
+		w.emit(Setmark)
+
+	case ntRequire | afterChild:
+		w.emit(Getmark)
+
+		// NOTE: the following line causes lookahead/lookbehind to be
+		// NON-BACKTRACKING. It can be commented out with (*)
+		w.emit(Forejump)
+
+	case ntPrevent | beforeChild:
+		w.emit(Setjump)
+		w.pushInt(w.curPos())
+		w.emit1(Lazybranch, 0)
+
+	case ntPrevent | afterChild:
+		w.emit(Backjump)
+		w.patchJump(w.popInt(), w.curPos())
+		w.emit(Forejump)
+
+	case ntGreedy | beforeChild:
+		w.emit(Setjump)
+
+	case ntGreedy | afterChild:
+		w.emit(Forejump)
+
+	case ntOne, ntNotone:
+		w.emit1(InstOp(node.t|ntBits), int(node.ch))
+
+	case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
+		if node.m > 0 {
+			if node.t == ntOneloop || node.t == ntOnelazy {
+				w.emit2(Onerep|bits, int(node.ch), node.m)
+			} else {
+				w.emit2(Notonerep|bits, int(node.ch), node.m)
+			}
+		}
+		if node.n > node.m {
+			if node.n == math.MaxInt32 {
+				w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
+			} else {
+				w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
+			}
+		}
+
+	case ntSetloop, ntSetlazy:
+		if node.m > 0 {
+			w.emit2(Setrep|bits, w.setCode(node.set), node.m)
+		}
+		if node.n > node.m {
+			if node.n == math.MaxInt32 {
+				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
+			} else {
+				w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
+			}
+		}
+
+	case ntMulti:
+		w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
+
+	case ntSet:
+		w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
+
+	case ntRef:
+		w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
+
+	case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
+		w.emit(InstOp(node.t))
+
+	default:
+		return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
+	}
+
+	return nil
+}
+
+// To avoid recursion, we use a simple integer stack.
+// This is the push.
+func (w *writer) pushInt(i int) {
+	w.intStack = append(w.intStack, i)
+}
+
+// Returns true if the stack is empty.
+func (w *writer) emptyStack() bool {
+	return len(w.intStack) == 0
+}
+
+// This is the pop.
+func (w *writer) popInt() int {
+	//get our item
+	idx := len(w.intStack) - 1
+	i := w.intStack[idx]
+	//trim our slice
+	w.intStack = w.intStack[:idx]
+	return i
+}
+
+// Returns the current position in the emitted code.
+func (w *writer) curPos() int {
+	return w.curpos
+}
+
+// Fixes up a jump instruction at the specified offset
+// so that it jumps to the specified jumpDest.
+func (w *writer) patchJump(offset, jumpDest int) {
+	w.emitted[offset+1] = jumpDest
+}
+
+// Returns an index in the set table for a charset
+// uses a map to eliminate duplicates.
+func (w *writer) setCode(set *CharSet) int {
+	if w.counting {
+		return 0
+	}
+
+	buf := &bytes.Buffer{}
+
+	set.mapHashFill(buf)
+	hash := buf.String()
+	i, ok := w.sethash[hash]
+	if !ok {
+		i = len(w.sethash)
+		w.sethash[hash] = i
+		w.settable = append(w.settable, set)
+	}
+	return i
+}
+
+// Returns an index in the string table for a string.
+// uses a map to eliminate duplicates.
+func (w *writer) stringCode(str []rune) int {
+	if w.counting {
+		return 0
+	}
+
+	hash := string(str)
+	i, ok := w.stringhash[hash]
+	if !ok {
+		i = len(w.stringhash)
+		w.stringhash[hash] = i
+		w.stringtable = append(w.stringtable, str)
+	}
+
+	return i
+}
+
+// When generating code on a regex that uses a sparse set
+// of capture slots, we hash them to a dense set of indices
+// for an array of capture slots. Instead of doing the hash
+// at match time, it's done at compile time, here.
+func (w *writer) mapCapnum(capnum int) int {
+	if capnum == -1 {
+		return -1
+	}
+
+	if w.caps != nil {
+		return w.caps[capnum]
+	}
+
+	return capnum
+}
+
+// Emits a zero-argument operation. Note that the emit
+// functions all run in two modes: they can emit code, or
+// they can just count the size of the code.
+func (w *writer) emit(op InstOp) {
+	if w.counting {
+		w.count++
+		if opcodeBacktracks(op) {
+			w.trackcount++
+		}
+		return
+	}
+	w.emitted[w.curpos] = int(op)
+	w.curpos++
+}
+
+// Emits a one-argument operation.
+func (w *writer) emit1(op InstOp, opd1 int) {
+	if w.counting {
+		w.count += 2
+		if opcodeBacktracks(op) {
+			w.trackcount++
+		}
+		return
+	}
+	w.emitted[w.curpos] = int(op)
+	w.curpos++
+	w.emitted[w.curpos] = opd1
+	w.curpos++
+}
+
+// Emits a two-argument operation.
+func (w *writer) emit2(op InstOp, opd1, opd2 int) {
+	if w.counting {
+		w.count += 3
+		if opcodeBacktracks(op) {
+			w.trackcount++
+		}
+		return
+	}
+	w.emitted[w.curpos] = int(op)
+	w.curpos++
+	w.emitted[w.curpos] = opd1
+	w.curpos++
+	w.emitted[w.curpos] = opd2
+	w.curpos++
+}