diff vendor/github.com/alecthomas/chroma/v2/regexp.go @ 66:787b5ee0289d draft

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author yakumo.izuru
date Sun, 23 Jul 2023 13:18:53 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vendor/github.com/alecthomas/chroma/v2/regexp.go	Sun Jul 23 13:18:53 2023 +0000
@@ -0,0 +1,480 @@
+package chroma
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+	"unicode/utf8"
+
+	"github.com/dlclark/regexp2"
+)
+
+// A Rule is the fundamental matching unit of the Regex lexer state machine.
+type Rule struct {
+	Pattern string
+	Type    Emitter
+	Mutator Mutator
+}
+
+// Words creates a regex that matches any of the given literal words.
+func Words(prefix, suffix string, words ...string) string {
+	sort.Slice(words, func(i, j int) bool {
+		return len(words[j]) < len(words[i])
+	})
+	for i, word := range words {
+		words[i] = regexp.QuoteMeta(word)
+	}
+	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
+}
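+
+// A minimal usage sketch (hypothetical values). Because longer words sort
+// first, a short word cannot shadow a longer alternative:
+//
+//	pattern := Words(`\b`, `\b`, "in", "internal")
+//	// pattern == `\b(internal|in)\b`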
+
+// Tokenise text using lexer, returning tokens as a slice.
+func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
+	var out []Token
+	it, err := lexer.Tokenise(options, text)
+	if err != nil {
+		return nil, err
+	}
+	for t := it(); t != EOF; t = it() {
+		out = append(out, t)
+	}
+	return out, nil
+}
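+
+// A sketch of collecting tokens eagerly (assumes a lexer value is in scope;
+// error handling elided):
+//
+//	tokens, err := Tokenise(lexer, nil, source)
+//	if err == nil {
+//		for _, t := range tokens {
+//			fmt.Printf("%s %q\n", t.Type, t.Value)
+//		}
+//	}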
+
+// Rules maps from state to a sequence of Rules.
+type Rules map[string][]Rule
+
+// Rename clones the rules, then renames oldRule to newRule.
+func (r Rules) Rename(oldRule, newRule string) Rules {
+	r = r.Clone()
+	r[newRule] = r[oldRule]
+	delete(r, oldRule)
+	return r
+}
+
+// Clone returns a clone of the Rules.
+func (r Rules) Clone() Rules {
+	out := map[string][]Rule{}
+	for key, rules := range r {
+		out[key] = make([]Rule, len(rules))
+		copy(out[key], rules)
+	}
+	return out
+}
+
+// Merge creates a clone of "r" then merges "rules" into the clone.
+func (r Rules) Merge(rules Rules) Rules {
+	out := r.Clone()
+	for k, v := range rules.Clone() {
+		out[k] = v
+	}
+	return out
+}
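+
+// A small sketch combining the Rules helpers (hypothetical states; Whitespace
+// and Comment are token types defined elsewhere in this package):
+//
+//	base := Rules{"root": {{`\s+`, Whitespace, nil}}}
+//	merged := base.Merge(Rules{"comment": {{`//.*`, Comment, nil}}})
+//	merged = merged.Rename("comment", "commentSingle")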
+
+// MustNewLexer creates a new Lexer with deferred rules generation or panics.
+func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
+	lexer, err := NewLexer(config, rulesFunc)
+	if err != nil {
+		panic(err)
+	}
+	return lexer
+}
+
+// NewLexer creates a new regex-based Lexer.
+//
+// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
+// that match input, optionally modify lexer state, and output tokens.
+func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
+	if config == nil {
+		config = &Config{}
+	}
+	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
+		_, err := filepath.Match(glob, "")
+		if err != nil {
+			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
+		}
+	}
+	r := &RegexLexer{
+		config:         config,
+		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
+	}
+	// One-off code to generate XML lexers in the Chroma source tree.
+	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
+	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
+	// data, err := Marshal(r)
+	// if err != nil {
+	// 	if errors.Is(err, ErrNotSerialisable) {
+	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
+	// 		return r, nil
+	// 	}
+	// 	return nil, err
+	// }
+	// _, file, _, ok := runtime.Caller(2)
+	// if !ok {
+	// 	panic("??")
+	// }
+	// fmt.Println(file)
+	// if strings.Contains(file, "/lexers/") {
+	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
+	// 	err = os.MkdirAll(dir, 0700)
+	// 	if err != nil {
+	// 		return nil, err
+	// 	}
+	// 	filename := filepath.Join(dir, name) + ".xml"
+	// 	fmt.Println(filename)
+	// 	err = ioutil.WriteFile(filename, data, 0600)
+	// 	if err != nil {
+	// 		return nil, err
+	// 	}
+	// }
+	return r, nil
+}
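+
+// A minimal sketch of a deferred-rules lexer. Push and Pop are mutators, and
+// String, Whitespace and Name are token types, all defined elsewhere in this
+// package; the config values are hypothetical:
+//
+//	var example = MustNewLexer(&Config{
+//		Name:      "Example",
+//		Filenames: []string{"*.example"},
+//	}, func() Rules {
+//		return Rules{
+//			"root": {
+//				{`"`, String, Push("string")},
+//				{`\s+`, Whitespace, nil},
+//				{`\w+`, Name, nil},
+//			},
+//			"string": {
+//				{`[^"]+`, String, nil},
+//				{`"`, String, Pop(1)},
+//			},
+//		}
+//	})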
+
+// Trace enables debug tracing.
+func (r *RegexLexer) Trace(trace bool) *RegexLexer {
+	r.trace = trace
+	return r
+}
+
+// A CompiledRule is a Rule with a pre-compiled regex.
+//
+// Note that regular expressions are lazily compiled on first use of the lexer.
+type CompiledRule struct {
+	Rule
+	Regexp *regexp2.Regexp
+	flags  string
+}
+
+// CompiledRules is a map of state name to the sequence of compiled rules in that state.
+type CompiledRules map[string][]*CompiledRule
+
+// LexerState contains the state for a single lex.
+type LexerState struct {
+	Lexer    *RegexLexer
+	Registry *LexerRegistry
+	Text     []rune
+	Pos      int
+	Rules    CompiledRules
+	Stack    []string
+	State    string
+	Rule     int
+	// Group matches.
+	Groups []string
+	// Named Group matches.
+	NamedGroups map[string]string
+	// Custom context for mutators.
+	MutatorContext map[interface{}]interface{}
+	iteratorStack  []Iterator
+	options        *TokeniseOptions
+	newlineAdded   bool
+}
+
+// Set mutator context.
+func (l *LexerState) Set(key interface{}, value interface{}) {
+	l.MutatorContext[key] = value
+}
+
+// Get mutator context.
+func (l *LexerState) Get(key interface{}) interface{} {
+	return l.MutatorContext[key]
+}
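+
+// A sketch of a custom mutator using the context (the "depth" key is
+// hypothetical):
+//
+//	MutatorFunc(func(l *LexerState) error {
+//		depth, _ := l.Get("depth").(int)
+//		l.Set("depth", depth+1)
+//		return nil
+//	})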
+
+// Iterator returns the next Token from the lexer.
+func (l *LexerState) Iterator() Token { // nolint: gocognit
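+	// When a trailing newline was artificially added (EnsureNL), stop scanning
+	// one rune early so the synthetic newline is never lexed on its own.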
+	end := len(l.Text)
+	if l.newlineAdded {
+		end--
+	}
+	for l.Pos < end && len(l.Stack) > 0 {
+		// Exhaust the iterator stack, if any.
+		for len(l.iteratorStack) > 0 {
+			n := len(l.iteratorStack) - 1
+			t := l.iteratorStack[n]()
+			if t == EOF {
+				l.iteratorStack = l.iteratorStack[:n]
+				continue
+			}
+			return t
+		}
+
+		l.State = l.Stack[len(l.Stack)-1]
+		if l.Lexer.trace {
+			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
+		}
+		selectedRule, ok := l.Rules[l.State]
+		if !ok {
+			panic("unknown state " + l.State)
+		}
+		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
+		// No match.
+		if groups == nil {
+			// From Pygments :\
+			//
+			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
+			// emptied and the lexer continues scanning in the 'root' state. This can help produce
+			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
+			// closed.
+			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
+				l.Stack = []string{l.options.State}
+				continue
+			}
+			l.Pos++
+			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
+		}
+		l.Rule = ruleIndex
+		l.Groups = groups
+		l.NamedGroups = namedGroups
+		l.Pos += utf8.RuneCountInString(groups[0])
+		if rule.Mutator != nil {
+			if err := rule.Mutator.Mutate(l); err != nil {
+				panic(err)
+			}
+		}
+		if rule.Type != nil {
+			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
+		}
+	}
+	// Exhaust the iterator stack, if any.
+	// This duplicates the loop above, but keeps the main loop simple.
+	for len(l.iteratorStack) > 0 {
+		n := len(l.iteratorStack) - 1
+		t := l.iteratorStack[n]()
+		if t == EOF {
+			l.iteratorStack = l.iteratorStack[:n]
+			continue
+		}
+		return t
+	}
+
+	// If we get here and still have text remaining, return it as an error.
+	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
+		value := string(l.Text[l.Pos:])
+		l.Pos = len(l.Text)
+		return Token{Type: Error, Value: value}
+	}
+	return EOF
+}
+
+// RegexLexer is the default lexer implementation used in Chroma.
+type RegexLexer struct {
+	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
+	config   *Config
+	analyser func(text string) float32
+	trace    bool
+
+	mu             sync.Mutex
+	compiled       bool
+	rawRules       Rules
+	rules          map[string][]*CompiledRule
+	fetchRulesFunc func() (Rules, error)
+	compileOnce    sync.Once
+}
+
+func (r *RegexLexer) String() string {
+	return r.config.Name
+}
+
+// Rules returns the rules used by the Lexer.
+func (r *RegexLexer) Rules() (Rules, error) {
+	if err := r.needRules(); err != nil {
+		return nil, err
+	}
+	return r.rawRules, nil
+}
+
+// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
+func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
+	r.registry = registry
+	return r
+}
+
+// SetAnalyser sets the analyser function used to perform content inspection.
+func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
+	r.analyser = analyser
+	return r
+}
+
+func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
+	if r.analyser != nil {
+		return r.analyser(text)
+	}
+	return 0.0
+}
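+
+// A sketch of attaching an analyser (by convention scores fall in 0.0-1.0):
+//
+//	lexer.SetAnalyser(func(text string) float32 {
+//		if strings.HasPrefix(text, "#!/bin/sh") {
+//			return 1.0
+//		}
+//		return 0.0
+//	})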
+
+// SetConfig replaces the Config for this Lexer.
+func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
+	r.config = config
+	return r
+}
+
+func (r *RegexLexer) Config() *Config { // nolint
+	return r.config
+}
+
+// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
+func (r *RegexLexer) maybeCompile() (err error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.compiled {
+		return nil
+	}
+	for state, rules := range r.rules {
+		for i, rule := range rules {
+			if rule.Regexp == nil {
+				pattern := "(?:" + rule.Pattern + ")"
+				if rule.flags != "" {
+					pattern = "(?" + rule.flags + ")" + pattern
+				}
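+				// \G anchors the match at the current scan position; the
+				// regexp2 engine supports this anchor, while Go's standard
+				// regexp package does not.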
+				pattern = `\G` + pattern
+				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
+				if err != nil {
+					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
+				}
+				rule.Regexp.MatchTimeout = time.Millisecond * 250
+			}
+		}
+	}
+restart:
+	seen := map[LexerMutator]bool{}
+	for state := range r.rules {
+		for i := 0; i < len(r.rules[state]); i++ {
+			rule := r.rules[state][i]
+			if compile, ok := rule.Mutator.(LexerMutator); ok {
+				if seen[compile] {
+					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
+				}
+				seen[compile] = true
+				if err := compile.MutateLexer(r.rules, state, i); err != nil {
+					return err
+				}
+				// Process the rules again in case the mutator added/removed rules.
+				//
+				// This sounds bad, but shouldn't be significant in practice.
+				goto restart
+			}
+		}
+	}
+	r.compiled = true
+	return nil
+}
+
+func (r *RegexLexer) fetchRules() error {
+	rules, err := r.fetchRulesFunc()
+	if err != nil {
+		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
+	}
+	if _, ok := rules["root"]; !ok {
+		return fmt.Errorf("no \"root\" state")
+	}
+	compiledRules := map[string][]*CompiledRule{}
+	for state, rules := range rules {
+		compiledRules[state] = nil
+		for _, rule := range rules {
+			flags := ""
+			if !r.config.NotMultiline {
+				flags += "m"
+			}
+			if r.config.CaseInsensitive {
+				flags += "i"
+			}
+			if r.config.DotAll {
+				flags += "s"
+			}
+			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
+		}
+	}
+
+	r.rawRules = rules
+	r.rules = compiledRules
+	return nil
+}
+
+func (r *RegexLexer) needRules() error {
+	var err error
+	if r.fetchRulesFunc != nil {
+		r.compileOnce.Do(func() {
+			err = r.fetchRules()
+		})
+	}
+	if err := r.maybeCompile(); err != nil {
+		return err
+	}
+	return err
+}
+
+func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
+	err := r.needRules()
+	if err != nil {
+		return nil, err
+	}
+	if options == nil {
+		options = defaultOptions
+	}
+	if options.EnsureLF {
+		text = ensureLF(text)
+	}
+	newlineAdded := false
+	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
+		text += "\n"
+		newlineAdded = true
+	}
+	state := &LexerState{
+		Registry:       r.registry,
+		newlineAdded:   newlineAdded,
+		options:        options,
+		Lexer:          r,
+		Text:           []rune(text),
+		Stack:          []string{options.State},
+		Rules:          r.rules,
+		MutatorContext: map[interface{}]interface{}{},
+	}
+	return state.Iterator, nil
+}
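+
+// A sketch of consuming the returned iterator (error handling elided):
+//
+//	it, err := lexer.Tokenise(nil, "hello\n")
+//	if err == nil {
+//		for t := it(); t != EOF; t = it() {
+//			fmt.Println(t)
+//		}
+//	}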
+
+// MustRules is like Rules() but will panic on error.
+func (r *RegexLexer) MustRules() Rules {
+	rules, err := r.Rules()
+	if err != nil {
+		panic(err)
+	}
+	return rules
+}
+
+func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
+	for i, rule := range rules {
+		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
+		if match != nil && err == nil && match.Index == pos {
+			groups := []string{}
+			namedGroups := make(map[string]string)
+			for _, g := range match.Groups() {
+				namedGroups[g.Name] = g.String()
+				groups = append(groups, g.String())
+			}
+			return i, rule, groups, namedGroups
+		}
+	}
+	return 0, &CompiledRule{}, nil, nil
+}
+
+// ensureLF replaces \r and \r\n with \n.
+// It is equivalent to successive strings.ReplaceAll calls, but more efficient.
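+//
+// For example, ensureLF("a\r\nb\rc") returns "a\nb\nc".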
+func ensureLF(text string) string {
+	buf := make([]byte, len(text))
+	var j int
+	for i := 0; i < len(text); i++ {
+		c := text[i]
+		if c == '\r' {
+			if i < len(text)-1 && text[i+1] == '\n' {
+				continue
+			}
+			c = '\n'
+		}
+		buf[j] = c
+		j++
+	}
+	return string(buf[:j])
+}