comparison: vendor/github.com/alecthomas/chroma/v2/regexp.go @ 66:787b5ee0289d (draft)

Use vendored modules

Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>

author: yakumo.izuru
date:   Sun, 23 Jul 2023 13:18:53 +0000
comparing 65:6d985efa0f7a with 66:787b5ee0289d

package chroma

import (
    "fmt"
    "os"
    "path/filepath"
    "regexp"
    "sort"
    "strings"
    "sync"
    "time"
    "unicode/utf8"

    "github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
    Pattern string
    Type    Emitter
    Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
    sort.Slice(words, func(i, j int) bool {
        return len(words[j]) < len(words[i])
    })
    for i, word := range words {
        words[i] = regexp.QuoteMeta(word)
    }
    return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
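
// Sketch (editor's illustration, not part of the vendored source): Words
// sorts longer words first so the alternation prefers the longest match.
// For example:
//
//    Words(`\b`, `\b`, "if", "in", "import")
//    // => `\b(import|if|in)\b`
//    // (equal-length words may appear in either order; sort.Slice is not stable)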

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
    var out []Token
    it, err := lexer.Tokenise(options, text)
    if err != nil {
        return nil, err
    }
    for t := it(); t != EOF; t = it() {
        out = append(out, t)
    }
    return out, nil
}
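
// Illustrative usage (editor's sketch; assumes a lexer obtained elsewhere,
// e.g. via the chroma/v2/lexers package):
//
//    tokens, err := Tokenise(someLexer, nil, "package main")
//    if err != nil {
//        // handle error
//    }
//    for _, t := range tokens {
//        fmt.Println(t.Type, t.Value)
//    }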

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the Rules, then renames state oldRule to newRule in the clone.
func (r Rules) Rename(oldRule, newRule string) Rules {
    r = r.Clone()
    r[newRule] = r[oldRule]
    delete(r, oldRule)
    return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
    out := map[string][]Rule{}
    for key, rules := range r {
        out[key] = make([]Rule, len(rules))
        copy(out[key], rules)
    }
    return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
    out := r.Clone()
    for k, v := range rules.Clone() {
        out[k] = v
    }
    return out
}
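
// Hypothetical sketch of composing these helpers (the rule sets are invented
// for illustration). Note that Merge replaces whole states from its argument
// rather than appending rules within a state:
//
//    base := Rules{"root": {{`\d+`, Number, nil}}}
//    merged := base.Merge(Rules{"string": {{`"[^"]*"`, String, nil}}})
//    renamed := merged.Rename("string", "dq-string")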

// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
    lexer, err := NewLexer(config, rulesFunc)
    if err != nil {
        panic(err)
    }
    return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
    if config == nil {
        config = &Config{}
    }
    for _, glob := range append(config.Filenames, config.AliasFilenames...) {
        _, err := filepath.Match(glob, "")
        if err != nil {
            return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
        }
    }
    r := &RegexLexer{
        config:         config,
        fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
    }
    // One-off code to generate XML lexers in the Chroma source tree.
    // var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
    // name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
    // data, err := Marshal(r)
    // if err != nil {
    //     if errors.Is(err, ErrNotSerialisable) {
    //         fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
    //         return r, nil
    //     }
    //     return nil, err
    // }
    // _, file, _, ok := runtime.Caller(2)
    // if !ok {
    //     panic("??")
    // }
    // fmt.Println(file)
    // if strings.Contains(file, "/lexers/") {
    //     dir := filepath.Join(filepath.Dir(file), "embedded")
    //     err = os.MkdirAll(dir, 0700)
    //     if err != nil {
    //         return nil, err
    //     }
    //     filename := filepath.Join(dir, name) + ".xml"
    //     fmt.Println(filename)
    //     err = ioutil.WriteFile(filename, data, 0600)
    //     if err != nil {
    //         return nil, err
    //     }
    // }
    return r, nil
}
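
// Minimal usage sketch (editor's illustration; this INI lexer definition is
// invented, not part of this file):
//
//    var ini = MustNewLexer(
//        &Config{Name: "INI", Filenames: []string{"*.ini"}},
//        func() Rules {
//            return Rules{
//                "root": {
//                    {`\s+`, Whitespace, nil},
//                    {`;.*`, Comment, nil},
//                    {`\[.*?\]`, Keyword, nil},
//                    {`(.*?)(\s*)(=)(\s*)(.*)`, ByGroups(Name, Whitespace, Operator, Whitespace, String), nil},
//                },
//            }
//        },
//    )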

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
    r.trace = trace
    return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
    Rule
    Regexp *regexp2.Regexp
    flags  string
}

// CompiledRules maps a state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
    Lexer    *RegexLexer
    Registry *LexerRegistry
    Text     []rune
    Pos      int
    Rules    CompiledRules
    Stack    []string
    State    string
    Rule     int
    // Group matches.
    Groups []string
    // Named Group matches.
    NamedGroups map[string]string
    // Custom context for mutators.
    MutatorContext map[interface{}]interface{}
    iteratorStack  []Iterator
    options        *TokeniseOptions
    newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
    l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
    return l.MutatorContext[key]
}
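
// Sketch of how a mutator might use this context (editor's illustration; the
// key type and counter are invented, and MutatorFunc is assumed to be
// chroma's function adapter for Mutator):
//
//    type depthKey struct{}
//
//    var trackDepth = MutatorFunc(func(l *LexerState) error {
//        depth, _ := l.Get(depthKey{}).(int)
//        l.Set(depthKey{}, depth+1)
//        return nil
//    })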

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
    end := len(l.Text)
    if l.newlineAdded {
        end--
    }
    for l.Pos < end && len(l.Stack) > 0 {
        // Exhaust the iterator stack, if any.
        for len(l.iteratorStack) > 0 {
            n := len(l.iteratorStack) - 1
            t := l.iteratorStack[n]()
            if t == EOF {
                l.iteratorStack = l.iteratorStack[:n]
                continue
            }
            return t
        }

        l.State = l.Stack[len(l.Stack)-1]
        if l.Lexer.trace {
            fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
        }
        selectedRule, ok := l.Rules[l.State]
        if !ok {
            panic("unknown state " + l.State)
        }
        ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
        // No match.
        if groups == nil {
            // From Pygments :\
            //
            // If the RegexLexer encounters a newline that is flagged as an error token, the stack is
            // emptied and the lexer continues scanning in the 'root' state. This helps produce
            // error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
            // closed.
            if l.Text[l.Pos] == '\n' && l.State != l.options.State {
                l.Stack = []string{l.options.State}
                continue
            }
            l.Pos++
            return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
        }
        l.Rule = ruleIndex
        l.Groups = groups
        l.NamedGroups = namedGroups
        l.Pos += utf8.RuneCountInString(groups[0])
        if rule.Mutator != nil {
            if err := rule.Mutator.Mutate(l); err != nil {
                panic(err)
            }
        }
        if rule.Type != nil {
            l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
        }
    }
    // Exhaust any remaining iterators on the stack. This duplicates the loop
    // above, but factoring it out isn't worth the indirection.
    for len(l.iteratorStack) > 0 {
        n := len(l.iteratorStack) - 1
        t := l.iteratorStack[n]()
        if t == EOF {
            l.iteratorStack = l.iteratorStack[:n]
            continue
        }
        return t
    }

    // If we get to here and we still have text, return it as an error.
    if l.Pos != len(l.Text) && len(l.Stack) == 0 {
        value := string(l.Text[l.Pos:])
        l.Pos = len(l.Text)
        return Token{Type: Error, Value: value}
    }
    return EOF
}
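
// Behavioural sketch (editor's illustration, hypothetical rule set): given a
// root state that matches only letters and emits Name, tokenising "ab!cd"
// yields
//
//    Token{Name, "ab"}, Token{Error, "!"}, Token{Name, "cd"}
//
// because each unmatched rune becomes a one-rune Error token rather than
// aborting the lex.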

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
    registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
    config   *Config
    analyser func(text string) float32
    trace    bool

    mu             sync.Mutex
    compiled       bool
    rawRules       Rules
    rules          map[string][]*CompiledRule
    fetchRulesFunc func() (Rules, error)
    compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
    return r.config.Name
}

// Rules returns the Lexer's rules, generating them on first use if necessary.
func (r *RegexLexer) Rules() (Rules, error) {
    if err := r.needRules(); err != nil {
        return nil, err
    }
    return r.rawRules, nil
}

// SetRegistry sets the registry the lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
    r.registry = registry
    return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
    r.analyser = analyser
    return r
}
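
// Sketch of a custom analyser (editor's illustration; the shebang heuristic
// is invented, not part of the vendored source):
//
//    lexer.SetAnalyser(func(text string) float32 {
//        if strings.HasPrefix(text, "#!/bin/sh") {
//            return 1.0
//        }
//        return 0.0
//    })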

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
    if r.analyser != nil {
        return r.analyser(text)
    }
    return 0.0
}

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
    r.config = config
    return r
}

func (r *RegexLexer) Config() *Config { // nolint
    return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
    r.mu.Lock()
    defer r.mu.Unlock()
    if r.compiled {
        return nil
    }
    for state, rules := range r.rules {
        for i, rule := range rules {
            if rule.Regexp == nil {
                pattern := "(?:" + rule.Pattern + ")"
                if rule.flags != "" {
                    pattern = "(?" + rule.flags + ")" + pattern
                }
                pattern = `\G` + pattern
                rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
                if err != nil {
                    return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
                }
                rule.Regexp.MatchTimeout = time.Millisecond * 250
            }
        }
    }
restart:
    seen := map[LexerMutator]bool{}
    for state := range r.rules {
        for i := 0; i < len(r.rules[state]); i++ {
            rule := r.rules[state][i]
            if compile, ok := rule.Mutator.(LexerMutator); ok {
                if seen[compile] {
                    return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
                }
                seen[compile] = true
                if err := compile.MutateLexer(r.rules, state, i); err != nil {
                    return err
                }
                // Process the rules again in case the mutator added/removed rules.
                //
                // This sounds bad, but shouldn't be significant in practice.
                goto restart
            }
        }
    }
    r.compiled = true
    return nil
}
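
// Worked example of the pattern composition above (editor's note; values
// invented): a rule with Pattern `\d+` and flags "mi" is compiled as
//
//    \G(?mi)(?:\d+)
//
// The \G anchor pins each match to the current lexer position, regexp2.RE2
// selects RE2-compatible expression syntax, and the 250ms MatchTimeout
// bounds pathological matching.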

func (r *RegexLexer) fetchRules() error {
    rules, err := r.fetchRulesFunc()
    if err != nil {
        return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
    }
    if _, ok := rules["root"]; !ok {
        return fmt.Errorf("no \"root\" state")
    }
    compiledRules := map[string][]*CompiledRule{}
    for state, rules := range rules {
        compiledRules[state] = nil
        for _, rule := range rules {
            flags := ""
            if !r.config.NotMultiline {
                flags += "m"
            }
            if r.config.CaseInsensitive {
                flags += "i"
            }
            if r.config.DotAll {
                flags += "s"
            }
            compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
        }
    }

    r.rawRules = rules
    r.rules = compiledRules
    return nil
}

func (r *RegexLexer) needRules() error {
    var err error
    if r.fetchRulesFunc != nil {
        r.compileOnce.Do(func() {
            err = r.fetchRules()
        })
    }
    if err := r.maybeCompile(); err != nil {
        return err
    }
    return err
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
    err := r.needRules()
    if err != nil {
        return nil, err
    }
    if options == nil {
        options = defaultOptions
    }
    if options.EnsureLF {
        text = ensureLF(text)
    }
    newlineAdded := false
    if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
        text += "\n"
        newlineAdded = true
    }
    state := &LexerState{
        Registry:       r.registry,
        newlineAdded:   newlineAdded,
        options:        options,
        Lexer:          r,
        Text:           []rune(text),
        Stack:          []string{options.State},
        Rules:          r.rules,
        MutatorContext: map[interface{}]interface{}{},
    }
    return state.Iterator, nil
}

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
    rules, err := r.Rules()
    if err != nil {
        panic(err)
    }
    return rules
}

func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
    for i, rule := range rules {
        match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
        if match != nil && err == nil && match.Index == pos {
            groups := []string{}
            namedGroups := make(map[string]string)
            for _, g := range match.Groups() {
                namedGroups[g.Name] = g.String()
                groups = append(groups, g.String())
            }
            return i, rule, groups, namedGroups
        }
    }
    return 0, &CompiledRule{}, nil, nil
}

// ensureLF replaces \r\n and lone \r with \n.
// It is equivalent to chained strings.ReplaceAll calls, but runs in a single pass.
func ensureLF(text string) string {
    buf := make([]byte, len(text))
    var j int
    for i := 0; i < len(text); i++ {
        c := text[i]
        if c == '\r' {
            if i < len(text)-1 && text[i+1] == '\n' {
                continue
            }
            c = '\n'
        }
        buf[j] = c
        j++
    }
    return string(buf[:j])
}
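
// Behavioural sketch (editor's illustration):
//
//    ensureLF("a\r\nb\rc\n") // => "a\nb\nc\n"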