package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	// Sort longest-first so that longer words are tried before their prefixes.
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
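
// Example (illustrative): because the words are sorted longest-first, longer
// keywords win over their prefixes:
//
//	Words(`\b`, `\b`, "else", "elseif")
//	// => `\b(elseif|else)\b`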

// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}
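
// Usage sketch (hypothetical; assumes a Lexer obtained elsewhere, e.g. from a
// lexer registry):
//
//	tokens, err := Tokenise(lexer, nil, "x := 1\n")
//	if err != nil {
//		// handle error
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}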

// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename clones the rules, then renames the state oldRule to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}
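
// Example (illustrative):
//
//	rules = rules.Rename("strings", "string") // state "strings" is now "string"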

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}
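
// Note (illustrative): merging replaces whole states rather than appending to
// them; a state present in "rules" completely overrides the same state in "r":
//
//	out := base.Merge(Rules{"string": {...}})
//	// out keeps base's other states, but "string" now holds only the new rules.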

// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rulesFunc)
	if err != nil {
		panic(err)
	}
	return lexer
}

// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}
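
// Construction sketch (hypothetical rules; the Keyword/Whitespace/String token
// types and the Push/Pop mutators are assumed from the surrounding package):
//
//	var myLexer = MustNewLexer(&Config{
//		Name:      "My Language",
//		Aliases:   []string{"mylang"},
//		Filenames: []string{"*.my"},
//	}, func() Rules {
//		return Rules{
//			"root": {
//				{`\s+`, Whitespace, nil},
//				{Words(`\b`, `\b`, "if", "else"), Keyword, nil},
//				{`"`, String, Push("string")},
//			},
//			"string": {
//				{`[^"]+`, String, nil},
//				{`"`, String, Pop(1)},
//			},
//		}
//	})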

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps from state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
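
// Usage sketch (hypothetical): mutators can stash values between rule firings,
// e.g. remembering a heredoc delimiter captured by an earlier rule:
//
//	l.Set("delimiter", l.Groups[1])
//	delim, _ := l.Get("delimiter").(string)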

// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			// Otherwise emit the offending rune as an Error token and move on.
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0.0
}

// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

func (r *RegexLexer) Config() *Config { // nolint
	return r.config
}

// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				// \G anchors the match at the position where the search starts,
				// so rules only ever match at the current lexer position.
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}

// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}

// ensureLF normalises line endings in a single pass, replacing \r\n and bare
// \r with \n. Equivalent to chained strings.ReplaceAll calls, but more efficient.
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				// Skip the \r of a \r\n pair; the following \n is kept.
				continue
			}
			// A bare \r becomes \n.
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}
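
// Example (illustrative):
//
//	ensureLF("a\r\nb\rc") // => "a\nb\nc"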