Mercurial > yakumo_izuru > aya
comparison vendor/github.com/dlclark/regexp2/syntax/writer.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author | yakumo.izuru |
---|---|
date | Sun, 23 Jul 2023 13:18:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
65:6d985efa0f7a | 66:787b5ee0289d |
---|---|
1 package syntax | |
2 | |
3 import ( | |
4 "bytes" | |
5 "fmt" | |
6 "math" | |
7 "os" | |
8 ) | |
9 | |
10 func Write(tree *RegexTree) (*Code, error) { | |
11 w := writer{ | |
12 intStack: make([]int, 0, 32), | |
13 emitted: make([]int, 2), | |
14 stringhash: make(map[string]int), | |
15 sethash: make(map[string]int), | |
16 } | |
17 | |
18 code, err := w.codeFromTree(tree) | |
19 | |
20 if tree.options&Debug > 0 && code != nil { | |
21 os.Stdout.WriteString(code.Dump()) | |
22 os.Stdout.WriteString("\n") | |
23 } | |
24 | |
25 return code, err | |
26 } | |
27 | |
28 type writer struct { | |
29 emitted []int | |
30 | |
31 intStack []int | |
32 curpos int | |
33 stringhash map[string]int | |
34 stringtable [][]rune | |
35 sethash map[string]int | |
36 settable []*CharSet | |
37 counting bool | |
38 count int | |
39 trackcount int | |
40 caps map[int]int | |
41 } | |
42 | |
43 const ( | |
44 beforeChild nodeType = 64 | |
45 afterChild = 128 | |
46 //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix | |
47 MaxPrefixSize = 50 | |
48 ) | |
49 | |
50 // The top level RegexCode generator. It does a depth-first walk | |
51 // through the tree and calls EmitFragment to emits code before | |
52 // and after each child of an interior node, and at each leaf. | |
53 // | |
54 // It runs two passes, first to count the size of the generated | |
55 // code, and second to generate the code. | |
56 // | |
57 // We should time it against the alternative, which is | |
58 // to just generate the code and grow the array as we go. | |
59 func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) { | |
60 var ( | |
61 curNode *regexNode | |
62 curChild int | |
63 capsize int | |
64 ) | |
65 // construct sparse capnum mapping if some numbers are unused | |
66 | |
67 if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) { | |
68 capsize = tree.captop | |
69 w.caps = nil | |
70 } else { | |
71 capsize = len(tree.capnumlist) | |
72 w.caps = tree.caps | |
73 for i := 0; i < len(tree.capnumlist); i++ { | |
74 w.caps[tree.capnumlist[i]] = i | |
75 } | |
76 } | |
77 | |
78 w.counting = true | |
79 | |
80 for { | |
81 if !w.counting { | |
82 w.emitted = make([]int, w.count) | |
83 } | |
84 | |
85 curNode = tree.root | |
86 curChild = 0 | |
87 | |
88 w.emit1(Lazybranch, 0) | |
89 | |
90 for { | |
91 if len(curNode.children) == 0 { | |
92 w.emitFragment(curNode.t, curNode, 0) | |
93 } else if curChild < len(curNode.children) { | |
94 w.emitFragment(curNode.t|beforeChild, curNode, curChild) | |
95 | |
96 curNode = curNode.children[curChild] | |
97 | |
98 w.pushInt(curChild) | |
99 curChild = 0 | |
100 continue | |
101 } | |
102 | |
103 if w.emptyStack() { | |
104 break | |
105 } | |
106 | |
107 curChild = w.popInt() | |
108 curNode = curNode.next | |
109 | |
110 w.emitFragment(curNode.t|afterChild, curNode, curChild) | |
111 curChild++ | |
112 } | |
113 | |
114 w.patchJump(0, w.curPos()) | |
115 w.emit(Stop) | |
116 | |
117 if !w.counting { | |
118 break | |
119 } | |
120 | |
121 w.counting = false | |
122 } | |
123 | |
124 fcPrefix := getFirstCharsPrefix(tree) | |
125 prefix := getPrefix(tree) | |
126 rtl := (tree.options & RightToLeft) != 0 | |
127 | |
128 var bmPrefix *BmPrefix | |
129 //TODO: benchmark string prefixes | |
130 if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 { | |
131 if len(prefix.PrefixStr) > MaxPrefixSize { | |
132 // limit prefix changes to 10k | |
133 prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize] | |
134 } | |
135 bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl) | |
136 } else { | |
137 bmPrefix = nil | |
138 } | |
139 | |
140 return &Code{ | |
141 Codes: w.emitted, | |
142 Strings: w.stringtable, | |
143 Sets: w.settable, | |
144 TrackCount: w.trackcount, | |
145 Caps: w.caps, | |
146 Capsize: capsize, | |
147 FcPrefix: fcPrefix, | |
148 BmPrefix: bmPrefix, | |
149 Anchors: getAnchors(tree), | |
150 RightToLeft: rtl, | |
151 }, nil | |
152 } | |
153 | |
154 // The main RegexCode generator. It does a depth-first walk | |
155 // through the tree and calls EmitFragment to emits code before | |
156 // and after each child of an interior node, and at each leaf. | |
157 func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error { | |
158 bits := InstOp(0) | |
159 | |
160 if nodetype <= ntRef { | |
161 if (node.options & RightToLeft) != 0 { | |
162 bits |= Rtl | |
163 } | |
164 if (node.options & IgnoreCase) != 0 { | |
165 bits |= Ci | |
166 } | |
167 } | |
168 ntBits := nodeType(bits) | |
169 | |
170 switch nodetype { | |
171 case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty: | |
172 break | |
173 | |
174 case ntAlternate | beforeChild: | |
175 if curIndex < len(node.children)-1 { | |
176 w.pushInt(w.curPos()) | |
177 w.emit1(Lazybranch, 0) | |
178 } | |
179 | |
180 case ntAlternate | afterChild: | |
181 if curIndex < len(node.children)-1 { | |
182 lbPos := w.popInt() | |
183 w.pushInt(w.curPos()) | |
184 w.emit1(Goto, 0) | |
185 w.patchJump(lbPos, w.curPos()) | |
186 } else { | |
187 for i := 0; i < curIndex; i++ { | |
188 w.patchJump(w.popInt(), w.curPos()) | |
189 } | |
190 } | |
191 break | |
192 | |
193 case ntTestref | beforeChild: | |
194 if curIndex == 0 { | |
195 w.emit(Setjump) | |
196 w.pushInt(w.curPos()) | |
197 w.emit1(Lazybranch, 0) | |
198 w.emit1(Testref, w.mapCapnum(node.m)) | |
199 w.emit(Forejump) | |
200 } | |
201 | |
202 case ntTestref | afterChild: | |
203 if curIndex == 0 { | |
204 branchpos := w.popInt() | |
205 w.pushInt(w.curPos()) | |
206 w.emit1(Goto, 0) | |
207 w.patchJump(branchpos, w.curPos()) | |
208 w.emit(Forejump) | |
209 if len(node.children) <= 1 { | |
210 w.patchJump(w.popInt(), w.curPos()) | |
211 } | |
212 } else if curIndex == 1 { | |
213 w.patchJump(w.popInt(), w.curPos()) | |
214 } | |
215 | |
216 case ntTestgroup | beforeChild: | |
217 if curIndex == 0 { | |
218 w.emit(Setjump) | |
219 w.emit(Setmark) | |
220 w.pushInt(w.curPos()) | |
221 w.emit1(Lazybranch, 0) | |
222 } | |
223 | |
224 case ntTestgroup | afterChild: | |
225 if curIndex == 0 { | |
226 w.emit(Getmark) | |
227 w.emit(Forejump) | |
228 } else if curIndex == 1 { | |
229 Branchpos := w.popInt() | |
230 w.pushInt(w.curPos()) | |
231 w.emit1(Goto, 0) | |
232 w.patchJump(Branchpos, w.curPos()) | |
233 w.emit(Getmark) | |
234 w.emit(Forejump) | |
235 if len(node.children) <= 2 { | |
236 w.patchJump(w.popInt(), w.curPos()) | |
237 } | |
238 } else if curIndex == 2 { | |
239 w.patchJump(w.popInt(), w.curPos()) | |
240 } | |
241 | |
242 case ntLoop | beforeChild, ntLazyloop | beforeChild: | |
243 | |
244 if node.n < math.MaxInt32 || node.m > 1 { | |
245 if node.m == 0 { | |
246 w.emit1(Nullcount, 0) | |
247 } else { | |
248 w.emit1(Setcount, 1-node.m) | |
249 } | |
250 } else if node.m == 0 { | |
251 w.emit(Nullmark) | |
252 } else { | |
253 w.emit(Setmark) | |
254 } | |
255 | |
256 if node.m == 0 { | |
257 w.pushInt(w.curPos()) | |
258 w.emit1(Goto, 0) | |
259 } | |
260 w.pushInt(w.curPos()) | |
261 | |
262 case ntLoop | afterChild, ntLazyloop | afterChild: | |
263 | |
264 startJumpPos := w.curPos() | |
265 lazy := (nodetype - (ntLoop | afterChild)) | |
266 | |
267 if node.n < math.MaxInt32 || node.m > 1 { | |
268 if node.n == math.MaxInt32 { | |
269 w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32) | |
270 } else { | |
271 w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m) | |
272 } | |
273 } else { | |
274 w.emit1(InstOp(Branchmark+lazy), w.popInt()) | |
275 } | |
276 | |
277 if node.m == 0 { | |
278 w.patchJump(w.popInt(), startJumpPos) | |
279 } | |
280 | |
281 case ntGroup | beforeChild, ntGroup | afterChild: | |
282 | |
283 case ntCapture | beforeChild: | |
284 w.emit(Setmark) | |
285 | |
286 case ntCapture | afterChild: | |
287 w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n)) | |
288 | |
289 case ntRequire | beforeChild: | |
290 // NOTE: the following line causes lookahead/lookbehind to be | |
291 // NON-BACKTRACKING. It can be commented out with (*) | |
292 w.emit(Setjump) | |
293 | |
294 w.emit(Setmark) | |
295 | |
296 case ntRequire | afterChild: | |
297 w.emit(Getmark) | |
298 | |
299 // NOTE: the following line causes lookahead/lookbehind to be | |
300 // NON-BACKTRACKING. It can be commented out with (*) | |
301 w.emit(Forejump) | |
302 | |
303 case ntPrevent | beforeChild: | |
304 w.emit(Setjump) | |
305 w.pushInt(w.curPos()) | |
306 w.emit1(Lazybranch, 0) | |
307 | |
308 case ntPrevent | afterChild: | |
309 w.emit(Backjump) | |
310 w.patchJump(w.popInt(), w.curPos()) | |
311 w.emit(Forejump) | |
312 | |
313 case ntGreedy | beforeChild: | |
314 w.emit(Setjump) | |
315 | |
316 case ntGreedy | afterChild: | |
317 w.emit(Forejump) | |
318 | |
319 case ntOne, ntNotone: | |
320 w.emit1(InstOp(node.t|ntBits), int(node.ch)) | |
321 | |
322 case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy: | |
323 if node.m > 0 { | |
324 if node.t == ntOneloop || node.t == ntOnelazy { | |
325 w.emit2(Onerep|bits, int(node.ch), node.m) | |
326 } else { | |
327 w.emit2(Notonerep|bits, int(node.ch), node.m) | |
328 } | |
329 } | |
330 if node.n > node.m { | |
331 if node.n == math.MaxInt32 { | |
332 w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32) | |
333 } else { | |
334 w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m) | |
335 } | |
336 } | |
337 | |
338 case ntSetloop, ntSetlazy: | |
339 if node.m > 0 { | |
340 w.emit2(Setrep|bits, w.setCode(node.set), node.m) | |
341 } | |
342 if node.n > node.m { | |
343 if node.n == math.MaxInt32 { | |
344 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32) | |
345 } else { | |
346 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m) | |
347 } | |
348 } | |
349 | |
350 case ntMulti: | |
351 w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str)) | |
352 | |
353 case ntSet: | |
354 w.emit1(InstOp(node.t|ntBits), w.setCode(node.set)) | |
355 | |
356 case ntRef: | |
357 w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m)) | |
358 | |
359 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: | |
360 w.emit(InstOp(node.t)) | |
361 | |
362 default: | |
363 return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype) | |
364 } | |
365 | |
366 return nil | |
367 } | |
368 | |
369 // To avoid recursion, we use a simple integer stack. | |
370 // This is the push. | |
371 func (w *writer) pushInt(i int) { | |
372 w.intStack = append(w.intStack, i) | |
373 } | |
374 | |
375 // Returns true if the stack is empty. | |
376 func (w *writer) emptyStack() bool { | |
377 return len(w.intStack) == 0 | |
378 } | |
379 | |
380 // This is the pop. | |
381 func (w *writer) popInt() int { | |
382 //get our item | |
383 idx := len(w.intStack) - 1 | |
384 i := w.intStack[idx] | |
385 //trim our slice | |
386 w.intStack = w.intStack[:idx] | |
387 return i | |
388 } | |
389 | |
390 // Returns the current position in the emitted code. | |
391 func (w *writer) curPos() int { | |
392 return w.curpos | |
393 } | |
394 | |
395 // Fixes up a jump instruction at the specified offset | |
396 // so that it jumps to the specified jumpDest. | |
397 func (w *writer) patchJump(offset, jumpDest int) { | |
398 w.emitted[offset+1] = jumpDest | |
399 } | |
400 | |
401 // Returns an index in the set table for a charset | |
402 // uses a map to eliminate duplicates. | |
403 func (w *writer) setCode(set *CharSet) int { | |
404 if w.counting { | |
405 return 0 | |
406 } | |
407 | |
408 buf := &bytes.Buffer{} | |
409 | |
410 set.mapHashFill(buf) | |
411 hash := buf.String() | |
412 i, ok := w.sethash[hash] | |
413 if !ok { | |
414 i = len(w.sethash) | |
415 w.sethash[hash] = i | |
416 w.settable = append(w.settable, set) | |
417 } | |
418 return i | |
419 } | |
420 | |
421 // Returns an index in the string table for a string. | |
422 // uses a map to eliminate duplicates. | |
423 func (w *writer) stringCode(str []rune) int { | |
424 if w.counting { | |
425 return 0 | |
426 } | |
427 | |
428 hash := string(str) | |
429 i, ok := w.stringhash[hash] | |
430 if !ok { | |
431 i = len(w.stringhash) | |
432 w.stringhash[hash] = i | |
433 w.stringtable = append(w.stringtable, str) | |
434 } | |
435 | |
436 return i | |
437 } | |
438 | |
439 // When generating code on a regex that uses a sparse set | |
440 // of capture slots, we hash them to a dense set of indices | |
441 // for an array of capture slots. Instead of doing the hash | |
442 // at match time, it's done at compile time, here. | |
443 func (w *writer) mapCapnum(capnum int) int { | |
444 if capnum == -1 { | |
445 return -1 | |
446 } | |
447 | |
448 if w.caps != nil { | |
449 return w.caps[capnum] | |
450 } | |
451 | |
452 return capnum | |
453 } | |
454 | |
455 // Emits a zero-argument operation. Note that the emit | |
456 // functions all run in two modes: they can emit code, or | |
457 // they can just count the size of the code. | |
458 func (w *writer) emit(op InstOp) { | |
459 if w.counting { | |
460 w.count++ | |
461 if opcodeBacktracks(op) { | |
462 w.trackcount++ | |
463 } | |
464 return | |
465 } | |
466 w.emitted[w.curpos] = int(op) | |
467 w.curpos++ | |
468 } | |
469 | |
470 // Emits a one-argument operation. | |
471 func (w *writer) emit1(op InstOp, opd1 int) { | |
472 if w.counting { | |
473 w.count += 2 | |
474 if opcodeBacktracks(op) { | |
475 w.trackcount++ | |
476 } | |
477 return | |
478 } | |
479 w.emitted[w.curpos] = int(op) | |
480 w.curpos++ | |
481 w.emitted[w.curpos] = opd1 | |
482 w.curpos++ | |
483 } | |
484 | |
485 // Emits a two-argument operation. | |
486 func (w *writer) emit2(op InstOp, opd1, opd2 int) { | |
487 if w.counting { | |
488 w.count += 3 | |
489 if opcodeBacktracks(op) { | |
490 w.trackcount++ | |
491 } | |
492 return | |
493 } | |
494 w.emitted[w.curpos] = int(op) | |
495 w.curpos++ | |
496 w.emitted[w.curpos] = opd1 | |
497 w.curpos++ | |
498 w.emitted[w.curpos] = opd2 | |
499 w.curpos++ | |
500 } |