66
|
1 package syntax
|
|
2
|
|
3 import (
|
|
4 "bytes"
|
|
5 "fmt"
|
|
6 "math"
|
|
7 "os"
|
|
8 )
|
|
9
|
|
10 func Write(tree *RegexTree) (*Code, error) {
|
|
11 w := writer{
|
|
12 intStack: make([]int, 0, 32),
|
|
13 emitted: make([]int, 2),
|
|
14 stringhash: make(map[string]int),
|
|
15 sethash: make(map[string]int),
|
|
16 }
|
|
17
|
|
18 code, err := w.codeFromTree(tree)
|
|
19
|
|
20 if tree.options&Debug > 0 && code != nil {
|
|
21 os.Stdout.WriteString(code.Dump())
|
|
22 os.Stdout.WriteString("\n")
|
|
23 }
|
|
24
|
|
25 return code, err
|
|
26 }
|
|
27
|
|
28 type writer struct {
|
|
29 emitted []int
|
|
30
|
|
31 intStack []int
|
|
32 curpos int
|
|
33 stringhash map[string]int
|
|
34 stringtable [][]rune
|
|
35 sethash map[string]int
|
|
36 settable []*CharSet
|
|
37 counting bool
|
|
38 count int
|
|
39 trackcount int
|
|
40 caps map[int]int
|
|
41 }
|
|
42
|
|
43 const (
|
|
44 beforeChild nodeType = 64
|
|
45 afterChild = 128
|
|
46 //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
|
|
47 MaxPrefixSize = 50
|
|
48 )
|
|
49
|
|
50 // The top level RegexCode generator. It does a depth-first walk
|
|
51 // through the tree and calls EmitFragment to emits code before
|
|
52 // and after each child of an interior node, and at each leaf.
|
|
53 //
|
|
54 // It runs two passes, first to count the size of the generated
|
|
55 // code, and second to generate the code.
|
|
56 //
|
|
57 // We should time it against the alternative, which is
|
|
58 // to just generate the code and grow the array as we go.
|
|
59 func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
|
|
60 var (
|
|
61 curNode *regexNode
|
|
62 curChild int
|
|
63 capsize int
|
|
64 )
|
|
65 // construct sparse capnum mapping if some numbers are unused
|
|
66
|
|
67 if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
|
|
68 capsize = tree.captop
|
|
69 w.caps = nil
|
|
70 } else {
|
|
71 capsize = len(tree.capnumlist)
|
|
72 w.caps = tree.caps
|
|
73 for i := 0; i < len(tree.capnumlist); i++ {
|
|
74 w.caps[tree.capnumlist[i]] = i
|
|
75 }
|
|
76 }
|
|
77
|
|
78 w.counting = true
|
|
79
|
|
80 for {
|
|
81 if !w.counting {
|
|
82 w.emitted = make([]int, w.count)
|
|
83 }
|
|
84
|
|
85 curNode = tree.root
|
|
86 curChild = 0
|
|
87
|
|
88 w.emit1(Lazybranch, 0)
|
|
89
|
|
90 for {
|
|
91 if len(curNode.children) == 0 {
|
|
92 w.emitFragment(curNode.t, curNode, 0)
|
|
93 } else if curChild < len(curNode.children) {
|
|
94 w.emitFragment(curNode.t|beforeChild, curNode, curChild)
|
|
95
|
|
96 curNode = curNode.children[curChild]
|
|
97
|
|
98 w.pushInt(curChild)
|
|
99 curChild = 0
|
|
100 continue
|
|
101 }
|
|
102
|
|
103 if w.emptyStack() {
|
|
104 break
|
|
105 }
|
|
106
|
|
107 curChild = w.popInt()
|
|
108 curNode = curNode.next
|
|
109
|
|
110 w.emitFragment(curNode.t|afterChild, curNode, curChild)
|
|
111 curChild++
|
|
112 }
|
|
113
|
|
114 w.patchJump(0, w.curPos())
|
|
115 w.emit(Stop)
|
|
116
|
|
117 if !w.counting {
|
|
118 break
|
|
119 }
|
|
120
|
|
121 w.counting = false
|
|
122 }
|
|
123
|
|
124 fcPrefix := getFirstCharsPrefix(tree)
|
|
125 prefix := getPrefix(tree)
|
|
126 rtl := (tree.options & RightToLeft) != 0
|
|
127
|
|
128 var bmPrefix *BmPrefix
|
|
129 //TODO: benchmark string prefixes
|
|
130 if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
|
|
131 if len(prefix.PrefixStr) > MaxPrefixSize {
|
|
132 // limit prefix changes to 10k
|
|
133 prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
|
|
134 }
|
|
135 bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
|
|
136 } else {
|
|
137 bmPrefix = nil
|
|
138 }
|
|
139
|
|
140 return &Code{
|
|
141 Codes: w.emitted,
|
|
142 Strings: w.stringtable,
|
|
143 Sets: w.settable,
|
|
144 TrackCount: w.trackcount,
|
|
145 Caps: w.caps,
|
|
146 Capsize: capsize,
|
|
147 FcPrefix: fcPrefix,
|
|
148 BmPrefix: bmPrefix,
|
|
149 Anchors: getAnchors(tree),
|
|
150 RightToLeft: rtl,
|
|
151 }, nil
|
|
152 }
|
|
153
|
|
154 // The main RegexCode generator. It does a depth-first walk
|
|
155 // through the tree and calls EmitFragment to emits code before
|
|
156 // and after each child of an interior node, and at each leaf.
|
|
157 func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
|
|
158 bits := InstOp(0)
|
|
159
|
|
160 if nodetype <= ntRef {
|
|
161 if (node.options & RightToLeft) != 0 {
|
|
162 bits |= Rtl
|
|
163 }
|
|
164 if (node.options & IgnoreCase) != 0 {
|
|
165 bits |= Ci
|
|
166 }
|
|
167 }
|
|
168 ntBits := nodeType(bits)
|
|
169
|
|
170 switch nodetype {
|
|
171 case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
|
|
172 break
|
|
173
|
|
174 case ntAlternate | beforeChild:
|
|
175 if curIndex < len(node.children)-1 {
|
|
176 w.pushInt(w.curPos())
|
|
177 w.emit1(Lazybranch, 0)
|
|
178 }
|
|
179
|
|
180 case ntAlternate | afterChild:
|
|
181 if curIndex < len(node.children)-1 {
|
|
182 lbPos := w.popInt()
|
|
183 w.pushInt(w.curPos())
|
|
184 w.emit1(Goto, 0)
|
|
185 w.patchJump(lbPos, w.curPos())
|
|
186 } else {
|
|
187 for i := 0; i < curIndex; i++ {
|
|
188 w.patchJump(w.popInt(), w.curPos())
|
|
189 }
|
|
190 }
|
|
191 break
|
|
192
|
|
193 case ntTestref | beforeChild:
|
|
194 if curIndex == 0 {
|
|
195 w.emit(Setjump)
|
|
196 w.pushInt(w.curPos())
|
|
197 w.emit1(Lazybranch, 0)
|
|
198 w.emit1(Testref, w.mapCapnum(node.m))
|
|
199 w.emit(Forejump)
|
|
200 }
|
|
201
|
|
202 case ntTestref | afterChild:
|
|
203 if curIndex == 0 {
|
|
204 branchpos := w.popInt()
|
|
205 w.pushInt(w.curPos())
|
|
206 w.emit1(Goto, 0)
|
|
207 w.patchJump(branchpos, w.curPos())
|
|
208 w.emit(Forejump)
|
|
209 if len(node.children) <= 1 {
|
|
210 w.patchJump(w.popInt(), w.curPos())
|
|
211 }
|
|
212 } else if curIndex == 1 {
|
|
213 w.patchJump(w.popInt(), w.curPos())
|
|
214 }
|
|
215
|
|
216 case ntTestgroup | beforeChild:
|
|
217 if curIndex == 0 {
|
|
218 w.emit(Setjump)
|
|
219 w.emit(Setmark)
|
|
220 w.pushInt(w.curPos())
|
|
221 w.emit1(Lazybranch, 0)
|
|
222 }
|
|
223
|
|
224 case ntTestgroup | afterChild:
|
|
225 if curIndex == 0 {
|
|
226 w.emit(Getmark)
|
|
227 w.emit(Forejump)
|
|
228 } else if curIndex == 1 {
|
|
229 Branchpos := w.popInt()
|
|
230 w.pushInt(w.curPos())
|
|
231 w.emit1(Goto, 0)
|
|
232 w.patchJump(Branchpos, w.curPos())
|
|
233 w.emit(Getmark)
|
|
234 w.emit(Forejump)
|
|
235 if len(node.children) <= 2 {
|
|
236 w.patchJump(w.popInt(), w.curPos())
|
|
237 }
|
|
238 } else if curIndex == 2 {
|
|
239 w.patchJump(w.popInt(), w.curPos())
|
|
240 }
|
|
241
|
|
242 case ntLoop | beforeChild, ntLazyloop | beforeChild:
|
|
243
|
|
244 if node.n < math.MaxInt32 || node.m > 1 {
|
|
245 if node.m == 0 {
|
|
246 w.emit1(Nullcount, 0)
|
|
247 } else {
|
|
248 w.emit1(Setcount, 1-node.m)
|
|
249 }
|
|
250 } else if node.m == 0 {
|
|
251 w.emit(Nullmark)
|
|
252 } else {
|
|
253 w.emit(Setmark)
|
|
254 }
|
|
255
|
|
256 if node.m == 0 {
|
|
257 w.pushInt(w.curPos())
|
|
258 w.emit1(Goto, 0)
|
|
259 }
|
|
260 w.pushInt(w.curPos())
|
|
261
|
|
262 case ntLoop | afterChild, ntLazyloop | afterChild:
|
|
263
|
|
264 startJumpPos := w.curPos()
|
|
265 lazy := (nodetype - (ntLoop | afterChild))
|
|
266
|
|
267 if node.n < math.MaxInt32 || node.m > 1 {
|
|
268 if node.n == math.MaxInt32 {
|
|
269 w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
|
|
270 } else {
|
|
271 w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
|
|
272 }
|
|
273 } else {
|
|
274 w.emit1(InstOp(Branchmark+lazy), w.popInt())
|
|
275 }
|
|
276
|
|
277 if node.m == 0 {
|
|
278 w.patchJump(w.popInt(), startJumpPos)
|
|
279 }
|
|
280
|
|
281 case ntGroup | beforeChild, ntGroup | afterChild:
|
|
282
|
|
283 case ntCapture | beforeChild:
|
|
284 w.emit(Setmark)
|
|
285
|
|
286 case ntCapture | afterChild:
|
|
287 w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
|
|
288
|
|
289 case ntRequire | beforeChild:
|
|
290 // NOTE: the following line causes lookahead/lookbehind to be
|
|
291 // NON-BACKTRACKING. It can be commented out with (*)
|
|
292 w.emit(Setjump)
|
|
293
|
|
294 w.emit(Setmark)
|
|
295
|
|
296 case ntRequire | afterChild:
|
|
297 w.emit(Getmark)
|
|
298
|
|
299 // NOTE: the following line causes lookahead/lookbehind to be
|
|
300 // NON-BACKTRACKING. It can be commented out with (*)
|
|
301 w.emit(Forejump)
|
|
302
|
|
303 case ntPrevent | beforeChild:
|
|
304 w.emit(Setjump)
|
|
305 w.pushInt(w.curPos())
|
|
306 w.emit1(Lazybranch, 0)
|
|
307
|
|
308 case ntPrevent | afterChild:
|
|
309 w.emit(Backjump)
|
|
310 w.patchJump(w.popInt(), w.curPos())
|
|
311 w.emit(Forejump)
|
|
312
|
|
313 case ntGreedy | beforeChild:
|
|
314 w.emit(Setjump)
|
|
315
|
|
316 case ntGreedy | afterChild:
|
|
317 w.emit(Forejump)
|
|
318
|
|
319 case ntOne, ntNotone:
|
|
320 w.emit1(InstOp(node.t|ntBits), int(node.ch))
|
|
321
|
|
322 case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
|
|
323 if node.m > 0 {
|
|
324 if node.t == ntOneloop || node.t == ntOnelazy {
|
|
325 w.emit2(Onerep|bits, int(node.ch), node.m)
|
|
326 } else {
|
|
327 w.emit2(Notonerep|bits, int(node.ch), node.m)
|
|
328 }
|
|
329 }
|
|
330 if node.n > node.m {
|
|
331 if node.n == math.MaxInt32 {
|
|
332 w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
|
|
333 } else {
|
|
334 w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
|
|
335 }
|
|
336 }
|
|
337
|
|
338 case ntSetloop, ntSetlazy:
|
|
339 if node.m > 0 {
|
|
340 w.emit2(Setrep|bits, w.setCode(node.set), node.m)
|
|
341 }
|
|
342 if node.n > node.m {
|
|
343 if node.n == math.MaxInt32 {
|
|
344 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
|
|
345 } else {
|
|
346 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
|
|
347 }
|
|
348 }
|
|
349
|
|
350 case ntMulti:
|
|
351 w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
|
|
352
|
|
353 case ntSet:
|
|
354 w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
|
|
355
|
|
356 case ntRef:
|
|
357 w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
|
|
358
|
|
359 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
|
360 w.emit(InstOp(node.t))
|
|
361
|
|
362 default:
|
|
363 return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
|
|
364 }
|
|
365
|
|
366 return nil
|
|
367 }
|
|
368
|
|
369 // To avoid recursion, we use a simple integer stack.
|
|
370 // This is the push.
|
|
371 func (w *writer) pushInt(i int) {
|
|
372 w.intStack = append(w.intStack, i)
|
|
373 }
|
|
374
|
|
375 // Returns true if the stack is empty.
|
|
376 func (w *writer) emptyStack() bool {
|
|
377 return len(w.intStack) == 0
|
|
378 }
|
|
379
|
|
380 // This is the pop.
|
|
381 func (w *writer) popInt() int {
|
|
382 //get our item
|
|
383 idx := len(w.intStack) - 1
|
|
384 i := w.intStack[idx]
|
|
385 //trim our slice
|
|
386 w.intStack = w.intStack[:idx]
|
|
387 return i
|
|
388 }
|
|
389
|
|
390 // Returns the current position in the emitted code.
|
|
391 func (w *writer) curPos() int {
|
|
392 return w.curpos
|
|
393 }
|
|
394
|
|
395 // Fixes up a jump instruction at the specified offset
|
|
396 // so that it jumps to the specified jumpDest.
|
|
397 func (w *writer) patchJump(offset, jumpDest int) {
|
|
398 w.emitted[offset+1] = jumpDest
|
|
399 }
|
|
400
|
|
401 // Returns an index in the set table for a charset
|
|
402 // uses a map to eliminate duplicates.
|
|
403 func (w *writer) setCode(set *CharSet) int {
|
|
404 if w.counting {
|
|
405 return 0
|
|
406 }
|
|
407
|
|
408 buf := &bytes.Buffer{}
|
|
409
|
|
410 set.mapHashFill(buf)
|
|
411 hash := buf.String()
|
|
412 i, ok := w.sethash[hash]
|
|
413 if !ok {
|
|
414 i = len(w.sethash)
|
|
415 w.sethash[hash] = i
|
|
416 w.settable = append(w.settable, set)
|
|
417 }
|
|
418 return i
|
|
419 }
|
|
420
|
|
421 // Returns an index in the string table for a string.
|
|
422 // uses a map to eliminate duplicates.
|
|
423 func (w *writer) stringCode(str []rune) int {
|
|
424 if w.counting {
|
|
425 return 0
|
|
426 }
|
|
427
|
|
428 hash := string(str)
|
|
429 i, ok := w.stringhash[hash]
|
|
430 if !ok {
|
|
431 i = len(w.stringhash)
|
|
432 w.stringhash[hash] = i
|
|
433 w.stringtable = append(w.stringtable, str)
|
|
434 }
|
|
435
|
|
436 return i
|
|
437 }
|
|
438
|
|
439 // When generating code on a regex that uses a sparse set
|
|
440 // of capture slots, we hash them to a dense set of indices
|
|
441 // for an array of capture slots. Instead of doing the hash
|
|
442 // at match time, it's done at compile time, here.
|
|
443 func (w *writer) mapCapnum(capnum int) int {
|
|
444 if capnum == -1 {
|
|
445 return -1
|
|
446 }
|
|
447
|
|
448 if w.caps != nil {
|
|
449 return w.caps[capnum]
|
|
450 }
|
|
451
|
|
452 return capnum
|
|
453 }
|
|
454
|
|
455 // Emits a zero-argument operation. Note that the emit
|
|
456 // functions all run in two modes: they can emit code, or
|
|
457 // they can just count the size of the code.
|
|
458 func (w *writer) emit(op InstOp) {
|
|
459 if w.counting {
|
|
460 w.count++
|
|
461 if opcodeBacktracks(op) {
|
|
462 w.trackcount++
|
|
463 }
|
|
464 return
|
|
465 }
|
|
466 w.emitted[w.curpos] = int(op)
|
|
467 w.curpos++
|
|
468 }
|
|
469
|
|
470 // Emits a one-argument operation.
|
|
471 func (w *writer) emit1(op InstOp, opd1 int) {
|
|
472 if w.counting {
|
|
473 w.count += 2
|
|
474 if opcodeBacktracks(op) {
|
|
475 w.trackcount++
|
|
476 }
|
|
477 return
|
|
478 }
|
|
479 w.emitted[w.curpos] = int(op)
|
|
480 w.curpos++
|
|
481 w.emitted[w.curpos] = opd1
|
|
482 w.curpos++
|
|
483 }
|
|
484
|
|
485 // Emits a two-argument operation.
|
|
486 func (w *writer) emit2(op InstOp, opd1, opd2 int) {
|
|
487 if w.counting {
|
|
488 w.count += 3
|
|
489 if opcodeBacktracks(op) {
|
|
490 w.trackcount++
|
|
491 }
|
|
492 return
|
|
493 }
|
|
494 w.emitted[w.curpos] = int(op)
|
|
495 w.curpos++
|
|
496 w.emitted[w.curpos] = opd1
|
|
497 w.curpos++
|
|
498 w.emitted[w.curpos] = opd2
|
|
499 w.curpos++
|
|
500 }
|