66
|
1 package syntax
|
|
2
|
|
3 import (
|
|
4 "bytes"
|
|
5 "fmt"
|
|
6 "math"
|
|
7 )
|
|
8
|
|
9 // similar to prog.go in the go regex package...also with comment 'may not belong in this package'
|
|
10
|
|
11 // File provides operator constants for use by the Builder and the Machine.
|
|
12
|
|
13 // Implementation notes:
|
|
14 //
|
|
15 // Regexps are built into RegexCodes, which contain an operation array,
|
|
16 // a string table, and some constants.
|
|
17 //
|
|
18 // Each operation is one of the codes below, followed by the integer
|
|
19 // operands specified for each op.
|
|
20 //
|
|
21 // Strings and sets are indices into a string table.
|
|
22
|
|
23 type InstOp int
|
|
24
|
|
25 const (
|
|
26 // lef/back operands description
|
|
27
|
|
28 Onerep InstOp = 0 // lef,back char,min,max a {n}
|
|
29 Notonerep = 1 // lef,back char,min,max .{n}
|
|
30 Setrep = 2 // lef,back set,min,max [\d]{n}
|
|
31
|
|
32 Oneloop = 3 // lef,back char,min,max a {,n}
|
|
33 Notoneloop = 4 // lef,back char,min,max .{,n}
|
|
34 Setloop = 5 // lef,back set,min,max [\d]{,n}
|
|
35
|
|
36 Onelazy = 6 // lef,back char,min,max a {,n}?
|
|
37 Notonelazy = 7 // lef,back char,min,max .{,n}?
|
|
38 Setlazy = 8 // lef,back set,min,max [\d]{,n}?
|
|
39
|
|
40 One = 9 // lef char a
|
|
41 Notone = 10 // lef char [^a]
|
|
42 Set = 11 // lef set [a-z\s] \w \s \d
|
|
43
|
|
44 Multi = 12 // lef string abcd
|
|
45 Ref = 13 // lef group \#
|
|
46
|
|
47 Bol = 14 // ^
|
|
48 Eol = 15 // $
|
|
49 Boundary = 16 // \b
|
|
50 Nonboundary = 17 // \B
|
|
51 Beginning = 18 // \A
|
|
52 Start = 19 // \G
|
|
53 EndZ = 20 // \Z
|
|
54 End = 21 // \Z
|
|
55
|
|
56 Nothing = 22 // Reject!
|
|
57
|
|
58 // Primitive control structures
|
|
59
|
|
60 Lazybranch = 23 // back jump straight first
|
|
61 Branchmark = 24 // back jump branch first for loop
|
|
62 Lazybranchmark = 25 // back jump straight first for loop
|
|
63 Nullcount = 26 // back val set counter, null mark
|
|
64 Setcount = 27 // back val set counter, make mark
|
|
65 Branchcount = 28 // back jump,limit branch++ if zero<=c<limit
|
|
66 Lazybranchcount = 29 // back jump,limit same, but straight first
|
|
67 Nullmark = 30 // back save position
|
|
68 Setmark = 31 // back save position
|
|
69 Capturemark = 32 // back group define group
|
|
70 Getmark = 33 // back recall position
|
|
71 Setjump = 34 // back save backtrack state
|
|
72 Backjump = 35 // zap back to saved state
|
|
73 Forejump = 36 // zap backtracking state
|
|
74 Testref = 37 // backtrack if ref undefined
|
|
75 Goto = 38 // jump just go
|
|
76
|
|
77 Prune = 39 // prune it baby
|
|
78 Stop = 40 // done!
|
|
79
|
|
80 ECMABoundary = 41 // \b
|
|
81 NonECMABoundary = 42 // \B
|
|
82
|
|
83 // Modifiers for alternate modes
|
|
84
|
|
85 Mask = 63 // Mask to get unmodified ordinary operator
|
|
86 Rtl = 64 // bit to indicate that we're reverse scanning.
|
|
87 Back = 128 // bit to indicate that we're backtracking.
|
|
88 Back2 = 256 // bit to indicate that we're backtracking on a second branch.
|
|
89 Ci = 512 // bit to indicate that we're case-insensitive.
|
|
90 )
|
|
91
|
|
92 type Code struct {
|
|
93 Codes []int // the code
|
|
94 Strings [][]rune // string table
|
|
95 Sets []*CharSet //character set table
|
|
96 TrackCount int // how many instructions use backtracking
|
|
97 Caps map[int]int // mapping of user group numbers -> impl group slots
|
|
98 Capsize int // number of impl group slots
|
|
99 FcPrefix *Prefix // the set of candidate first characters (may be null)
|
|
100 BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null)
|
|
101 Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc)
|
|
102 RightToLeft bool // true if right to left
|
|
103 }
|
|
104
|
|
105 func opcodeBacktracks(op InstOp) bool {
|
|
106 op &= Mask
|
|
107
|
|
108 switch op {
|
|
109 case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
|
|
110 Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
|
|
111 Forejump, Goto:
|
|
112 return true
|
|
113
|
|
114 default:
|
|
115 return false
|
|
116 }
|
|
117 }
|
|
118
|
|
119 func opcodeSize(op InstOp) int {
|
|
120 op &= Mask
|
|
121
|
|
122 switch op {
|
|
123 case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
|
|
124 End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
|
|
125 return 1
|
|
126
|
|
127 case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
|
|
128 Prune, Set:
|
|
129 return 2
|
|
130
|
|
131 case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
|
|
132 Setlazy, Setrep, Setloop:
|
|
133 return 3
|
|
134
|
|
135 default:
|
|
136 panic(fmt.Errorf("Unexpected op code: %v", op))
|
|
137 }
|
|
138 }
|
|
139
|
|
140 var codeStr = []string{
|
|
141 "Onerep", "Notonerep", "Setrep",
|
|
142 "Oneloop", "Notoneloop", "Setloop",
|
|
143 "Onelazy", "Notonelazy", "Setlazy",
|
|
144 "One", "Notone", "Set",
|
|
145 "Multi", "Ref",
|
|
146 "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
|
|
147 "Nothing",
|
|
148 "Lazybranch", "Branchmark", "Lazybranchmark",
|
|
149 "Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
|
|
150 "Nullmark", "Setmark", "Capturemark", "Getmark",
|
|
151 "Setjump", "Backjump", "Forejump", "Testref", "Goto",
|
|
152 "Prune", "Stop",
|
|
153 "ECMABoundary", "NonECMABoundary",
|
|
154 }
|
|
155
|
|
156 func operatorDescription(op InstOp) string {
|
|
157 desc := codeStr[op&Mask]
|
|
158 if (op & Ci) != 0 {
|
|
159 desc += "-Ci"
|
|
160 }
|
|
161 if (op & Rtl) != 0 {
|
|
162 desc += "-Rtl"
|
|
163 }
|
|
164 if (op & Back) != 0 {
|
|
165 desc += "-Back"
|
|
166 }
|
|
167 if (op & Back2) != 0 {
|
|
168 desc += "-Back2"
|
|
169 }
|
|
170
|
|
171 return desc
|
|
172 }
|
|
173
|
|
174 // OpcodeDescription is a humman readable string of the specific offset
|
|
175 func (c *Code) OpcodeDescription(offset int) string {
|
|
176 buf := &bytes.Buffer{}
|
|
177
|
|
178 op := InstOp(c.Codes[offset])
|
|
179 fmt.Fprintf(buf, "%06d ", offset)
|
|
180
|
|
181 if opcodeBacktracks(op & Mask) {
|
|
182 buf.WriteString("*")
|
|
183 } else {
|
|
184 buf.WriteString(" ")
|
|
185 }
|
|
186 buf.WriteString(operatorDescription(op))
|
|
187 buf.WriteString("(")
|
|
188 op &= Mask
|
|
189
|
|
190 switch op {
|
|
191 case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
|
|
192 buf.WriteString("Ch = ")
|
|
193 buf.WriteString(CharDescription(rune(c.Codes[offset+1])))
|
|
194
|
|
195 case Set, Setrep, Setloop, Setlazy:
|
|
196 buf.WriteString("Set = ")
|
|
197 buf.WriteString(c.Sets[c.Codes[offset+1]].String())
|
|
198
|
|
199 case Multi:
|
|
200 fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))
|
|
201
|
|
202 case Ref, Testref:
|
|
203 fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
|
|
204
|
|
205 case Capturemark:
|
|
206 fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
|
|
207 if c.Codes[offset+2] != -1 {
|
|
208 fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
|
|
209 }
|
|
210
|
|
211 case Nullcount, Setcount:
|
|
212 fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])
|
|
213
|
|
214 case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
|
|
215 fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
|
|
216 }
|
|
217
|
|
218 switch op {
|
|
219 case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
|
|
220 buf.WriteString(", Rep = ")
|
|
221 if c.Codes[offset+2] == math.MaxInt32 {
|
|
222 buf.WriteString("inf")
|
|
223 } else {
|
|
224 fmt.Fprintf(buf, "%d", c.Codes[offset+2])
|
|
225 }
|
|
226
|
|
227 case Branchcount, Lazybranchcount:
|
|
228 buf.WriteString(", Limit = ")
|
|
229 if c.Codes[offset+2] == math.MaxInt32 {
|
|
230 buf.WriteString("inf")
|
|
231 } else {
|
|
232 fmt.Fprintf(buf, "%d", c.Codes[offset+2])
|
|
233 }
|
|
234
|
|
235 }
|
|
236
|
|
237 buf.WriteString(")")
|
|
238
|
|
239 return buf.String()
|
|
240 }
|
|
241
|
|
242 func (c *Code) Dump() string {
|
|
243 buf := &bytes.Buffer{}
|
|
244
|
|
245 if c.RightToLeft {
|
|
246 fmt.Fprintln(buf, "Direction: right-to-left")
|
|
247 } else {
|
|
248 fmt.Fprintln(buf, "Direction: left-to-right")
|
|
249 }
|
|
250 if c.FcPrefix == nil {
|
|
251 fmt.Fprintln(buf, "Firstchars: n/a")
|
|
252 } else {
|
|
253 fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
|
|
254 }
|
|
255
|
|
256 if c.BmPrefix == nil {
|
|
257 fmt.Fprintln(buf, "Prefix: n/a")
|
|
258 } else {
|
|
259 fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String()))
|
|
260 }
|
|
261
|
|
262 fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors)
|
|
263 fmt.Fprintln(buf)
|
|
264
|
|
265 if c.BmPrefix != nil {
|
|
266 fmt.Fprintln(buf, "BoyerMoore:")
|
|
267 fmt.Fprintln(buf, c.BmPrefix.Dump(" "))
|
|
268 }
|
|
269 for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
|
|
270 fmt.Fprintln(buf, c.OpcodeDescription(i))
|
|
271 }
|
|
272
|
|
273 return buf.String()
|
|
274 }
|