66
|
1 /*
|
|
2 Package regexp2 is a regexp package that has an interface similar to Go's framework regexp engine but uses a
|
|
3 more full-featured regex engine behind the scenes.
|
|
4
|
|
5 It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
|
|
6 You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
|
|
7 need to write very complex patterns or require compatibility with .NET.
|
|
8 */
|
|
9 package regexp2
|
|
10
|
|
11 import (
|
|
12 "errors"
|
|
13 "math"
|
|
14 "strconv"
|
|
15 "sync"
|
|
16 "time"
|
|
17
|
|
18 "github.com/dlclark/regexp2/syntax"
|
|
19 )
|
|
20
|
|
// DefaultMatchTimeout is the match timeout given to a Regexp by Compile.
// It is the largest representable duration, which amounts to "forever".
var DefaultMatchTimeout time.Duration = math.MaxInt64
|
|
23
|
|
24 // Regexp is the representation of a compiled regular expression.
|
|
25 // A Regexp is safe for concurrent use by multiple goroutines.
|
|
26 type Regexp struct {
|
|
27 //timeout when trying to find matches
|
|
28 MatchTimeout time.Duration
|
|
29
|
|
30 // read-only after Compile
|
|
31 pattern string // as passed to Compile
|
|
32 options RegexOptions // options
|
|
33
|
|
34 caps map[int]int // capnum->index
|
|
35 capnames map[string]int //capture group name -> index
|
|
36 capslist []string //sorted list of capture group names
|
|
37 capsize int // size of the capture array
|
|
38
|
|
39 code *syntax.Code // compiled program
|
|
40
|
|
41 // cache of machines for running regexp
|
|
42 muRun sync.Mutex
|
|
43 runner []*runner
|
|
44 }
|
|
45
|
|
46 // Compile parses a regular expression and returns, if successful,
|
|
47 // a Regexp object that can be used to match against text.
|
|
48 func Compile(expr string, opt RegexOptions) (*Regexp, error) {
|
|
49 // parse it
|
|
50 tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
|
|
51 if err != nil {
|
|
52 return nil, err
|
|
53 }
|
|
54
|
|
55 // translate it to code
|
|
56 code, err := syntax.Write(tree)
|
|
57 if err != nil {
|
|
58 return nil, err
|
|
59 }
|
|
60
|
|
61 // return it
|
|
62 return &Regexp{
|
|
63 pattern: expr,
|
|
64 options: opt,
|
|
65 caps: code.Caps,
|
|
66 capnames: tree.Capnames,
|
|
67 capslist: tree.Caplist,
|
|
68 capsize: code.Capsize,
|
|
69 code: code,
|
|
70 MatchTimeout: DefaultMatchTimeout,
|
|
71 }, nil
|
|
72 }
|
|
73
|
|
74 // MustCompile is like Compile but panics if the expression cannot be parsed.
|
|
75 // It simplifies safe initialization of global variables holding compiled regular
|
|
76 // expressions.
|
|
77 func MustCompile(str string, opt RegexOptions) *Regexp {
|
|
78 regexp, error := Compile(str, opt)
|
|
79 if error != nil {
|
|
80 panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
|
|
81 }
|
|
82 return regexp
|
|
83 }
|
|
84
|
|
85 // Escape adds backslashes to any special characters in the input string
|
|
86 func Escape(input string) string {
|
|
87 return syntax.Escape(input)
|
|
88 }
|
|
89
|
|
90 // Unescape removes any backslashes from previously-escaped special characters in the input string
|
|
91 func Unescape(input string) (string, error) {
|
|
92 return syntax.Unescape(input)
|
|
93 }
|
|
94
|
|
95 // String returns the source text used to compile the regular expression.
|
|
96 func (re *Regexp) String() string {
|
|
97 return re.pattern
|
|
98 }
|
|
99
|
|
// quote renders s as a Go string literal. A raw back-quoted literal is used
// when strconv.CanBackquote allows it; otherwise the interpreted literal
// produced by strconv.Quote is returned.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
|
|
106
|
|
// RegexOptions is a set of bit flags that impact the runtime and parsing
// behavior of a specific regex. The flags can be set in code as well as in
// the regex pattern itself.
type RegexOptions int32

// Option flags, one bit each. Only None is declared with the RegexOptions
// type; the other flags are untyped constants, which convert implicitly
// wherever a RegexOptions value is expected.
const (
	None                    RegexOptions = 0
	IgnoreCase                           = 1 << 0 // "i"
	Multiline                            = 1 << 1 // "m"
	ExplicitCapture                      = 1 << 2 // "n"
	Compiled                             = 1 << 3 // "c"
	Singleline                           = 1 << 4 // "s"
	IgnorePatternWhitespace              = 1 << 5 // "x"
	RightToLeft                          = 1 << 6 // "r"
	Debug                                = 1 << 7 // "d"
	ECMAScript                           = 1 << 8 // "e"
	RE2                                  = 1 << 9 // RE2 (regexp package) compatibility mode
)
|
|
125
|
|
126 func (re *Regexp) RightToLeft() bool {
|
|
127 return re.options&RightToLeft != 0
|
|
128 }
|
|
129
|
|
130 func (re *Regexp) Debug() bool {
|
|
131 return re.options&Debug != 0
|
|
132 }
|
|
133
|
|
134 // Replace searches the input string and replaces each match found with the replacement text.
|
|
135 // Count will limit the number of matches attempted and startAt will allow
|
|
136 // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
|
|
137 // Set startAt and count to -1 to go through the whole string
|
|
138 func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
|
|
139 data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
|
|
140 if err != nil {
|
|
141 return "", err
|
|
142 }
|
|
143 //TODO: cache ReplacerData
|
|
144
|
|
145 return replace(re, data, nil, input, startAt, count)
|
|
146 }
|
|
147
|
|
148 // ReplaceFunc searches the input string and replaces each match found using the string from the evaluator
|
|
149 // Count will limit the number of matches attempted and startAt will allow
|
|
150 // us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
|
|
151 // Set startAt and count to -1 to go through the whole string.
|
|
152 func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
|
|
153 return replace(re, nil, evaluator, input, startAt, count)
|
|
154 }
|
|
155
|
|
156 // FindStringMatch searches the input string for a Regexp match
|
|
157 func (re *Regexp) FindStringMatch(s string) (*Match, error) {
|
|
158 // convert string to runes
|
|
159 return re.run(false, -1, getRunes(s))
|
|
160 }
|
|
161
|
|
162 // FindRunesMatch searches the input rune slice for a Regexp match
|
|
163 func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
|
|
164 return re.run(false, -1, r)
|
|
165 }
|
|
166
|
|
167 // FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
|
|
168 func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
|
|
169 if startAt > len(s) {
|
|
170 return nil, errors.New("startAt must be less than the length of the input string")
|
|
171 }
|
|
172 r, startAt := re.getRunesAndStart(s, startAt)
|
|
173 if startAt == -1 {
|
|
174 // we didn't find our start index in the string -- that's a problem
|
|
175 return nil, errors.New("startAt must align to the start of a valid rune in the input string")
|
|
176 }
|
|
177
|
|
178 return re.run(false, startAt, r)
|
|
179 }
|
|
180
|
|
181 // FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index
|
|
182 func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
|
|
183 return re.run(false, startAt, r)
|
|
184 }
|
|
185
|
|
186 // FindNextMatch returns the next match in the same input string as the match parameter.
|
|
187 // Will return nil if there is no next match or if given a nil match.
|
|
188 func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
|
|
189 if m == nil {
|
|
190 return nil, nil
|
|
191 }
|
|
192
|
|
193 // If previous match was empty, advance by one before matching to prevent
|
|
194 // infinite loop
|
|
195 startAt := m.textpos
|
|
196 if m.Length == 0 {
|
|
197 if m.textpos == len(m.text) {
|
|
198 return nil, nil
|
|
199 }
|
|
200
|
|
201 if re.RightToLeft() {
|
|
202 startAt--
|
|
203 } else {
|
|
204 startAt++
|
|
205 }
|
|
206 }
|
|
207 return re.run(false, startAt, m.text)
|
|
208 }
|
|
209
|
|
210 // MatchString return true if the string matches the regex
|
|
211 // error will be set if a timeout occurs
|
|
212 func (re *Regexp) MatchString(s string) (bool, error) {
|
|
213 m, err := re.run(true, -1, getRunes(s))
|
|
214 if err != nil {
|
|
215 return false, err
|
|
216 }
|
|
217 return m != nil, nil
|
|
218 }
|
|
219
|
|
220 func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
|
|
221 if startAt < 0 {
|
|
222 if re.RightToLeft() {
|
|
223 r := getRunes(s)
|
|
224 return r, len(r)
|
|
225 }
|
|
226 return getRunes(s), 0
|
|
227 }
|
|
228 ret := make([]rune, len(s))
|
|
229 i := 0
|
|
230 runeIdx := -1
|
|
231 for strIdx, r := range s {
|
|
232 if strIdx == startAt {
|
|
233 runeIdx = i
|
|
234 }
|
|
235 ret[i] = r
|
|
236 i++
|
|
237 }
|
|
238 if startAt == len(s) {
|
|
239 runeIdx = i
|
|
240 }
|
|
241 return ret[:i], runeIdx
|
|
242 }
|
|
243
|
|
// getRunes converts s to a slice holding its individual runes.
func getRunes(s string) []rune {
	runes := []rune(s)
	return runes
}
|
|
247
|
|
248 // MatchRunes return true if the runes matches the regex
|
|
249 // error will be set if a timeout occurs
|
|
250 func (re *Regexp) MatchRunes(r []rune) (bool, error) {
|
|
251 m, err := re.run(true, -1, r)
|
|
252 if err != nil {
|
|
253 return false, err
|
|
254 }
|
|
255 return m != nil, nil
|
|
256 }
|
|
257
|
|
258 // GetGroupNames Returns the set of strings used to name capturing groups in the expression.
|
|
259 func (re *Regexp) GetGroupNames() []string {
|
|
260 var result []string
|
|
261
|
|
262 if re.capslist == nil {
|
|
263 result = make([]string, re.capsize)
|
|
264
|
|
265 for i := 0; i < len(result); i++ {
|
|
266 result[i] = strconv.Itoa(i)
|
|
267 }
|
|
268 } else {
|
|
269 result = make([]string, len(re.capslist))
|
|
270 copy(result, re.capslist)
|
|
271 }
|
|
272
|
|
273 return result
|
|
274 }
|
|
275
|
|
276 // GetGroupNumbers returns the integer group numbers corresponding to a group name.
|
|
277 func (re *Regexp) GetGroupNumbers() []int {
|
|
278 var result []int
|
|
279
|
|
280 if re.caps == nil {
|
|
281 result = make([]int, re.capsize)
|
|
282
|
|
283 for i := 0; i < len(result); i++ {
|
|
284 result[i] = i
|
|
285 }
|
|
286 } else {
|
|
287 result = make([]int, len(re.caps))
|
|
288
|
|
289 for k, v := range re.caps {
|
|
290 result[v] = k
|
|
291 }
|
|
292 }
|
|
293
|
|
294 return result
|
|
295 }
|
|
296
|
|
297 // GroupNameFromNumber retrieves a group name that corresponds to a group number.
|
|
298 // It will return "" for and unknown group number. Unnamed groups automatically
|
|
299 // receive a name that is the decimal string equivalent of its number.
|
|
300 func (re *Regexp) GroupNameFromNumber(i int) string {
|
|
301 if re.capslist == nil {
|
|
302 if i >= 0 && i < re.capsize {
|
|
303 return strconv.Itoa(i)
|
|
304 }
|
|
305
|
|
306 return ""
|
|
307 }
|
|
308
|
|
309 if re.caps != nil {
|
|
310 var ok bool
|
|
311 if i, ok = re.caps[i]; !ok {
|
|
312 return ""
|
|
313 }
|
|
314 }
|
|
315
|
|
316 if i >= 0 && i < len(re.capslist) {
|
|
317 return re.capslist[i]
|
|
318 }
|
|
319
|
|
320 return ""
|
|
321 }
|
|
322
|
|
323 // GroupNumberFromName returns a group number that corresponds to a group name.
|
|
324 // Returns -1 if the name is not a recognized group name. Numbered groups
|
|
325 // automatically get a group name that is the decimal string equivalent of its number.
|
|
326 func (re *Regexp) GroupNumberFromName(name string) int {
|
|
327 // look up name if we have a hashtable of names
|
|
328 if re.capnames != nil {
|
|
329 if k, ok := re.capnames[name]; ok {
|
|
330 return k
|
|
331 }
|
|
332
|
|
333 return -1
|
|
334 }
|
|
335
|
|
336 // convert to an int if it looks like a number
|
|
337 result := 0
|
|
338 for i := 0; i < len(name); i++ {
|
|
339 ch := name[i]
|
|
340
|
|
341 if ch > '9' || ch < '0' {
|
|
342 return -1
|
|
343 }
|
|
344
|
|
345 result *= 10
|
|
346 result += int(ch - '0')
|
|
347 }
|
|
348
|
|
349 // return int if it's in range
|
|
350 if result >= 0 && result < re.capsize {
|
|
351 return result
|
|
352 }
|
|
353
|
|
354 return -1
|
|
355 }
|