Mercurial > yakumo_izuru > aya
comparison vendor/github.com/dlclark/regexp2/syntax/writer.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
| author | yakumo.izuru |
|---|---|
| date | Sun, 23 Jul 2023 13:18:53 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 65:6d985efa0f7a | 66:787b5ee0289d |
|---|---|
| 1 package syntax | |
| 2 | |
| 3 import ( | |
| 4 "bytes" | |
| 5 "fmt" | |
| 6 "math" | |
| 7 "os" | |
| 8 ) | |
| 9 | |
| 10 func Write(tree *RegexTree) (*Code, error) { | |
| 11 w := writer{ | |
| 12 intStack: make([]int, 0, 32), | |
| 13 emitted: make([]int, 2), | |
| 14 stringhash: make(map[string]int), | |
| 15 sethash: make(map[string]int), | |
| 16 } | |
| 17 | |
| 18 code, err := w.codeFromTree(tree) | |
| 19 | |
| 20 if tree.options&Debug > 0 && code != nil { | |
| 21 os.Stdout.WriteString(code.Dump()) | |
| 22 os.Stdout.WriteString("\n") | |
| 23 } | |
| 24 | |
| 25 return code, err | |
| 26 } | |
| 27 | |
| 28 type writer struct { | |
| 29 emitted []int | |
| 30 | |
| 31 intStack []int | |
| 32 curpos int | |
| 33 stringhash map[string]int | |
| 34 stringtable [][]rune | |
| 35 sethash map[string]int | |
| 36 settable []*CharSet | |
| 37 counting bool | |
| 38 count int | |
| 39 trackcount int | |
| 40 caps map[int]int | |
| 41 } | |
| 42 | |
| 43 const ( | |
| 44 beforeChild nodeType = 64 | |
| 45 afterChild = 128 | |
| 46 //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix | |
| 47 MaxPrefixSize = 50 | |
| 48 ) | |
| 49 | |
| 50 // The top level RegexCode generator. It does a depth-first walk | |
| 51 // through the tree and calls EmitFragment to emits code before | |
| 52 // and after each child of an interior node, and at each leaf. | |
| 53 // | |
| 54 // It runs two passes, first to count the size of the generated | |
| 55 // code, and second to generate the code. | |
| 56 // | |
| 57 // We should time it against the alternative, which is | |
| 58 // to just generate the code and grow the array as we go. | |
| 59 func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) { | |
| 60 var ( | |
| 61 curNode *regexNode | |
| 62 curChild int | |
| 63 capsize int | |
| 64 ) | |
| 65 // construct sparse capnum mapping if some numbers are unused | |
| 66 | |
| 67 if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) { | |
| 68 capsize = tree.captop | |
| 69 w.caps = nil | |
| 70 } else { | |
| 71 capsize = len(tree.capnumlist) | |
| 72 w.caps = tree.caps | |
| 73 for i := 0; i < len(tree.capnumlist); i++ { | |
| 74 w.caps[tree.capnumlist[i]] = i | |
| 75 } | |
| 76 } | |
| 77 | |
| 78 w.counting = true | |
| 79 | |
| 80 for { | |
| 81 if !w.counting { | |
| 82 w.emitted = make([]int, w.count) | |
| 83 } | |
| 84 | |
| 85 curNode = tree.root | |
| 86 curChild = 0 | |
| 87 | |
| 88 w.emit1(Lazybranch, 0) | |
| 89 | |
| 90 for { | |
| 91 if len(curNode.children) == 0 { | |
| 92 w.emitFragment(curNode.t, curNode, 0) | |
| 93 } else if curChild < len(curNode.children) { | |
| 94 w.emitFragment(curNode.t|beforeChild, curNode, curChild) | |
| 95 | |
| 96 curNode = curNode.children[curChild] | |
| 97 | |
| 98 w.pushInt(curChild) | |
| 99 curChild = 0 | |
| 100 continue | |
| 101 } | |
| 102 | |
| 103 if w.emptyStack() { | |
| 104 break | |
| 105 } | |
| 106 | |
| 107 curChild = w.popInt() | |
| 108 curNode = curNode.next | |
| 109 | |
| 110 w.emitFragment(curNode.t|afterChild, curNode, curChild) | |
| 111 curChild++ | |
| 112 } | |
| 113 | |
| 114 w.patchJump(0, w.curPos()) | |
| 115 w.emit(Stop) | |
| 116 | |
| 117 if !w.counting { | |
| 118 break | |
| 119 } | |
| 120 | |
| 121 w.counting = false | |
| 122 } | |
| 123 | |
| 124 fcPrefix := getFirstCharsPrefix(tree) | |
| 125 prefix := getPrefix(tree) | |
| 126 rtl := (tree.options & RightToLeft) != 0 | |
| 127 | |
| 128 var bmPrefix *BmPrefix | |
| 129 //TODO: benchmark string prefixes | |
| 130 if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 { | |
| 131 if len(prefix.PrefixStr) > MaxPrefixSize { | |
| 132 // limit prefix changes to 10k | |
| 133 prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize] | |
| 134 } | |
| 135 bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl) | |
| 136 } else { | |
| 137 bmPrefix = nil | |
| 138 } | |
| 139 | |
| 140 return &Code{ | |
| 141 Codes: w.emitted, | |
| 142 Strings: w.stringtable, | |
| 143 Sets: w.settable, | |
| 144 TrackCount: w.trackcount, | |
| 145 Caps: w.caps, | |
| 146 Capsize: capsize, | |
| 147 FcPrefix: fcPrefix, | |
| 148 BmPrefix: bmPrefix, | |
| 149 Anchors: getAnchors(tree), | |
| 150 RightToLeft: rtl, | |
| 151 }, nil | |
| 152 } | |
| 153 | |
| 154 // The main RegexCode generator. It does a depth-first walk | |
| 155 // through the tree and calls EmitFragment to emits code before | |
| 156 // and after each child of an interior node, and at each leaf. | |
| 157 func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error { | |
| 158 bits := InstOp(0) | |
| 159 | |
| 160 if nodetype <= ntRef { | |
| 161 if (node.options & RightToLeft) != 0 { | |
| 162 bits |= Rtl | |
| 163 } | |
| 164 if (node.options & IgnoreCase) != 0 { | |
| 165 bits |= Ci | |
| 166 } | |
| 167 } | |
| 168 ntBits := nodeType(bits) | |
| 169 | |
| 170 switch nodetype { | |
| 171 case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty: | |
| 172 break | |
| 173 | |
| 174 case ntAlternate | beforeChild: | |
| 175 if curIndex < len(node.children)-1 { | |
| 176 w.pushInt(w.curPos()) | |
| 177 w.emit1(Lazybranch, 0) | |
| 178 } | |
| 179 | |
| 180 case ntAlternate | afterChild: | |
| 181 if curIndex < len(node.children)-1 { | |
| 182 lbPos := w.popInt() | |
| 183 w.pushInt(w.curPos()) | |
| 184 w.emit1(Goto, 0) | |
| 185 w.patchJump(lbPos, w.curPos()) | |
| 186 } else { | |
| 187 for i := 0; i < curIndex; i++ { | |
| 188 w.patchJump(w.popInt(), w.curPos()) | |
| 189 } | |
| 190 } | |
| 191 break | |
| 192 | |
| 193 case ntTestref | beforeChild: | |
| 194 if curIndex == 0 { | |
| 195 w.emit(Setjump) | |
| 196 w.pushInt(w.curPos()) | |
| 197 w.emit1(Lazybranch, 0) | |
| 198 w.emit1(Testref, w.mapCapnum(node.m)) | |
| 199 w.emit(Forejump) | |
| 200 } | |
| 201 | |
| 202 case ntTestref | afterChild: | |
| 203 if curIndex == 0 { | |
| 204 branchpos := w.popInt() | |
| 205 w.pushInt(w.curPos()) | |
| 206 w.emit1(Goto, 0) | |
| 207 w.patchJump(branchpos, w.curPos()) | |
| 208 w.emit(Forejump) | |
| 209 if len(node.children) <= 1 { | |
| 210 w.patchJump(w.popInt(), w.curPos()) | |
| 211 } | |
| 212 } else if curIndex == 1 { | |
| 213 w.patchJump(w.popInt(), w.curPos()) | |
| 214 } | |
| 215 | |
| 216 case ntTestgroup | beforeChild: | |
| 217 if curIndex == 0 { | |
| 218 w.emit(Setjump) | |
| 219 w.emit(Setmark) | |
| 220 w.pushInt(w.curPos()) | |
| 221 w.emit1(Lazybranch, 0) | |
| 222 } | |
| 223 | |
| 224 case ntTestgroup | afterChild: | |
| 225 if curIndex == 0 { | |
| 226 w.emit(Getmark) | |
| 227 w.emit(Forejump) | |
| 228 } else if curIndex == 1 { | |
| 229 Branchpos := w.popInt() | |
| 230 w.pushInt(w.curPos()) | |
| 231 w.emit1(Goto, 0) | |
| 232 w.patchJump(Branchpos, w.curPos()) | |
| 233 w.emit(Getmark) | |
| 234 w.emit(Forejump) | |
| 235 if len(node.children) <= 2 { | |
| 236 w.patchJump(w.popInt(), w.curPos()) | |
| 237 } | |
| 238 } else if curIndex == 2 { | |
| 239 w.patchJump(w.popInt(), w.curPos()) | |
| 240 } | |
| 241 | |
| 242 case ntLoop | beforeChild, ntLazyloop | beforeChild: | |
| 243 | |
| 244 if node.n < math.MaxInt32 || node.m > 1 { | |
| 245 if node.m == 0 { | |
| 246 w.emit1(Nullcount, 0) | |
| 247 } else { | |
| 248 w.emit1(Setcount, 1-node.m) | |
| 249 } | |
| 250 } else if node.m == 0 { | |
| 251 w.emit(Nullmark) | |
| 252 } else { | |
| 253 w.emit(Setmark) | |
| 254 } | |
| 255 | |
| 256 if node.m == 0 { | |
| 257 w.pushInt(w.curPos()) | |
| 258 w.emit1(Goto, 0) | |
| 259 } | |
| 260 w.pushInt(w.curPos()) | |
| 261 | |
| 262 case ntLoop | afterChild, ntLazyloop | afterChild: | |
| 263 | |
| 264 startJumpPos := w.curPos() | |
| 265 lazy := (nodetype - (ntLoop | afterChild)) | |
| 266 | |
| 267 if node.n < math.MaxInt32 || node.m > 1 { | |
| 268 if node.n == math.MaxInt32 { | |
| 269 w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32) | |
| 270 } else { | |
| 271 w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m) | |
| 272 } | |
| 273 } else { | |
| 274 w.emit1(InstOp(Branchmark+lazy), w.popInt()) | |
| 275 } | |
| 276 | |
| 277 if node.m == 0 { | |
| 278 w.patchJump(w.popInt(), startJumpPos) | |
| 279 } | |
| 280 | |
| 281 case ntGroup | beforeChild, ntGroup | afterChild: | |
| 282 | |
| 283 case ntCapture | beforeChild: | |
| 284 w.emit(Setmark) | |
| 285 | |
| 286 case ntCapture | afterChild: | |
| 287 w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n)) | |
| 288 | |
| 289 case ntRequire | beforeChild: | |
| 290 // NOTE: the following line causes lookahead/lookbehind to be | |
| 291 // NON-BACKTRACKING. It can be commented out with (*) | |
| 292 w.emit(Setjump) | |
| 293 | |
| 294 w.emit(Setmark) | |
| 295 | |
| 296 case ntRequire | afterChild: | |
| 297 w.emit(Getmark) | |
| 298 | |
| 299 // NOTE: the following line causes lookahead/lookbehind to be | |
| 300 // NON-BACKTRACKING. It can be commented out with (*) | |
| 301 w.emit(Forejump) | |
| 302 | |
| 303 case ntPrevent | beforeChild: | |
| 304 w.emit(Setjump) | |
| 305 w.pushInt(w.curPos()) | |
| 306 w.emit1(Lazybranch, 0) | |
| 307 | |
| 308 case ntPrevent | afterChild: | |
| 309 w.emit(Backjump) | |
| 310 w.patchJump(w.popInt(), w.curPos()) | |
| 311 w.emit(Forejump) | |
| 312 | |
| 313 case ntGreedy | beforeChild: | |
| 314 w.emit(Setjump) | |
| 315 | |
| 316 case ntGreedy | afterChild: | |
| 317 w.emit(Forejump) | |
| 318 | |
| 319 case ntOne, ntNotone: | |
| 320 w.emit1(InstOp(node.t|ntBits), int(node.ch)) | |
| 321 | |
| 322 case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy: | |
| 323 if node.m > 0 { | |
| 324 if node.t == ntOneloop || node.t == ntOnelazy { | |
| 325 w.emit2(Onerep|bits, int(node.ch), node.m) | |
| 326 } else { | |
| 327 w.emit2(Notonerep|bits, int(node.ch), node.m) | |
| 328 } | |
| 329 } | |
| 330 if node.n > node.m { | |
| 331 if node.n == math.MaxInt32 { | |
| 332 w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32) | |
| 333 } else { | |
| 334 w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m) | |
| 335 } | |
| 336 } | |
| 337 | |
| 338 case ntSetloop, ntSetlazy: | |
| 339 if node.m > 0 { | |
| 340 w.emit2(Setrep|bits, w.setCode(node.set), node.m) | |
| 341 } | |
| 342 if node.n > node.m { | |
| 343 if node.n == math.MaxInt32 { | |
| 344 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32) | |
| 345 } else { | |
| 346 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m) | |
| 347 } | |
| 348 } | |
| 349 | |
| 350 case ntMulti: | |
| 351 w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str)) | |
| 352 | |
| 353 case ntSet: | |
| 354 w.emit1(InstOp(node.t|ntBits), w.setCode(node.set)) | |
| 355 | |
| 356 case ntRef: | |
| 357 w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m)) | |
| 358 | |
| 359 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: | |
| 360 w.emit(InstOp(node.t)) | |
| 361 | |
| 362 default: | |
| 363 return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype) | |
| 364 } | |
| 365 | |
| 366 return nil | |
| 367 } | |
| 368 | |
| 369 // To avoid recursion, we use a simple integer stack. | |
| 370 // This is the push. | |
| 371 func (w *writer) pushInt(i int) { | |
| 372 w.intStack = append(w.intStack, i) | |
| 373 } | |
| 374 | |
| 375 // Returns true if the stack is empty. | |
| 376 func (w *writer) emptyStack() bool { | |
| 377 return len(w.intStack) == 0 | |
| 378 } | |
| 379 | |
| 380 // This is the pop. | |
| 381 func (w *writer) popInt() int { | |
| 382 //get our item | |
| 383 idx := len(w.intStack) - 1 | |
| 384 i := w.intStack[idx] | |
| 385 //trim our slice | |
| 386 w.intStack = w.intStack[:idx] | |
| 387 return i | |
| 388 } | |
| 389 | |
| 390 // Returns the current position in the emitted code. | |
| 391 func (w *writer) curPos() int { | |
| 392 return w.curpos | |
| 393 } | |
| 394 | |
| 395 // Fixes up a jump instruction at the specified offset | |
| 396 // so that it jumps to the specified jumpDest. | |
| 397 func (w *writer) patchJump(offset, jumpDest int) { | |
| 398 w.emitted[offset+1] = jumpDest | |
| 399 } | |
| 400 | |
| 401 // Returns an index in the set table for a charset | |
| 402 // uses a map to eliminate duplicates. | |
| 403 func (w *writer) setCode(set *CharSet) int { | |
| 404 if w.counting { | |
| 405 return 0 | |
| 406 } | |
| 407 | |
| 408 buf := &bytes.Buffer{} | |
| 409 | |
| 410 set.mapHashFill(buf) | |
| 411 hash := buf.String() | |
| 412 i, ok := w.sethash[hash] | |
| 413 if !ok { | |
| 414 i = len(w.sethash) | |
| 415 w.sethash[hash] = i | |
| 416 w.settable = append(w.settable, set) | |
| 417 } | |
| 418 return i | |
| 419 } | |
| 420 | |
| 421 // Returns an index in the string table for a string. | |
| 422 // uses a map to eliminate duplicates. | |
| 423 func (w *writer) stringCode(str []rune) int { | |
| 424 if w.counting { | |
| 425 return 0 | |
| 426 } | |
| 427 | |
| 428 hash := string(str) | |
| 429 i, ok := w.stringhash[hash] | |
| 430 if !ok { | |
| 431 i = len(w.stringhash) | |
| 432 w.stringhash[hash] = i | |
| 433 w.stringtable = append(w.stringtable, str) | |
| 434 } | |
| 435 | |
| 436 return i | |
| 437 } | |
| 438 | |
| 439 // When generating code on a regex that uses a sparse set | |
| 440 // of capture slots, we hash them to a dense set of indices | |
| 441 // for an array of capture slots. Instead of doing the hash | |
| 442 // at match time, it's done at compile time, here. | |
| 443 func (w *writer) mapCapnum(capnum int) int { | |
| 444 if capnum == -1 { | |
| 445 return -1 | |
| 446 } | |
| 447 | |
| 448 if w.caps != nil { | |
| 449 return w.caps[capnum] | |
| 450 } | |
| 451 | |
| 452 return capnum | |
| 453 } | |
| 454 | |
| 455 // Emits a zero-argument operation. Note that the emit | |
| 456 // functions all run in two modes: they can emit code, or | |
| 457 // they can just count the size of the code. | |
| 458 func (w *writer) emit(op InstOp) { | |
| 459 if w.counting { | |
| 460 w.count++ | |
| 461 if opcodeBacktracks(op) { | |
| 462 w.trackcount++ | |
| 463 } | |
| 464 return | |
| 465 } | |
| 466 w.emitted[w.curpos] = int(op) | |
| 467 w.curpos++ | |
| 468 } | |
| 469 | |
| 470 // Emits a one-argument operation. | |
| 471 func (w *writer) emit1(op InstOp, opd1 int) { | |
| 472 if w.counting { | |
| 473 w.count += 2 | |
| 474 if opcodeBacktracks(op) { | |
| 475 w.trackcount++ | |
| 476 } | |
| 477 return | |
| 478 } | |
| 479 w.emitted[w.curpos] = int(op) | |
| 480 w.curpos++ | |
| 481 w.emitted[w.curpos] = opd1 | |
| 482 w.curpos++ | |
| 483 } | |
| 484 | |
| 485 // Emits a two-argument operation. | |
| 486 func (w *writer) emit2(op InstOp, opd1, opd2 int) { | |
| 487 if w.counting { | |
| 488 w.count += 3 | |
| 489 if opcodeBacktracks(op) { | |
| 490 w.trackcount++ | |
| 491 } | |
| 492 return | |
| 493 } | |
| 494 w.emitted[w.curpos] = int(op) | |
| 495 w.curpos++ | |
| 496 w.emitted[w.curpos] = opd1 | |
| 497 w.curpos++ | |
| 498 w.emitted[w.curpos] = opd2 | |
| 499 w.curpos++ | |
| 500 } |
