Mercurial > yakumo_izuru > aya
comparison vendor/github.com/russross/blackfriday/v2/inline.go @ 66:787b5ee0289d draft
Use vendored modules
Signed-off-by: Izuru Yakumo <yakumo.izuru@chaotic.ninja>
author | yakumo.izuru |
---|---|
date | Sun, 23 Jul 2023 13:18:53 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
65:6d985efa0f7a | 66:787b5ee0289d |
---|---|
1 // | |
2 // Blackfriday Markdown Processor | |
3 // Available at http://github.com/russross/blackfriday | |
4 // | |
5 // Copyright © 2011 Russ Ross <russ@russross.com>. | |
6 // Distributed under the Simplified BSD License. | |
7 // See README.md for details. | |
8 // | |
9 | |
10 // | |
11 // Functions to parse inline elements. | |
12 // | |
13 | |
14 package blackfriday | |
15 | |
16 import ( | |
17 "bytes" | |
18 "regexp" | |
19 "strconv" | |
20 ) | |
21 | |
var (
	// urlRe is a pattern fragment (not compiled on its own) that is spliced
	// twice into anchorRe below: once for the href value and once for the
	// anchor text.
	urlRe    = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
	anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)

	// https://www.w3.org/TR/html5/syntax.html#character-references
	// highest unicode code point in 17 planes (2^20): 1,114,112d =
	// 7 dec digits or 6 hex digits
	// named entity references can be 2-31 characters with stuff like &lt;
	// at one end and &CounterClockwiseContourIntegral; at the other. There
	// are also sometimes numbers at the end, although this isn't inherent
	// in the specification; there are never numbers anywhere else in
	// current character references, though; see &frac34; and &blk12;, etc.
	// https://www.w3.org/TR/html5/syntax.html#named-character-references
	//
	// entity := "&" (named group | number ref) ";"
	// named group := [a-zA-Z]{2,31}[0-9]{0,2}
	// number ref := "#" (dec ref | hex ref)
	// dec ref := [0-9]{1,7}
	// hex ref := ("x" | "X") [0-9a-fA-F]{1,6}
	htmlEntityRe = regexp.MustCompile(`&([a-zA-Z]{2,31}[0-9]{0,2}|#([0-9]{1,7}|[xX][0-9a-fA-F]{1,6}));`)
)
43 | |
44 // Functions to parse text within a block | |
45 // Each function returns the number of chars taken care of | |
46 // data is the complete block being rendered | |
47 // offset is the number of valid chars before the current cursor | |
48 | |
49 func (p *Markdown) inline(currBlock *Node, data []byte) { | |
50 // handlers might call us recursively: enforce a maximum depth | |
51 if p.nesting >= p.maxNesting || len(data) == 0 { | |
52 return | |
53 } | |
54 p.nesting++ | |
55 beg, end := 0, 0 | |
56 for end < len(data) { | |
57 handler := p.inlineCallback[data[end]] | |
58 if handler != nil { | |
59 if consumed, node := handler(p, data, end); consumed == 0 { | |
60 // No action from the callback. | |
61 end++ | |
62 } else { | |
63 // Copy inactive chars into the output. | |
64 currBlock.AppendChild(text(data[beg:end])) | |
65 if node != nil { | |
66 currBlock.AppendChild(node) | |
67 } | |
68 // Skip past whatever the callback used. | |
69 beg = end + consumed | |
70 end = beg | |
71 } | |
72 } else { | |
73 end++ | |
74 } | |
75 } | |
76 if beg < len(data) { | |
77 if data[end-1] == '\n' { | |
78 end-- | |
79 } | |
80 currBlock.AppendChild(text(data[beg:end])) | |
81 } | |
82 p.nesting-- | |
83 } | |
84 | |
85 // single and double emphasis parsing | |
86 func emphasis(p *Markdown, data []byte, offset int) (int, *Node) { | |
87 data = data[offset:] | |
88 c := data[0] | |
89 | |
90 if len(data) > 2 && data[1] != c { | |
91 // whitespace cannot follow an opening emphasis; | |
92 // strikethrough only takes two characters '~~' | |
93 if c == '~' || isspace(data[1]) { | |
94 return 0, nil | |
95 } | |
96 ret, node := helperEmphasis(p, data[1:], c) | |
97 if ret == 0 { | |
98 return 0, nil | |
99 } | |
100 | |
101 return ret + 1, node | |
102 } | |
103 | |
104 if len(data) > 3 && data[1] == c && data[2] != c { | |
105 if isspace(data[2]) { | |
106 return 0, nil | |
107 } | |
108 ret, node := helperDoubleEmphasis(p, data[2:], c) | |
109 if ret == 0 { | |
110 return 0, nil | |
111 } | |
112 | |
113 return ret + 2, node | |
114 } | |
115 | |
116 if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c { | |
117 if c == '~' || isspace(data[3]) { | |
118 return 0, nil | |
119 } | |
120 ret, node := helperTripleEmphasis(p, data, 3, c) | |
121 if ret == 0 { | |
122 return 0, nil | |
123 } | |
124 | |
125 return ret + 3, node | |
126 } | |
127 | |
128 return 0, nil | |
129 } | |
130 | |
131 func codeSpan(p *Markdown, data []byte, offset int) (int, *Node) { | |
132 data = data[offset:] | |
133 | |
134 nb := 0 | |
135 | |
136 // count the number of backticks in the delimiter | |
137 for nb < len(data) && data[nb] == '`' { | |
138 nb++ | |
139 } | |
140 | |
141 // find the next delimiter | |
142 i, end := 0, 0 | |
143 for end = nb; end < len(data) && i < nb; end++ { | |
144 if data[end] == '`' { | |
145 i++ | |
146 } else { | |
147 i = 0 | |
148 } | |
149 } | |
150 | |
151 // no matching delimiter? | |
152 if i < nb && end >= len(data) { | |
153 return 0, nil | |
154 } | |
155 | |
156 // trim outside whitespace | |
157 fBegin := nb | |
158 for fBegin < end && data[fBegin] == ' ' { | |
159 fBegin++ | |
160 } | |
161 | |
162 fEnd := end - nb | |
163 for fEnd > fBegin && data[fEnd-1] == ' ' { | |
164 fEnd-- | |
165 } | |
166 | |
167 // render the code span | |
168 if fBegin != fEnd { | |
169 code := NewNode(Code) | |
170 code.Literal = data[fBegin:fEnd] | |
171 return end, code | |
172 } | |
173 | |
174 return end, nil | |
175 } | |
176 | |
177 // newline preceded by two spaces becomes <br> | |
178 func maybeLineBreak(p *Markdown, data []byte, offset int) (int, *Node) { | |
179 origOffset := offset | |
180 for offset < len(data) && data[offset] == ' ' { | |
181 offset++ | |
182 } | |
183 | |
184 if offset < len(data) && data[offset] == '\n' { | |
185 if offset-origOffset >= 2 { | |
186 return offset - origOffset + 1, NewNode(Hardbreak) | |
187 } | |
188 return offset - origOffset, nil | |
189 } | |
190 return 0, nil | |
191 } | |
192 | |
193 // newline without two spaces works when HardLineBreak is enabled | |
194 func lineBreak(p *Markdown, data []byte, offset int) (int, *Node) { | |
195 if p.extensions&HardLineBreak != 0 { | |
196 return 1, NewNode(Hardbreak) | |
197 } | |
198 return 0, nil | |
199 } | |
200 | |
// linkType distinguishes the bracketed constructs that link() can parse.
type linkType int

const (
	linkNormal           linkType = iota // [text](url) or [text][ref]
	linkImg                              // ![alt](url)
	linkDeferredFootnote                 // [^refId]
	linkInlineFootnote                   // ^[text]
)

// isReferenceStyleLink reports whether data[pos] opens a "[id]" reference
// part, i.e. a '[' that is followed by at least one more byte which is not
// '^'. Deferred footnotes never take a reference part.
func isReferenceStyleLink(data []byte, pos int, t linkType) bool {
	return t != linkDeferredFootnote &&
		pos+1 < len(data) &&
		data[pos] == '[' &&
		data[pos+1] != '^'
}
216 | |
217 func maybeImage(p *Markdown, data []byte, offset int) (int, *Node) { | |
218 if offset < len(data)-1 && data[offset+1] == '[' { | |
219 return link(p, data, offset) | |
220 } | |
221 return 0, nil | |
222 } | |
223 | |
224 func maybeInlineFootnote(p *Markdown, data []byte, offset int) (int, *Node) { | |
225 if offset < len(data)-1 && data[offset+1] == '[' { | |
226 return link(p, data, offset) | |
227 } | |
228 return 0, nil | |
229 } | |
230 | |
// '[': parse a link or an image or a footnote
//
// data[offset] is either the opening '[' itself or a '!' / '^' prefix byte
// directly in front of it (see maybeImage and maybeInlineFootnote). The
// function returns the number of bytes consumed, counted from offset, and
// the resulting Link or Image node. It returns (0, nil) when the input does
// not form a usable link: no closing bracket, unterminated destination,
// unknown reference, or an empty destination.
func link(p *Markdown, data []byte, offset int) (int, *Node) {
	// no links allowed inside regular links, footnote, and deferred footnotes
	if p.insideLink && (offset > 0 && data[offset-1] == '[' || len(data)-1 > offset && data[offset+1] == '^') {
		return 0, nil
	}

	// Classify the construct. For images and inline footnotes offset is
	// advanced past the prefix byte so that it points at '['.
	var t linkType
	switch {
	// special case: ![^text] == deferred footnote (that follows something with
	// an exclamation point)
	case p.extensions&Footnotes != 0 && len(data)-1 > offset && data[offset+1] == '^':
		t = linkDeferredFootnote
	// ![alt] == image
	case offset >= 0 && data[offset] == '!':
		t = linkImg
		offset++
	// ^[text] == inline footnote
	// [^refId] == deferred footnote
	case p.extensions&Footnotes != 0:
		if offset >= 0 && data[offset] == '^' {
			t = linkInlineFootnote
			offset++
		} else if len(data)-1 > offset && data[offset+1] == '^' {
			t = linkDeferredFootnote
		}
	// [text] == regular link
	default:
		t = linkNormal
	}

	// From here on all indexes are relative to the opening '['.
	data = data[offset:]

	var (
		i                       = 1
		noteID                  int
		title, link, altContent []byte
		textHasNl               = false
	)

	// skip the '^' of a deferred footnote
	if t == linkDeferredFootnote {
		i++
	}

	// look for the matching closing bracket
	for level := 1; level > 0 && i < len(data); i++ {
		switch {
		case data[i] == '\n':
			textHasNl = true

		case isBackslashEscaped(data, i):
			continue

		case data[i] == '[':
			level++

		case data[i] == ']':
			level--
			if level <= 0 {
				i-- // compensate for extra i++ in for loop
			}
		}
	}

	if i >= len(data) {
		return 0, nil
	}

	// txtE is the index of the closing ']'; the link text is data[1:txtE]
	txtE := i
	i++
	var footnoteNode *Node

	// skip any amount of whitespace or newline
	// (this is much more lax than original markdown syntax)
	for i < len(data) && isspace(data[i]) {
		i++
	}

	// inline style link
	switch {
	case i < len(data) && data[i] == '(':
		// skip initial whitespace
		i++

		for i < len(data) && isspace(data[i]) {
			i++
		}

		linkB := i

		// look for link end: ' " )
	findlinkend:
		for i < len(data) {
			switch {
			case data[i] == '\\':
				// skip the backslash and the byte it escapes
				i += 2

			case data[i] == ')' || data[i] == '\'' || data[i] == '"':
				break findlinkend

			default:
				i++
			}
		}

		if i >= len(data) {
			return 0, nil
		}
		linkE := i

		// look for title end if present
		titleB, titleE := 0, 0
		if data[i] == '\'' || data[i] == '"' {
			i++
			titleB = i

		findtitleend:
			for i < len(data) {
				switch {
				case data[i] == '\\':
					i += 2

				case data[i] == ')':
					break findtitleend

				default:
					i++
				}
			}

			if i >= len(data) {
				return 0, nil
			}

			// skip whitespace after title
			titleE = i - 1
			for titleE > titleB && isspace(data[titleE]) {
				titleE--
			}

			// check for closing quote presence
			if data[titleE] != '\'' && data[titleE] != '"' {
				// no closing quote: there is no title, and everything up
				// to the ')' belongs to the link
				titleB, titleE = 0, 0
				linkE = i
			}
		}

		// remove whitespace at the end of the link
		for linkE > linkB && isspace(data[linkE-1]) {
			linkE--
		}

		// remove optional angle brackets around the link
		if data[linkB] == '<' {
			linkB++
		}
		if data[linkE-1] == '>' {
			linkE--
		}

		// build escaped link and title
		if linkE > linkB {
			link = data[linkB:linkE]
		}

		if titleE > titleB {
			title = data[titleB:titleE]
		}

		// step past the closing ')'
		i++

	// reference style link
	case isReferenceStyleLink(data, i, t):
		var id []byte
		altContentConsidered := false

		// look for the id
		i++
		linkB := i
		for i < len(data) && data[i] != ']' {
			i++
		}
		if i >= len(data) {
			return 0, nil
		}
		linkE := i

		// find the reference
		if linkB == linkE {
			// empty "[]": the link text itself is the reference id
			if textHasNl {
				var b bytes.Buffer

				// copy the text, replacing each newline with a single
				// space unless a space already precedes it
				for j := 1; j < txtE; j++ {
					switch {
					case data[j] != '\n':
						b.WriteByte(data[j])
					case data[j-1] != ' ':
						b.WriteByte(' ')
					}
				}

				id = b.Bytes()
			} else {
				id = data[1:txtE]
				altContentConsidered = true
			}
		} else {
			id = data[linkB:linkE]
		}

		// find the reference with matching id
		lr, ok := p.getRef(string(id))
		if !ok {
			return 0, nil
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title
		if altContentConsidered {
			altContent = lr.text
		}
		i++

	// shortcut reference style link or reference or inline footnote
	default:
		var id []byte

		// craft the id
		if textHasNl {
			var b bytes.Buffer

			// same newline folding as in the reference style case above
			for j := 1; j < txtE; j++ {
				switch {
				case data[j] != '\n':
					b.WriteByte(data[j])
				case data[j-1] != ' ':
					b.WriteByte(' ')
				}
			}

			id = b.Bytes()
		} else {
			if t == linkDeferredFootnote {
				id = data[2:txtE] // get rid of the ^
			} else {
				id = data[1:txtE]
			}
		}

		footnoteNode = NewNode(Item)
		if t == linkInlineFootnote {
			// create a new reference
			noteID = len(p.notes) + 1

			// the fragment is the slugified id, cut off at 16 bytes, or
			// "footnote-<n>" when the id is empty
			var fragment []byte
			if len(id) > 0 {
				if len(id) < 16 {
					fragment = make([]byte, len(id))
				} else {
					fragment = make([]byte, 16)
				}
				copy(fragment, slugify(id))
			} else {
				fragment = append([]byte("footnote-"), []byte(strconv.Itoa(noteID))...)
			}

			ref := &reference{
				noteID:   noteID,
				hasBlock: false,
				link:     fragment,
				title:    id,
				footnote: footnoteNode,
			}

			p.notes = append(p.notes, ref)

			link = ref.link
			title = ref.title
		} else {
			// find the reference with matching id
			lr, ok := p.getRef(string(id))
			if !ok {
				return 0, nil
			}

			if t == linkDeferredFootnote {
				lr.noteID = len(p.notes) + 1
				lr.footnote = footnoteNode
				p.notes = append(p.notes, lr)
			}

			// keep link and title from reference
			link = lr.link
			// if inline footnote, title == footnote contents
			title = lr.title
			noteID = lr.noteID
		}

		// rewind the whitespace
		i = txtE + 1
	}

	var uLink []byte
	if t == linkNormal || t == linkImg {
		if len(link) > 0 {
			var uLinkBuf bytes.Buffer
			unescapeText(&uLinkBuf, link)
			uLink = uLinkBuf.Bytes()
		}

		// links need something to click on and somewhere to go
		if len(uLink) == 0 || (t == linkNormal && txtE <= 1) {
			return 0, nil
		}
	}

	// call the relevant rendering function
	var linkNode *Node
	switch t {
	case linkNormal:
		linkNode = NewNode(Link)
		linkNode.Destination = normalizeURI(uLink)
		linkNode.Title = title
		if len(altContent) > 0 {
			linkNode.AppendChild(text(altContent))
		} else {
			// links cannot contain other links, so turn off link parsing
			// temporarily and recurse
			insideLink := p.insideLink
			p.insideLink = true
			p.inline(linkNode, data[1:txtE])
			p.insideLink = insideLink
		}

	case linkImg:
		linkNode = NewNode(Image)
		linkNode.Destination = uLink
		linkNode.Title = title
		linkNode.AppendChild(text(data[1:txtE]))
		// i is relative to the '[', so also count the '!' skipped earlier
		i++

	case linkInlineFootnote, linkDeferredFootnote:
		linkNode = NewNode(Link)
		linkNode.Destination = link
		linkNode.Title = title
		linkNode.NoteID = noteID
		linkNode.Footnote = footnoteNode
		if t == linkInlineFootnote {
			// also count the '^' skipped earlier
			i++
		}

	default:
		return 0, nil
	}

	return i, linkNode
}
589 | |
590 func (p *Markdown) inlineHTMLComment(data []byte) int { | |
591 if len(data) < 5 { | |
592 return 0 | |
593 } | |
594 if data[0] != '<' || data[1] != '!' || data[2] != '-' || data[3] != '-' { | |
595 return 0 | |
596 } | |
597 i := 5 | |
598 // scan for an end-of-comment marker, across lines if necessary | |
599 for i < len(data) && !(data[i-2] == '-' && data[i-1] == '-' && data[i] == '>') { | |
600 i++ | |
601 } | |
602 // no end-of-comment marker | |
603 if i >= len(data) { | |
604 return 0 | |
605 } | |
606 return i + 1 | |
607 } | |
608 | |
// stripMailto returns link without a leading "mailto://" or "mailto:"
// scheme. Links that carry neither prefix are returned unchanged. The result
// shares its backing array with link.
func stripMailto(link []byte) []byte {
	switch {
	case bytes.HasPrefix(link, []byte("mailto://")):
		return link[len("mailto://"):]
	case bytes.HasPrefix(link, []byte("mailto:")):
		return link[len("mailto:"):]
	}
	return link
}
618 | |
619 // autolinkType specifies a kind of autolink that gets detected. | |
620 type autolinkType int | |
621 | |
622 // These are the possible flag values for the autolink renderer. | |
623 const ( | |
624 notAutolink autolinkType = iota | |
625 normalAutolink | |
626 emailAutolink | |
627 ) | |
628 | |
629 // '<' when tags or autolinks are allowed | |
630 func leftAngle(p *Markdown, data []byte, offset int) (int, *Node) { | |
631 data = data[offset:] | |
632 altype, end := tagLength(data) | |
633 if size := p.inlineHTMLComment(data); size > 0 { | |
634 end = size | |
635 } | |
636 if end > 2 { | |
637 if altype != notAutolink { | |
638 var uLink bytes.Buffer | |
639 unescapeText(&uLink, data[1:end+1-2]) | |
640 if uLink.Len() > 0 { | |
641 link := uLink.Bytes() | |
642 node := NewNode(Link) | |
643 node.Destination = link | |
644 if altype == emailAutolink { | |
645 node.Destination = append([]byte("mailto:"), link...) | |
646 } | |
647 node.AppendChild(text(stripMailto(link))) | |
648 return end, node | |
649 } | |
650 } else { | |
651 htmlTag := NewNode(HTMLSpan) | |
652 htmlTag.Literal = data[:end] | |
653 return end, htmlTag | |
654 } | |
655 } | |
656 | |
657 return end, nil | |
658 } | |
659 | |
660 // '\\' backslash escape | |
661 var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>~") | |
662 | |
663 func escape(p *Markdown, data []byte, offset int) (int, *Node) { | |
664 data = data[offset:] | |
665 | |
666 if len(data) > 1 { | |
667 if p.extensions&BackslashLineBreak != 0 && data[1] == '\n' { | |
668 return 2, NewNode(Hardbreak) | |
669 } | |
670 if bytes.IndexByte(escapeChars, data[1]) < 0 { | |
671 return 0, nil | |
672 } | |
673 | |
674 return 2, text(data[1:2]) | |
675 } | |
676 | |
677 return 2, nil | |
678 } | |
679 | |
// unescapeText appends src to ob with backslash escapes resolved: every
// backslash is dropped and the byte after it is copied verbatim (so "\\\\"
// becomes a single backslash). A backslash that is the last byte of src is
// dropped.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for len(src) > 0 {
		bs := bytes.IndexByte(src, '\\')
		if bs < 0 {
			// no more escapes: the remainder is plain
			ob.Write(src)
			return
		}
		if bs > 0 {
			ob.Write(src[:bs])
		}
		if bs+1 >= len(src) {
			// dangling backslash at the very end
			return
		}
		ob.WriteByte(src[bs+1])
		src = src[bs+2:]
	}
}
700 | |
701 // '&' escaped when it doesn't belong to an entity | |
702 // valid entities are assumed to be anything matching &#?[A-Za-z0-9]+; | |
703 func entity(p *Markdown, data []byte, offset int) (int, *Node) { | |
704 data = data[offset:] | |
705 | |
706 end := 1 | |
707 | |
708 if end < len(data) && data[end] == '#' { | |
709 end++ | |
710 } | |
711 | |
712 for end < len(data) && isalnum(data[end]) { | |
713 end++ | |
714 } | |
715 | |
716 if end < len(data) && data[end] == ';' { | |
717 end++ // real entity | |
718 } else { | |
719 return 0, nil // lone '&' | |
720 } | |
721 | |
722 ent := data[:end] | |
723 // undo & escaping or it will be converted to &amp; by another | |
724 // escaper in the renderer | |
725 if bytes.Equal(ent, []byte("&")) { | |
726 ent = []byte{'&'} | |
727 } | |
728 | |
729 return end, text(ent) | |
730 } | |
731 | |
732 func linkEndsWithEntity(data []byte, linkEnd int) bool { | |
733 entityRanges := htmlEntityRe.FindAllIndex(data[:linkEnd], -1) | |
734 return entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd | |
735 } | |
736 | |
// hasPrefixCaseInsensitive is a custom implementation of
//     strings.HasPrefix(strings.ToLower(s), prefix)
// we rolled our own because ToLower pulls in a huge machinery of lowercasing
// anything from Unicode and that's very slow. Since this func will only be
// used on ASCII protocol prefixes, we can take shortcuts.
//
// A byte of s matches the corresponding byte of prefix if the two are equal,
// or if the byte of s is an ASCII upper-case letter whose lower-case form
// equals the prefix byte. Only 'A'-'Z' are folded: adding the case offset to
// arbitrary bytes would make control characters compare equal to
// punctuation (0x1A to ':', 0x0F to '/').
func hasPrefixCaseInsensitive(s, prefix []byte) bool {
	if len(s) < len(prefix) {
		return false
	}
	const delta = 'a' - 'A'
	for i, want := range prefix {
		got := s[i]
		if got == want {
			continue
		}
		if 'A' <= got && got <= 'Z' && got+delta == want {
			continue
		}
		return false
	}
	return true
}
754 | |
755 var protocolPrefixes = [][]byte{ | |
756 []byte("http://"), | |
757 []byte("https://"), | |
758 []byte("ftp://"), | |
759 []byte("file://"), | |
760 []byte("mailto:"), | |
761 } | |
762 | |
763 const shortestPrefix = 6 // len("ftp://"), the shortest of the above | |
764 | |
765 func maybeAutoLink(p *Markdown, data []byte, offset int) (int, *Node) { | |
766 // quick check to rule out most false hits | |
767 if p.insideLink || len(data) < offset+shortestPrefix { | |
768 return 0, nil | |
769 } | |
770 for _, prefix := range protocolPrefixes { | |
771 endOfHead := offset + 8 // 8 is the len() of the longest prefix | |
772 if endOfHead > len(data) { | |
773 endOfHead = len(data) | |
774 } | |
775 if hasPrefixCaseInsensitive(data[offset:endOfHead], prefix) { | |
776 return autoLink(p, data, offset) | |
777 } | |
778 } | |
779 return 0, nil | |
780 } | |
781 | |
// autoLink tries to turn the bare URL at data[offset] into a Link node.
//
// It returns the number of bytes consumed and the node to insert. If the URL
// is the text of an already written-out <a href="...">...</a> element, the
// rest of that element is returned as an HTMLSpan instead. (0, nil) means
// no link was recognised.
func autoLink(p *Markdown, data []byte, offset int) (int, *Node) {
	// Now a more expensive check to see if we're not inside an anchor element
	// (walk back to the nearest '<', or to the start of data)
	anchorStart := offset
	offsetFromAnchor := 0
	for anchorStart > 0 && data[anchorStart] != '<' {
		anchorStart--
		offsetFromAnchor++
	}

	anchorStr := anchorRe.Find(data[anchorStart:])
	if anchorStr != nil {
		// emit the part of the anchor element from offset onwards verbatim
		anchorClose := NewNode(HTMLSpan)
		anchorClose.Literal = anchorStr[offsetFromAnchor:]
		return len(anchorStr) - offsetFromAnchor, anchorClose
	}

	// scan backward for a word boundary
	rewind := 0
	for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {
		rewind++
	}
	if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
		return 0, nil
	}

	// from here on data starts at the beginning of the word; origData keeps
	// the full input for the backward delimiter scan below
	origData := data
	data = data[offset-rewind:]

	if !isSafeLink(data) {
		return 0, nil
	}

	linkEnd := 0
	for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) {
		linkEnd++
	}

	// Skip punctuation at the end of the link
	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' {
		linkEnd--
	}

	// But don't skip semicolon if it's a part of escaped entity:
	if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) {
		linkEnd--
	}

	// See if the link finishes with a punctuation sign that can be closed.
	var copen byte
	switch data[linkEnd-1] {
	case '"':
		copen = '"'
	case '\'':
		copen = '\''
	case ')':
		copen = '('
	case ']':
		copen = '['
	case '}':
		copen = '{'
	default:
		copen = 0
	}

	if copen != 0 {
		// index in origData of the byte just before the closing delimiter
		bufEnd := offset - rewind + linkEnd - 2

		openDelim := 1

		/* Try to close the final punctuation sign in this same line;
		 * if we managed to close it outside of the URL, that means that it's
		 * not part of the URL. If it closes inside the URL, that means it
		 * is part of the URL.
		 *
		 * Examples:
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric))
		 *
		 *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => foo http://www.pokemon.com/Pikachu_(Electric)
		 */

		// walk backwards through the current line, counting closers up and
		// openers down, until the delimiters balance
		for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
			if origData[bufEnd] == data[linkEnd-1] {
				openDelim++
			}

			if origData[bufEnd] == copen {
				openDelim--
			}

			bufEnd--
		}

		if openDelim == 0 {
			linkEnd--
		}
	}

	var uLink bytes.Buffer
	unescapeText(&uLink, data[:linkEnd])

	if uLink.Len() > 0 {
		node := NewNode(Link)
		node.Destination = uLink.Bytes()
		node.AppendChild(text(uLink.Bytes()))
		return linkEnd, node
	}

	return linkEnd, nil
}
900 | |
901 func isEndOfLink(char byte) bool { | |
902 return isspace(char) || char == '<' | |
903 } | |
904 | |
905 var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")} | |
906 var validPaths = [][]byte{[]byte("/"), []byte("./"), []byte("../")} | |
907 | |
908 func isSafeLink(link []byte) bool { | |
909 for _, path := range validPaths { | |
910 if len(link) >= len(path) && bytes.Equal(link[:len(path)], path) { | |
911 if len(link) == len(path) { | |
912 return true | |
913 } else if isalnum(link[len(path)]) { | |
914 return true | |
915 } | |
916 } | |
917 } | |
918 | |
919 for _, prefix := range validUris { | |
920 // TODO: handle unicode here | |
921 // case-insensitive prefix test | |
922 if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) { | |
923 return true | |
924 } | |
925 } | |
926 | |
927 return false | |
928 } | |
929 | |
930 // return the length of the given tag, or 0 is it's not valid | |
931 func tagLength(data []byte) (autolink autolinkType, end int) { | |
932 var i, j int | |
933 | |
934 // a valid tag can't be shorter than 3 chars | |
935 if len(data) < 3 { | |
936 return notAutolink, 0 | |
937 } | |
938 | |
939 // begins with a '<' optionally followed by '/', followed by letter or number | |
940 if data[0] != '<' { | |
941 return notAutolink, 0 | |
942 } | |
943 if data[1] == '/' { | |
944 i = 2 | |
945 } else { | |
946 i = 1 | |
947 } | |
948 | |
949 if !isalnum(data[i]) { | |
950 return notAutolink, 0 | |
951 } | |
952 | |
953 // scheme test | |
954 autolink = notAutolink | |
955 | |
956 // try to find the beginning of an URI | |
957 for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') { | |
958 i++ | |
959 } | |
960 | |
961 if i > 1 && i < len(data) && data[i] == '@' { | |
962 if j = isMailtoAutoLink(data[i:]); j != 0 { | |
963 return emailAutolink, i + j | |
964 } | |
965 } | |
966 | |
967 if i > 2 && i < len(data) && data[i] == ':' { | |
968 autolink = normalAutolink | |
969 i++ | |
970 } | |
971 | |
972 // complete autolink test: no whitespace or ' or " | |
973 switch { | |
974 case i >= len(data): | |
975 autolink = notAutolink | |
976 case autolink != notAutolink: | |
977 j = i | |
978 | |
979 for i < len(data) { | |
980 if data[i] == '\\' { | |
981 i += 2 | |
982 } else if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) { | |
983 break | |
984 } else { | |
985 i++ | |
986 } | |
987 | |
988 } | |
989 | |
990 if i >= len(data) { | |
991 return autolink, 0 | |
992 } | |
993 if i > j && data[i] == '>' { | |
994 return autolink, i + 1 | |
995 } | |
996 | |
997 // one of the forbidden chars has been found | |
998 autolink = notAutolink | |
999 } | |
1000 i += bytes.IndexByte(data[i:], '>') | |
1001 if i < 0 { | |
1002 return autolink, 0 | |
1003 } | |
1004 return autolink, i + 1 | |
1005 } | |
1006 | |
1007 // look for the address part of a mail autolink and '>' | |
1008 // this is less strict than the original markdown e-mail address matching | |
1009 func isMailtoAutoLink(data []byte) int { | |
1010 nb := 0 | |
1011 | |
1012 // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@' | |
1013 for i := 0; i < len(data); i++ { | |
1014 if isalnum(data[i]) { | |
1015 continue | |
1016 } | |
1017 | |
1018 switch data[i] { | |
1019 case '@': | |
1020 nb++ | |
1021 | |
1022 case '-', '.', '_': | |
1023 break | |
1024 | |
1025 case '>': | |
1026 if nb == 1 { | |
1027 return i + 1 | |
1028 } | |
1029 return 0 | |
1030 default: | |
1031 return 0 | |
1032 } | |
1033 } | |
1034 | |
1035 return 0 | |
1036 } | |
1037 | |
// helperFindEmphChar looks for the next emphasis byte c in data while
// skipping over code spans and link-like bracket constructs. It returns the
// index of the byte found, or 0 when there is none (a match at index 0 is
// therefore indistinguishable from "not found"). Occurrences of c directly
// preceded by a backslash are ignored.
//
// If a code span or bracket construct is left unterminated, the first c seen
// inside it (if any) is returned instead.
func helperFindEmphChar(data []byte, c byte) int {
	pos := 0

	for pos < len(data) {
		// advance to the next byte of interest
		for pos < len(data) && data[pos] != c && data[pos] != '`' && data[pos] != '[' {
			pos++
		}
		if pos >= len(data) {
			return 0
		}

		// do not count escaped chars
		if pos > 0 && data[pos-1] == '\\' {
			pos++
			continue
		}

		switch data[pos] {
		case c:
			return pos

		case '`':
			// skip a code span, remembering the first c inside it
			fallback := 0
			for pos++; pos < len(data) && data[pos] != '`'; pos++ {
				if fallback == 0 && data[pos] == c {
					fallback = pos
				}
			}
			if pos >= len(data) {
				return fallback
			}
			pos++

		case '[':
			// skip a link: first the bracketed text ...
			fallback := 0
			for pos++; pos < len(data) && data[pos] != ']'; pos++ {
				if fallback == 0 && data[pos] == c {
					fallback = pos
				}
			}
			pos++

			// ... then spaces and newlines before the target part
			for pos < len(data) && (data[pos] == ' ' || data[pos] == '\n') {
				pos++
			}
			if pos >= len(data) {
				return fallback
			}

			if data[pos] != '[' && data[pos] != '(' { // not a link
				if fallback > 0 {
					return fallback
				}
				continue
			}

			// NOTE: the scan below stops at the next occurrence of the
			// opening byte itself ('[' or '('), not at ']' / ')'.
			opener := data[pos]
			for pos++; pos < len(data) && data[pos] != opener; pos++ {
				if fallback == 0 && data[pos] == c {
					return pos
				}
			}
			if pos >= len(data) {
				return fallback
			}
			pos++
		}
	}

	return 0
}
1111 | |
1112 func helperEmphasis(p *Markdown, data []byte, c byte) (int, *Node) { | |
1113 i := 0 | |
1114 | |
1115 // skip one symbol if coming from emph3 | |
1116 if len(data) > 1 && data[0] == c && data[1] == c { | |
1117 i = 1 | |
1118 } | |
1119 | |
1120 for i < len(data) { | |
1121 length := helperFindEmphChar(data[i:], c) | |
1122 if length == 0 { | |
1123 return 0, nil | |
1124 } | |
1125 i += length | |
1126 if i >= len(data) { | |
1127 return 0, nil | |
1128 } | |
1129 | |
1130 if i+1 < len(data) && data[i+1] == c { | |
1131 i++ | |
1132 continue | |
1133 } | |
1134 | |
1135 if data[i] == c && !isspace(data[i-1]) { | |
1136 | |
1137 if p.extensions&NoIntraEmphasis != 0 { | |
1138 if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) { | |
1139 continue | |
1140 } | |
1141 } | |
1142 | |
1143 emph := NewNode(Emph) | |
1144 p.inline(emph, data[:i]) | |
1145 return i + 1, emph | |
1146 } | |
1147 } | |
1148 | |
1149 return 0, nil | |
1150 } | |
1151 | |
1152 func helperDoubleEmphasis(p *Markdown, data []byte, c byte) (int, *Node) { | |
1153 i := 0 | |
1154 | |
1155 for i < len(data) { | |
1156 length := helperFindEmphChar(data[i:], c) | |
1157 if length == 0 { | |
1158 return 0, nil | |
1159 } | |
1160 i += length | |
1161 | |
1162 if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) { | |
1163 nodeType := Strong | |
1164 if c == '~' { | |
1165 nodeType = Del | |
1166 } | |
1167 node := NewNode(nodeType) | |
1168 p.inline(node, data[:i]) | |
1169 return i + 2, node | |
1170 } | |
1171 i++ | |
1172 } | |
1173 return 0, nil | |
1174 } | |
1175 | |
1176 func helperTripleEmphasis(p *Markdown, data []byte, offset int, c byte) (int, *Node) { | |
1177 i := 0 | |
1178 origData := data | |
1179 data = data[offset:] | |
1180 | |
1181 for i < len(data) { | |
1182 length := helperFindEmphChar(data[i:], c) | |
1183 if length == 0 { | |
1184 return 0, nil | |
1185 } | |
1186 i += length | |
1187 | |
1188 // skip whitespace preceded symbols | |
1189 if data[i] != c || isspace(data[i-1]) { | |
1190 continue | |
1191 } | |
1192 | |
1193 switch { | |
1194 case i+2 < len(data) && data[i+1] == c && data[i+2] == c: | |
1195 // triple symbol found | |
1196 strong := NewNode(Strong) | |
1197 em := NewNode(Emph) | |
1198 strong.AppendChild(em) | |
1199 p.inline(em, data[:i]) | |
1200 return i + 3, strong | |
1201 case (i+1 < len(data) && data[i+1] == c): | |
1202 // double symbol found, hand over to emph1 | |
1203 length, node := helperEmphasis(p, origData[offset-2:], c) | |
1204 if length == 0 { | |
1205 return 0, nil | |
1206 } | |
1207 return length - 2, node | |
1208 default: | |
1209 // single symbol found, hand over to emph2 | |
1210 length, node := helperDoubleEmphasis(p, origData[offset-1:], c) | |
1211 if length == 0 { | |
1212 return 0, nil | |
1213 } | |
1214 return length - 1, node | |
1215 } | |
1216 } | |
1217 return 0, nil | |
1218 } | |
1219 | |
1220 func text(s []byte) *Node { | |
1221 node := NewNode(Text) | |
1222 node.Literal = s | |
1223 return node | |
1224 } | |
1225 | |
// normalizeURI is a placeholder for URI normalization; for now it returns
// its argument unchanged.
func normalizeURI(s []byte) []byte {
	// TODO: implement
	return s
}