1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
12 // transitionFunc is the array of context transition functions for text nodes.
13 // A transition function takes a context and template text input, and returns
14 // the updated context and the number of bytes consumed from the front of the
16 var transitionFunc = [...]func(context, []byte) (context, int){
19 stateAttrName: tAttrName,
20 stateAfterName: tAfterName,
21 stateBeforeValue: tBeforeValue,
22 stateHTMLCmt: tHTMLCmt,
// RCDATA content is handled by the special-tag-end function.
23 stateRCDATA: tSpecialTagEnd,
// The JS double-quoted, single-quoted and regexp states share tJSDelimited.
28 stateJSDqStr: tJSDelimited,
29 stateJSSqStr: tJSDelimited,
30 stateJSRegexp: tJSDelimited,
31 stateJSTmplLit: tJSTmpl,
32 stateJSBlockCmt: tBlockCmt,
// JS line comments and both HTML-like comment states share tLineCmt.
33 stateJSLineCmt: tLineCmt,
34 stateJSHTMLOpenCmt: tLineCmt,
35 stateJSHTMLCloseCmt: tLineCmt,
// The quoted CSS string and quoted CSS URL states share tCSSStr.
37 stateCSSDqStr: tCSSStr,
38 stateCSSSqStr: tCSSStr,
39 stateCSSDqURL: tCSSStr,
40 stateCSSSqURL: tCSSStr,
// CSS comments reuse the same block and line comment functions as JS.
42 stateCSSBlockCmt: tBlockCmt,
43 stateCSSLineCmt: tLineCmt,
// commentStart is the byte sequence that opens an HTML comment. tText matches
// it to enter stateHTMLCmt, and tJS matches it for HTML-like comments in JS.
47 var commentStart = []byte("<!--")
// commentEnd is the byte sequence that closes an HTML comment. tHTMLCmt
// searches for it, and tJS matches it for HTML-like comments in JS.
48 var commentEnd = []byte("-->")
50 // tText is the context transition function for the text state.
// It looks for '<' in s: "<!--" moves to stateHTMLCmt, and when a tag is
// found it returns stateTag with the element reported by eatTagName.
51 func tText(c context, s []byte) (context, int) {
54 i := k + bytes.IndexByte(s[k:], '<')
// i < k means IndexByte returned -1, i.e. there is no '<' at or after k.
// i+1 == len(s) means the '<' is the last byte of s.
55 if i < k || i+1 == len(s) {
// An HTML comment opener: consume all four bytes of "<!--".
57 } else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
58 return context{state: stateHTMLCmt}, i + 4
68 j, e := eatTagName(s, i)
73 // We've found an HTML tag.
74 return context{state: stateTag, element: e}, j
// elementContentType maps an element type to a state. tTag indexes it by
// c.element to choose the state of the context it returns.
80 var elementContentType = [...]state{
81 elementNone: stateText,
82 elementScript: stateJS,
83 elementStyle: stateCSS,
// Both textarea and title map to the RCDATA state.
84 elementTextarea: stateRCDATA,
85 elementTitle: stateRCDATA,
88 // tTag is the context transition function for the tag state.
// It skips leading white space, reads an attribute name with eatAttrName and
// returns a context that carries the element forward together with the
// attribute kind derived from that name.
89 func tTag(c context, s []byte) (context, int) {
90 // Find the attribute name.
91 i := eatWhiteSpace(s, 0)
97 state: elementContentType[c.element],
101 j, err := eatAttrName(s, i)
// eatAttrName rejected the name: enter the error state and consume all of s.
103 return context{state: stateError, err: err}, len(s)
105 state, attr := stateTag, attrNone
109 err: errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
// Attribute names are compared in lower case.
113 attrName := strings.ToLower(string(s[i:j]))
// A "type" attribute on a script element gets its own attribute kind.
114 if c.element == elementScript && attrName == "type" {
115 attr = attrScriptType
117 switch attrType(attrName) {
124 case contentTypeSrcset:
130 state = stateAttrName
132 state = stateAfterName
// j is the end of the attribute name as returned by eatAttrName.
134 return context{state: state, element: c.element, attr: attr}, j
137 // tAttrName is the context transition function for stateAttrName.
138 func tAttrName(c context, s []byte) (context, int) {
139 i, err := eatAttrName(s, 0)
// A malformed name puts the context in the error state and consumes all of s.
141 return context{state: stateError, err: err}, len(s)
// The name ended before the end of s, so move on to stateAfterName.
142 } else if i != len(s) {
143 c.state = stateAfterName
149 // tAfterName is the context transition function for stateAfterName.
150 func tAfterName(c context, s []byte) (context, int) {
151 // Look for the start of the value.
152 i := eatWhiteSpace(s, 0)
155 } else if s[i] != '=' {
156 // Occurs due to tag ending '>', and valueless attribute.
// NOTE(review): presumably reached only when s[i] is '=' — confirm against
// the statements between this line and the branch above.
160 c.state = stateBeforeValue
// attrStartStates maps an attribute kind to a state. tBeforeValue indexes it
// by c.attr to pick the state in which the attribute value is scanned.
164 var attrStartStates = [...]state{
167 attrScriptType: stateAttr,
170 attrSrcset: stateSrcset,
173 // tBeforeValue is the context transition function for stateBeforeValue.
// It skips white space, determines the value delimiter and sets the state
// from attrStartStates for the current attribute kind.
174 func tBeforeValue(c context, s []byte) (context, int) {
175 i := eatWhiteSpace(s, 0)
179 // Find the attribute delimiter.
// The default is an unquoted value, delimited by a space or the tag end.
180 delim := delimSpaceOrTagEnd
// For a quoted value, i+1 steps past the opening quote.
183 delim, i = delimSingleQuote, i+1
185 delim, i = delimDoubleQuote, i+1
187 c.state, c.delim = attrStartStates[c.attr], delim
191 // tHTMLCmt is the context transition function for stateHTMLCmt.
192 func tHTMLCmt(c context, s []byte) (context, int) {
// Found "-->": return the zero context and consume through the end of the
// three-byte terminator.
193 if i := bytes.Index(s, commentEnd); i != -1 {
194 return context{}, i + 3
199 // specialTagEndMarkers maps element types to the character sequence that
200 // case-insensitively signals the end of the special tag body.
// tSpecialTagEnd passes the entry for c.element to indexTagEnd, which compares
// it against the input with bytes.EqualFold.
201 var specialTagEndMarkers = [...][]byte{
202 elementScript: []byte("script"),
203 elementStyle: []byte("style"),
204 elementTextarea: []byte("textarea"),
205 elementTitle: []byte("title"),
// specialTagEndPrefix is the "</" that indexTagEnd searches for first.
209 specialTagEndPrefix = []byte("</")
// tagEndSeparators lists the bytes indexTagEnd accepts as a separator
// following the tag name.
210 tagEndSeparators = []byte("> \t\n\f/")
213 // tSpecialTagEnd is the context transition function for raw text and RCDATA
215 func tSpecialTagEnd(c context, s []byte) (context, int) {
// Only a context that is inside an element looks for an end tag.
216 if c.element != elementNone {
217 // script end tags ("</script") within script literals are ignored, so that
218 // we can properly escape them.
219 if c.element == elementScript && (isInScriptLiteral(c.state) || isComment(c.state)) {
// Search s for the end tag that matches the current element.
222 if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
229 // indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
// s is the text to search and tag is the element name expected after "</".
230 func indexTagEnd(s []byte, tag []byte) int {
232 plen := len(specialTagEndPrefix)
234 // Try to find the tag end prefix first
235 i := bytes.Index(s, specialTagEndPrefix)
240 // Try to match the actual tag if there is still space for it
// The length check keeps s[:len(tag)] in range; EqualFold ignores case.
241 if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
243 // Check the tag is followed by a proper separator
244 if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
254 // tAttr is the context transition function for the attribute state.
// It has the same signature as the element type of transitionFunc: c is the
// context on entry and s is the template text to scan.
255 func tAttr(c context, s []byte) (context, int) {
259 // tURL is the context transition function for the URL state.
// It updates c.urlPart according to what s contains.
260 func tURL(c context, s []byte) (context, int) {
// Any '#' or '?' in s puts the URL in its query-or-fragment part.
261 if bytes.ContainsAny(s, "#?") {
262 c.urlPart = urlPartQueryOrFrag
// Otherwise, if s is not entirely white space and no part has been recorded
// yet, the URL is in its pre-query part.
263 } else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
264 // HTML5 uses "Valid URL potentially surrounded by spaces" for
265 // attrs: https://www.w3.org/TR/html5/index.html#attributes-1
266 c.urlPart = urlPartPreQuery
271 // tJS is the context transition function for the JS state.
// It finds the next byte in s that may begin a string, template literal,
// comment, regexp or brace, and updates the state and jsCtx accordingly.
272 func tJS(c context, s []byte) (context, int) {
273 i := bytes.IndexAny(s, "\"`'/{}<-#")
275 // Entire input is non string, comment, regexp tokens.
276 c.jsCtx = nextJSCtx(s, c.jsCtx)
// Only the text before the special byte is passed to nextJSCtx.
279 c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
282 c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
284 c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
286 c.state, c.jsCtx = stateJSTmplLit, jsCtxRegexp
// "//" starts a line comment; i+1 steps onto the second slash.
289 case i+1 < len(s) && s[i+1] == '/':
290 c.state, i = stateJSLineCmt, i+1
// "/*" starts a block comment; i+1 steps onto the '*'.
291 case i+1 < len(s) && s[i+1] == '*':
292 c.state, i = stateJSBlockCmt, i+1
// A lone '/' is resolved by the current jsCtx.
293 case c.jsCtx == jsCtxRegexp:
294 c.state = stateJSRegexp
295 case c.jsCtx == jsCtxDivOp:
296 c.jsCtx = jsCtxRegexp
300 err: errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
303 // ECMAScript supports HTML style comments for legacy reasons, see Appendix
304 // B.1.1 "HTML-like Comments". The handling of these comments is somewhat
305 // confusing. Multi-line comments are not supported, i.e. anything on lines
306 // between the opening and closing tokens is not considered a comment, but
307 // anything following the opening or closing token, on the same line, is
308 // ignored. As such we simply treat any line prefixed with "<!--" or "-->"
309 // as if it were actually prefixed with "//" and move on.
// i+3 leaves i on the last byte of "<!--".
311 if i+3 < len(s) && bytes.Equal(commentStart, s[i:i+4]) {
312 c.state, i = stateJSHTMLOpenCmt, i+3
// i+2 leaves i on the last byte of "-->".
315 if i+2 < len(s) && bytes.Equal(commentEnd, s[i:i+3]) {
316 c.state, i = stateJSHTMLCloseCmt, i+2
318 // ECMAScript also supports "hashbang" comment lines, see Section 12.5.
// "#!" is handled with the same state as a "//" line comment.
320 if i+1 < len(s) && s[i+1] == '!' {
321 c.state, i = stateJSLineCmt, i+1
326 if c.jsTmplExprDepth == 0 {
329 for j := 0; j <= i; j++ {
339 if c.jsBraceDepth >= 0 {
344 c.state = stateJSTmplLit
// tJSTmpl is the transition function registered in transitionFunc for
// stateJSTmplLit. It scans s for a backtick, a backslash or a '$'.
351 func tJSTmpl(c context, s []byte) (context, int) {
354 i := k + bytes.IndexAny(s[k:], "`\\$")
364 err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
// Check whether the byte after s[i] is '{'; the length test keeps s[i+1] in
// range.
368 if len(s) >= i+2 && s[i+1] == '{' {
384 // tJSDelimited is the context transition function for the JS string and regexp
386 func tJSDelimited(c context, s []byte) (context, int) {
// NOTE(review): inCharset presumably records whether the scan is inside a
// regexp character class — confirm against the loop body.
395 k, inCharset := 0, false
397 i := k + bytes.IndexAny(s[k:], specials)
407 err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
415 // If "</script" appears in a regex literal, the '/' should not
416 // close the regex literal, and it will later be escaped to
417 // "\x3C/script" in escapeText.
// s[i-1:i+7] is the eight bytes starting one before i, compared in lower case.
418 if i > 0 && i+7 <= len(s) && bytes.Compare(bytes.ToLower(s[i-1:i+7]), []byte("</script")) == 0 {
// Outside a charset the state returns to stateJS with jsCtxDivOp.
420 } else if !inCharset {
421 c.state, c.jsCtx = stateJS, jsCtxDivOp
427 c.state, c.jsCtx = stateJS, jsCtxDivOp
435 // This can be fixed by making context richer if interpolation
436 // into charsets is desired.
439 err: errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
// blockCommentEnd is the "*/" terminator that tBlockCmt searches for.
446 var blockCommentEnd = []byte("*/")
448 // tBlockCmt is the context transition function for /*comment*/ states.
449 func tBlockCmt(c context, s []byte) (context, int) {
// i is the offset of "*/" in s, or -1 when s does not contain it.
450 i := bytes.Index(s, blockCommentEnd)
455 case stateJSBlockCmt:
457 case stateCSSBlockCmt:
// NOTE(review): presumably the arm for any other state; only the two
// block-comment states are listed as cases — confirm.
460 panic(c.state.String())
465 // tLineCmt is the context transition function for //comment states, and the JS HTML-like comment state.
466 func tLineCmt(c context, s []byte) (context, int) {
467 var lineTerminators string
// Pick the line terminator set and the state to resume in, by language.
470 case stateJSLineCmt, stateJSHTMLOpenCmt, stateJSHTMLCloseCmt:
471 lineTerminators, endState = "\n\r\u2028\u2029", stateJS
472 case stateCSSLineCmt:
473 lineTerminators, endState = "\n\f\r", stateCSS
474 // Line comments are not part of any published CSS standard but
475 // are supported by the 4 major browsers.
476 // This defines line comments as
477 // LINECOMMENT ::= "//" [^\n\f\r]*
478 // since https://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
480 // nl ::= #xA | #xD #xA | #xD | #xC
482 panic(c.state.String())
// i is the index of the first line terminator in s, or -1 if there is none.
485 i := bytes.IndexAny(s, lineTerminators)
490 // Per section 7.4 of EcmaScript 5 : https://es5.github.io/#x7.4
491 // "However, the LineTerminator at the end of the line is not
492 // considered to be part of the single-line comment; it is
493 // recognized separately by the lexical grammar and becomes part
494 // of the stream of input elements for the syntactic grammar."
498 // tCSS is the context transition function for the CSS state.
499 func tCSS(c context, s []byte) (context, int) {
500 // CSS quoted strings are almost never used except for:
501 // (1) URLs as in background: "/foo.png"
502 // (2) Multiword font-names as in font-family: "Times New Roman"
503 // (3) List separators in content values as in inline-lists:
505 // ul.inlineList { list-style: none; padding:0 }
506 // ul.inlineList > li { display: inline }
507 // ul.inlineList > li:before { content: ", " }
508 // ul.inlineList > li:first-child:before { content: "" }
510 // <ul class=inlineList><li>One<li>Two<li>Three</ul>
511 // (4) Attribute value selectors as in a[href="http://example.com/"]
513 // We conservatively treat all strings as URLs, but make some
514 // allowances to avoid confusion.
516 // In (1), our conservative assumption is justified.
517 // In (2), valid font names do not contain ':', '?', or '#', so our
518 // conservative assumption is fine since we will never transition past
520 // In (3), our protocol heuristic should not be tripped, and there
521 // should not be non-space content after a '?' or '#', so as long as
522 // we only %-encode RFC 3986 reserved characters we are ok.
523 // In (4), we should URL escape for URL attributes, and for others we
524 // have the attribute name available if our conservative assumption
525 // proves problematic for real code.
// Find the next '(', double quote, single quote or '/' at or after k.
529 i := k + bytes.IndexAny(s[k:], `("'/`)
535 // Look for url to the left.
536 p := bytes.TrimRight(s[:i], "\t\n\f\r ")
537 if endsWithCSSKeyword(p, "url") {
// j is the offset in s of the first non-white-space byte after s[i].
538 j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
// A quote at j selects the matching quoted-URL state and is stepped over.
540 case j != len(s) && s[j] == '"':
541 c.state, j = stateCSSDqURL, j+1
542 case j != len(s) && s[j] == '\'':
543 c.state, j = stateCSSSqURL, j+1
545 c.state = stateCSSURL
553 c.state = stateCSSLineCmt
556 c.state = stateCSSBlockCmt
561 c.state = stateCSSDqStr
564 c.state = stateCSSSqStr
571 // tCSSStr is the context transition function for the CSS string and URL states.
572 func tCSSStr(c context, s []byte) (context, int) {
575 case stateCSSDqStr, stateCSSDqURL:
577 case stateCSSSqStr, stateCSSSqURL:
580 // Unquoted URLs end with a newline or close parenthesis.
581 // The below includes the wc (whitespace character) and nl.
582 endAndEsc = "\\\t\n\f\r )"
584 panic(c.state.String())
// Find the next terminator or backslash at or after k.
589 i := k + bytes.IndexAny(s[k:], endAndEsc)
// The CSS-decoded text from k onward is passed through tURL.
591 c, nread := tURL(c, decodeCSS(s[k:]))
599 err: errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
// The decoded text through s[i] is passed to tURL; the byte count that
// tURL returns is discarded.
606 c, _ = tURL(c, decodeCSS(s[:i+1]))
611 // tError is the context transition function for the error state.
// It has the same signature as the element type of transitionFunc: c is the
// context on entry and s is the template text to scan.
612 func tError(c context, s []byte) (context, int) {
616 // eatAttrName returns the largest j such that s[i:j] is an attribute name.
617 // It returns an error if s[i:] does not look like it begins with an
618 // attribute name, such as encountering a quote mark without a preceding
620 func eatAttrName(s []byte, i int) (int, *Error) {
621 for j := i; j < len(s); j++ {
// White space, '=' and '>' share a single case arm.
623 case ' ', '\t', '\n', '\f', '\r', '=', '>':
626 // These result in a parse warning in HTML5 and are
627 // indicative of serious problems if seen in an attr
628 // name in a template.
// The error reports the offending byte s[j:j+1] and quotes s with a
// precision of 32.
629 return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
// elementNameMap maps a lower-case tag name to its element type. eatTagName
// looks names up after strings.ToLower, so a name that is not listed yields
// the zero value of element.
637 var elementNameMap = map[string]element{
638 "script": elementScript,
639 "style": elementStyle,
640 "textarea": elementTextarea,
641 "title": elementTitle,
644 // asciiAlpha reports whether c is an ASCII letter.
645 func asciiAlpha(c byte) bool {
// && binds tighter than ||, so this is (upper-case range) || (lower-case range).
646 return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
649 // asciiAlphaNum reports whether c is an ASCII letter or digit.
650 func asciiAlphaNum(c byte) bool {
// && binds tighter than ||, so the digit range test is evaluated as one unit.
651 return asciiAlpha(c) || '0' <= c && c <= '9'
654 // eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
655 func eatTagName(s []byte, i int) (int, element) {
// If s ends at i, or s[i] is not an ASCII letter, nothing is consumed.
656 if i == len(s) || !asciiAlpha(s[i]) {
657 return i, elementNone
662 if asciiAlphaNum(x) {
666 // Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
// The separator is accepted only when a letter or digit follows it.
667 if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
// The name is lower-cased before the elementNameMap lookup.
673 return j, elementNameMap[strings.ToLower(string(s[i:j]))]
676 // eatWhiteSpace returns the largest j such that s[i:j] is white space.
677 func eatWhiteSpace(s []byte, i int) int {
678 for j := i; j < len(s); j++ {
// Space, tab, newline, form feed and carriage return count as white space.
680 case ' ', '\t', '\n', '\f', '\r':