1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package scanner implements a scanner for Go source text.
6 // It takes a []byte as source which can then be tokenized
7 // through repeated calls to the Scan method.
21 // An ErrorHandler may be provided to Scanner.Init. If a syntax error is
22 // encountered and a handler was installed, the handler is called with a
23 // position and an error message. The position points to the beginning of
24 // the offending token.
//
// The scanner computes pos by converting the byte offset of the error
// through its token.File; msg is the error text. Per the comment on the
// Scanner's err field, the handler may be nil.
26 type ErrorHandler func(pos token.Position, msg string)
28 // A Scanner holds the scanner's internal state while processing
29 // a given text. It can be allocated as part of another data
30 // structure but must be initialized via Init before use.
34 file *token.File // source file handle
35 dir string // directory portion of file.Name()
37 err ErrorHandler // error reporting; or nil
38 mode Mode // scanning mode
41 ch rune // current character
42 offset int // character offset
43 rdOffset int // reading offset (position after current character)
44 lineOffset int // current line offset
45 insertSemi bool // insert a semicolon before next newline
47 // public state - ok to modify
48 ErrorCount int // number of errors encountered
52 bom = 0xFEFF // byte order mark, only permitted as very first character
53 eof = -1 // end of file
56 // Read the next Unicode char into s.ch.
57 // s.ch < 0 means end-of-file.
59 // For optimization, there is some overlap between this method and
61 func (s *Scanner) next() {
62 if s.rdOffset < len(s.src) {
65 s.lineOffset = s.offset
66 s.file.AddLine(s.offset)
68 r, w := rune(s.src[s.rdOffset]), 1
71 s.error(s.offset, "illegal character NUL")
72 case r >= utf8.RuneSelf:
74 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
75 if r == utf8.RuneError && w == 1 {
76 s.error(s.offset, "illegal UTF-8 encoding")
77 } else if r == bom && s.offset > 0 {
78 s.error(s.offset, "illegal byte order mark")
86 s.lineOffset = s.offset
87 s.file.AddLine(s.offset)
93 // peek returns the byte following the most recently read character without
94 // advancing the scanner. If the scanner is at EOF, peek returns 0.
95 func (s *Scanner) peek() byte {
96 if s.rdOffset < len(s.src) {
97 return s.src[s.rdOffset]
102 // A mode value is a set of flags (or 0).
103 // They control scanner behavior.
108 ScanComments Mode = 1 << iota // return comments as COMMENT tokens
109 dontInsertSemis // do not automatically insert semicolons - for testing only
112 // Init prepares the scanner s to tokenize the text src by setting the
113 // scanner at the beginning of src. The scanner uses the file set file
114 // for position information and it adds line information for each line.
115 // It is ok to re-use the same file when re-scanning the same file as
116 // line information which is already present is ignored. Init causes a
117 // panic if the file size does not match the src size.
119 // Calls to Scan will invoke the error handler err if they encounter a
120 // syntax error and err is not nil. Also, for each error encountered,
121 // the Scanner field ErrorCount is incremented by one. The mode parameter
122 // determines how comments are handled.
124 // Note that Init may call err if there is an error in the first character
127 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
128 // Explicitly initialize all fields since a scanner may be reused.
129 if file.Size() != len(src) {
130 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
133 s.dir, _ = filepath.Split(file.Name())
147 s.next() // ignore BOM at file beginning
151 func (s *Scanner) error(offs int, msg string) {
153 s.err(s.file.Position(s.file.Pos(offs)), msg)
158 func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
159 s.error(offs, fmt.Sprintf(format, args...))
162 func (s *Scanner) scanComment() string {
163 // initial '/' already consumed; s.ch == '/' || s.ch == '*'
164 offs := s.offset - 1 // position of initial '/'
165 next := -1 // position immediately following the comment; < 0 means invalid comment
170 // (the final '\n' is not considered part of the comment)
172 for s.ch != '\n' && s.ch >= 0 {
178 // if we are at '\n', the position following the comment is afterwards
194 if ch == '*' && s.ch == '/' {
201 s.error(offs, "comment not terminated")
204 lit := s.src[offs:s.offset]
206 // On Windows, a (//-comment) line may end in "\r\n".
207 // Remove the final '\r' before analyzing the text for
208 // line directives (matching the compiler). Remove any
209 // other '\r' afterwards (matching the pre-existing be-
210 // havior of the scanner).
211 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
212 lit = lit[:len(lit)-1]
216 // interpret line directives
217 // (//line directives must start at the beginning of the current line)
218 if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
219 s.updateLineInfo(next, offs, lit)
223 lit = stripCR(lit, lit[1] == '*')
// prefix is the text that must follow the two-byte comment opener ("//" or
// "/*") for scanComment to pass the comment to updateLineInfo as a
// candidate line directive.
229 var prefix = []byte("line ")
231 // updateLineInfo parses the incoming comment text at offset offs
232 // as a line directive. If successful, it updates the line info table
233 // for the position next per the line directive.
234 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
235 // extract comment text
237 text = text[:len(text)-2] // lop off trailing "*/"
239 text = text[7:] // lop off leading "//line " or "/*line "
242 i, n, ok := trailingDigits(text)
244 return // ignore (not a line directive)
249 // text has a suffix :xxx but xxx is not a number
250 s.error(offs+i, "invalid line number: "+string(text[i:]))
255 i2, n2, ok2 := trailingDigits(text[:i-1])
257 //line filename:line:col
261 s.error(offs+i2, "invalid column number: "+string(text[i2:]))
264 text = text[:i2-1] // lop off ":col"
271 s.error(offs+i, "invalid line number: "+string(text[i:]))
275 // If we have a column (//line filename:line:col form),
276 // an empty filename means to use the previous filename.
277 filename := string(text[:i-1]) // lop off ":line", and trim white space
278 if filename == "" && ok2 {
279 filename = s.file.Position(s.file.Pos(offs)).Filename
280 } else if filename != "" {
281 // Put a relative filename in the current directory.
282 // This is for compatibility with earlier releases.
284 filename = filepath.Clean(filename)
285 if !filepath.IsAbs(filename) {
286 filename = filepath.Join(s.dir, filename)
290 s.file.AddLineColumnInfo(next, filename, line, col)
293 func trailingDigits(text []byte) (int, int, bool) {
294 i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
296 return 0, 0, false // no ":"
299 n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
300 return i + 1, int(n), err == nil
303 func (s *Scanner) findLineEnd() bool {
304 // initial '/' already consumed
306 defer func(offs int) {
307 // reset scanner state to where it was upon calling findLineEnd
310 s.rdOffset = offs + 1
311 s.next() // consume initial '/' again
314 // read ahead until a newline, EOF, or non-comment token is found
315 for s.ch == '/' || s.ch == '*' {
317 //-style comment always contains a newline
320 /*-style comment: look for newline */
328 if ch == '*' && s.ch == '/' {
333 s.skipWhitespace() // s.insertSemi is set
334 if s.ch < 0 || s.ch == '\n' {
341 s.next() // consume '/'
347 func isLetter(ch rune) bool {
348 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
351 func isDigit(ch rune) bool {
352 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
355 // scanIdentifier reads the string of valid identifier characters at s.offset.
356 // It must only be called when s.ch is known to be a valid letter.
358 // Be careful when making changes to this function: it is optimized and affects
359 // scanning performance significantly.
360 func (s *Scanner) scanIdentifier() string {
363 // Optimize for the common case of an ASCII identifier.
365 // Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
366 // avoids conversions to runes.
368 // In case we encounter a non-ASCII character, fall back on the slower path
369 // of calling into s.next().
370 for rdOffset, b := range s.src[s.rdOffset:] {
371 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
372 // Avoid assigning a rune for the common case of an ascii character.
375 s.rdOffset += rdOffset
376 if 0 < b && b < utf8.RuneSelf {
377 // Optimization: we've encountered an ASCII character that's not a letter
378 // or number. Avoid the call into s.next() and corresponding set up.
380 // Note that s.next() does some line accounting if s.ch is '\n', so this
381 // shortcut is only possible because we know that the preceding character
384 s.offset = s.rdOffset
388 // We know that the preceding character is valid for an identifier because
389 // scanIdentifier is only called when s.ch is a letter, so calling s.next()
390 // at s.rdOffset resets the scanner state.
392 for isLetter(s.ch) || isDigit(s.ch) {
397 s.offset = len(s.src)
398 s.rdOffset = len(s.src)
402 return string(s.src[offs:s.offset])
405 func digitVal(ch rune) int {
407 case '0' <= ch && ch <= '9':
409 case 'a' <= lower(ch) && lower(ch) <= 'f':
410 return int(lower(ch) - 'a' + 10)
412 return 16 // larger than any legal digit val
// lower returns ch with the ASCII case bit ('a' - 'A') set. The result is
// the lower-case form of ch iff ch is an ASCII letter; for any other rune
// it is only meaningful as input to a range check against lower-case letters.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	if ch < '0' {
		return false
	}
	return ch <= '9'
}
417 func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
419 // digits accepts the sequence { digit | '_' }.
420 // If base <= 10, digits accepts any decimal digit but records
421 // the offset (relative to the source start) of a digit >= base
422 // in *invalid, if *invalid < 0.
423 // digits returns a bitset describing whether the sequence contained
424 // digits (bit 0 is set), or separators '_' (bit 1 is set).
425 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
427 max := rune('0' + base)
428 for isDecimal(s.ch) || s.ch == '_' {
432 } else if s.ch >= max && *invalid < 0 {
433 *invalid = s.offset // record invalid rune offset
439 for isHex(s.ch) || s.ch == '_' {
451 func (s *Scanner) scanNumber() (token.Token, string) {
455 base := 10 // number base
456 prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
457 digsep := 0 // bit 0: digit present, bit 1: '_' present
458 invalid := -1 // index of invalid digit in literal, or < 0
468 base, prefix = 16, 'x'
471 base, prefix = 8, 'o'
474 base, prefix = 2, 'b'
476 base, prefix = 8, '0'
477 digsep = 1 // leading 0
480 digsep |= s.digits(base, &invalid)
486 if prefix == 'o' || prefix == 'b' {
487 s.error(s.offset, "invalid radix point in "+litname(prefix))
490 digsep |= s.digits(base, &invalid)
494 s.error(s.offset, litname(prefix)+" has no digits")
498 if e := lower(s.ch); e == 'e' || e == 'p' {
500 case e == 'e' && prefix != 0 && prefix != '0':
501 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
502 case e == 'p' && prefix != 'x':
503 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
507 if s.ch == '+' || s.ch == '-' {
510 ds := s.digits(10, nil)
513 s.error(s.offset, "exponent has no digits")
515 } else if prefix == 'x' && tok == token.FLOAT {
516 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
525 lit := string(s.src[offs:s.offset])
526 if tok == token.INT && invalid >= 0 {
527 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
530 if i := invalidSep(lit); i >= 0 {
531 s.error(offs+i, "'_' must separate successive digits")
538 func litname(prefix rune) string {
541 return "hexadecimal literal"
543 return "octal literal"
545 return "binary literal"
547 return "decimal literal"
550 // invalidSep returns the index of the first invalid separator in x, or -1.
551 func invalidSep(x string) int {
552 x1 := ' ' // prefix char, we only care if it's 'x'
553 d := '.' // digit, one of '_', '0' (a digit), or '.' (anything else)
556 // a prefix counts as a digit
557 if len(x) >= 2 && x[0] == '0' {
558 x1 = lower(rune(x[1]))
559 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
565 // mantissa and exponent
566 for ; i < len(x); i++ {
567 p := d // previous digit
574 case isDecimal(d) || x1 == 'x' && isHex(d):
590 // scanEscape parses an escape sequence where quote is the accepted
591 // escaped quote. In case of a syntax error, it stops at the offending
592 // character (without consuming it) and returns false. Otherwise
594 func (s *Scanner) scanEscape(quote rune) bool {
600 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
603 case '0', '1', '2', '3', '4', '5', '6', '7':
604 n, base, max = 3, 8, 255
607 n, base, max = 2, 16, 255
610 n, base, max = 4, 16, unicode.MaxRune
613 n, base, max = 8, 16, unicode.MaxRune
615 msg := "unknown escape sequence"
617 msg = "escape sequence not terminated"
625 d := uint32(digitVal(s.ch))
627 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
629 msg = "escape sequence not terminated"
631 s.error(s.offset, msg)
639 if x > max || 0xD800 <= x && x < 0xE000 {
640 s.error(offs, "escape sequence is invalid Unicode code point")
647 func (s *Scanner) scanRune() string {
648 // '\'' opening already consumed
655 if ch == '\n' || ch < 0 {
656 // only report error if we don't have one already
658 s.error(offs, "rune literal not terminated")
669 if !s.scanEscape('\'') {
672 // continue to read to closing quote
677 s.error(offs, "illegal rune literal")
680 return string(s.src[offs:s.offset])
683 func (s *Scanner) scanString() string {
684 // '"' opening already consumed
689 if ch == '\n' || ch < 0 {
690 s.error(offs, "string literal not terminated")
702 return string(s.src[offs:s.offset])
705 func stripCR(b []byte, comment bool) []byte {
706 c := make([]byte, len(b))
708 for j, ch := range b {
709 // In a /*-style comment, don't strip \r from *\r/ (incl.
710 // sequences of \r from *\r\r...\r/) since the resulting
711 // */ would terminate the comment too early unless the \r
712 // is immediately following the opening /* in which case
713 // it's ok because /*/ is not closed yet (issue #11151).
714 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
722 func (s *Scanner) scanRawString() string {
723 // '`' opening already consumed
730 s.error(offs, "raw string literal not terminated")
742 lit := s.src[offs:s.offset]
744 lit = stripCR(lit, false)
750 func (s *Scanner) skipWhitespace() {
751 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
756 // Helper functions for scanning multi-byte tokens such as >> += >>= .
757 // Different routines recognize different length tok_i based on matches
758 // of ch_i. If a token ends in '=', the result is tok1 or tok3
759 // respectively. Otherwise, the result is tok0 if there was no other
760 // matching character, or tok2 if the matching character was ch2.
762 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
770 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
782 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
798 // Scan scans the next token and returns the token position, the token,
799 // and its literal string if applicable. The source end is indicated by
802 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
803 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
804 // has the corresponding value.
806 // If the returned token is a keyword, the literal string is the keyword.
808 // If the returned token is token.SEMICOLON, the corresponding
809 // literal string is ";" if the semicolon was present in the source,
810 // and "\n" if the semicolon was inserted because of a newline or
813 // If the returned token is token.ILLEGAL, the literal string is the
814 // offending character.
816 // In all other cases, Scan returns an empty literal string.
818 // For more tolerant parsing, Scan will return a valid token if
819 // possible even if a syntax error was encountered. Thus, even
820 // if the resulting token sequence contains no illegal tokens,
821 // a client may not assume that no error occurred. Instead it
822 // must check the scanner's ErrorCount or the number of calls
823 // of the error handler, if there was one installed.
825 // Scan adds line information to the file that was passed to
826 // Init. Token positions are relative to that file
827 // and thus relative to the file set.
829 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
833 // current token start
834 pos = s.file.Pos(s.offset)
836 // determine token value
840 lit = s.scanIdentifier()
842 // keywords are longer than one letter - avoid lookup otherwise
843 tok = token.Lookup(lit)
845 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
852 case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
854 tok, lit = s.scanNumber()
856 s.next() // always make progress
860 s.insertSemi = false // EOF consumed
861 return pos, token.SEMICOLON, "\n"
865 // we only reach here if s.insertSemi was
866 // set in the first place and exited early
867 // from s.skipWhitespace()
868 s.insertSemi = false // newline consumed
869 return pos, token.SEMICOLON, "\n"
881 lit = s.scanRawString()
883 tok = s.switch2(token.COLON, token.DEFINE)
885 // fractions starting with a '.' are handled by outer switch
887 if s.ch == '.' && s.peek() == '.' {
889 s.next() // consume last '.'
895 tok = token.SEMICOLON
913 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
914 if tok == token.INC {
918 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
919 if tok == token.DEC {
923 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
925 if s.ch == '/' || s.ch == '*' {
927 if s.insertSemi && s.findLineEnd() {
928 // reset position to the beginning of the comment
930 s.offset = s.file.Offset(pos)
931 s.rdOffset = s.offset + 1
932 s.insertSemi = false // newline consumed
933 return pos, token.SEMICOLON, "\n"
935 comment := s.scanComment()
936 if s.mode&ScanComments == 0 {
938 s.insertSemi = false // newline consumed
944 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
947 tok = s.switch2(token.REM, token.REM_ASSIGN)
949 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
955 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
958 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
960 tok = s.switch2(token.ASSIGN, token.EQL)
962 tok = s.switch2(token.NOT, token.NEQ)
966 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
968 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
971 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
975 // next reports unexpected BOMs - don't repeat
977 s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
979 insertSemi = s.insertSemi // preserve insertSemi info
984 if s.mode&dontInsertSemis == 0 {
985 s.insertSemi = insertSemi