src/go/scanner/scanner.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package scanner implements a scanner for Go source text.
   6 // It takes a []byte as source which can then be tokenized
   7 // through repeated calls to the Scan method.
   8 //
   9 package scanner
  10
  11 import (
  12         "bytes"
  13         "fmt"
  14         "go/token"
  15         "path/filepath"
  16         "strconv"
  17         "unicode"
  18         "unicode/utf8"
  19 )
  20
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// Every reported error also increments Scanner.ErrorCount, whether or not
// a handler is installed.
type ErrorHandler func(pos token.Position, msg string)
  27
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; eof (< 0) at end of input
	offset     int  // character offset (byte offset of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset (byte offset at which the current line starts)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered; reset to 0 by Init
}
  50
// Special character values used by the scanner.
const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file; stored in Scanner.ch once the source is exhausted
)
  55
  56 // Read the next Unicode char into s.ch.
  57 // s.ch < 0 means end-of-file.
  58 //
  59 // For optimization, there is some overlap between this method and
  60 // s.scanIdentifier.
  61 func (s *Scanner) next() {
  62         if s.rdOffset < len(s.src) {
  63                 s.offset = s.rdOffset
  64                 if s.ch == '\n' {
  65                         s.lineOffset = s.offset
  66                         s.file.AddLine(s.offset)
  67                 }
  68                 r, w := rune(s.src[s.rdOffset]), 1
  69                 switch {
  70                 case r == 0:
  71                         s.error(s.offset, "illegal character NUL")
  72                 case r >= utf8.RuneSelf:
  73                         // not ASCII
  74                         r, w = utf8.DecodeRune(s.src[s.rdOffset:])
  75                         if r == utf8.RuneError && w == 1 {
  76                                 s.error(s.offset, "illegal UTF-8 encoding")
  77                         } else if r == bom && s.offset > 0 {
  78                                 s.error(s.offset, "illegal byte order mark")
  79                         }
  80                 }
  81                 s.rdOffset += w
  82                 s.ch = r
  83         } else {
  84                 s.offset = len(s.src)
  85                 if s.ch == '\n' {
  86                         s.lineOffset = s.offset
  87                         s.file.AddLine(s.offset)
  88                 }
  89                 s.ch = eof
  90         }
  91 }
  92
  93 // peek returns the byte following the most recently read character without
  94 // advancing the scanner. If the scanner is at EOF, peek returns 0.
  95 func (s *Scanner) peek() byte {
  96         if s.rdOffset < len(s.src) {
  97                 return s.src[s.rdOffset]
  98         }
  99         return 0
 100 }
 101
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint
 106
// Mode flags. Each flag occupies its own bit, so flags can be combined
// with the | operator.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
 111
 112 // Init prepares the scanner s to tokenize the text src by setting the
 113 // scanner at the beginning of src. The scanner uses the file set file
 114 // for position information and it adds line information for each line.
 115 // It is ok to re-use the same file when re-scanning the same file as
 116 // line information which is already present is ignored. Init causes a
 117 // panic if the file size does not match the src size.
 118 //
 119 // Calls to Scan will invoke the error handler err if they encounter a
 120 // syntax error and err is not nil. Also, for each error encountered,
 121 // the Scanner field ErrorCount is incremented by one. The mode parameter
 122 // determines how comments are handled.
 123 //
 124 // Note that Init may call err if there is an error in the first character
 125 // of the file.
 126 //
 127 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
 128         // Explicitly initialize all fields since a scanner may be reused.
 129         if file.Size() != len(src) {
 130                 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
 131         }
 132         s.file = file
 133         s.dir, _ = filepath.Split(file.Name())
 134         s.src = src
 135         s.err = err
 136         s.mode = mode
 137
 138         s.ch = ' '
 139         s.offset = 0
 140         s.rdOffset = 0
 141         s.lineOffset = 0
 142         s.insertSemi = false
 143         s.ErrorCount = 0
 144
 145         s.next()
 146         if s.ch == bom {
 147                 s.next() // ignore BOM at file beginning
 148         }
 149 }
 150
 151 func (s *Scanner) error(offs int, msg string) {
 152         if s.err != nil {
 153                 s.err(s.file.Position(s.file.Pos(offs)), msg)
 154         }
 155         s.ErrorCount++
 156 }
 157
 158 func (s *Scanner) errorf(offs int, format string, args ...interface{}) {
 159         s.error(offs, fmt.Sprintf(format, args...))
 160 }
 161
 162 func (s *Scanner) scanComment() string {
 163         // initial '/' already consumed; s.ch == '/' || s.ch == '*'
 164         offs := s.offset - 1 // position of initial '/'
 165         next := -1           // position immediately following the comment; < 0 means invalid comment
 166         numCR := 0
 167
 168         if s.ch == '/' {
 169                 //-style comment
 170                 // (the final '\n' is not considered part of the comment)
 171                 s.next()
 172                 for s.ch != '\n' && s.ch >= 0 {
 173                         if s.ch == '\r' {
 174                                 numCR++
 175                         }
 176                         s.next()
 177                 }
 178                 // if we are at '\n', the position following the comment is afterwards
 179                 next = s.offset
 180                 if s.ch == '\n' {
 181                         next++
 182                 }
 183                 goto exit
 184         }
 185
 186         /*-style comment */
 187         s.next()
 188         for s.ch >= 0 {
 189                 ch := s.ch
 190                 if ch == '\r' {
 191                         numCR++
 192                 }
 193                 s.next()
 194                 if ch == '*' && s.ch == '/' {
 195                         s.next()
 196                         next = s.offset
 197                         goto exit
 198                 }
 199         }
 200
 201         s.error(offs, "comment not terminated")
 202
 203 exit:
 204         lit := s.src[offs:s.offset]
 205
 206         // On Windows, a (//-comment) line may end in "\r\n".
 207         // Remove the final '\r' before analyzing the text for
 208         // line directives (matching the compiler). Remove any
 209         // other '\r' afterwards (matching the pre-existing be-
 210         // havior of the scanner).
 211         if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
 212                 lit = lit[:len(lit)-1]
 213                 numCR--
 214         }
 215
 216         // interpret line directives
 217         // (//line directives must start at the beginning of the current line)
 218         if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
 219                 s.updateLineInfo(next, offs, lit)
 220         }
 221
 222         if numCR > 0 {
 223                 lit = stripCR(lit, lit[1] == '*')
 224         }
 225
 226         return string(lit)
 227 }
 228
// prefix is the text that must immediately follow the opening "//" or "/*"
// of a comment for scanComment to treat it as a potential line directive.
var prefix = []byte("line ")
 230
 231 // updateLineInfo parses the incoming comment text at offset offs
 232 // as a line directive. If successful, it updates the line info table
 233 // for the position next per the line directive.
 234 func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
 235         // extract comment text
 236         if text[1] == '*' {
 237                 text = text[:len(text)-2] // lop off trailing "*/"
 238         }
 239         text = text[7:] // lop off leading "//line " or "/*line "
 240         offs += 7
 241
 242         i, n, ok := trailingDigits(text)
 243         if i == 0 {
 244                 return // ignore (not a line directive)
 245         }
 246         // i > 0
 247
 248         if !ok {
 249                 // text has a suffix :xxx but xxx is not a number
 250                 s.error(offs+i, "invalid line number: "+string(text[i:]))
 251                 return
 252         }
 253
 254         var line, col int
 255         i2, n2, ok2 := trailingDigits(text[:i-1])
 256         if ok2 {
 257                 //line filename:line:col
 258                 i, i2 = i2, i
 259                 line, col = n2, n
 260                 if col == 0 {
 261                         s.error(offs+i2, "invalid column number: "+string(text[i2:]))
 262                         return
 263                 }
 264                 text = text[:i2-1] // lop off ":col"
 265         } else {
 266                 //line filename:line
 267                 line = n
 268         }
 269
 270         if line == 0 {
 271                 s.error(offs+i, "invalid line number: "+string(text[i:]))
 272                 return
 273         }
 274
 275         // If we have a column (//line filename:line:col form),
 276         // an empty filename means to use the previous filename.
 277         filename := string(text[:i-1]) // lop off ":line", and trim white space
 278         if filename == "" && ok2 {
 279                 filename = s.file.Position(s.file.Pos(offs)).Filename
 280         } else if filename != "" {
 281                 // Put a relative filename in the current directory.
 282                 // This is for compatibility with earlier releases.
 283                 // See issue 26671.
 284                 filename = filepath.Clean(filename)
 285                 if !filepath.IsAbs(filename) {
 286                         filename = filepath.Join(s.dir, filename)
 287                 }
 288         }
 289
 290         s.file.AddLineColumnInfo(next, filename, line, col)
 291 }
 292
 293 func trailingDigits(text []byte) (int, int, bool) {
 294         i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
 295         if i < 0 {
 296                 return 0, 0, false // no ":"
 297         }
 298         // i >= 0
 299         n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
 300         return i + 1, int(n), err == nil
 301 }
 302
 303 func (s *Scanner) findLineEnd() bool {
 304         // initial '/' already consumed
 305
 306         defer func(offs int) {
 307                 // reset scanner state to where it was upon calling findLineEnd
 308                 s.ch = '/'
 309                 s.offset = offs
 310                 s.rdOffset = offs + 1
 311                 s.next() // consume initial '/' again
 312         }(s.offset - 1)
 313
 314         // read ahead until a newline, EOF, or non-comment token is found
 315         for s.ch == '/' || s.ch == '*' {
 316                 if s.ch == '/' {
 317                         //-style comment always contains a newline
 318                         return true
 319                 }
 320                 /*-style comment: look for newline */
 321                 s.next()
 322                 for s.ch >= 0 {
 323                         ch := s.ch
 324                         if ch == '\n' {
 325                                 return true
 326                         }
 327                         s.next()
 328                         if ch == '*' && s.ch == '/' {
 329                                 s.next()
 330                                 break
 331                         }
 332                 }
 333                 s.skipWhitespace() // s.insertSemi is set
 334                 if s.ch < 0 || s.ch == '\n' {
 335                         return true
 336                 }
 337                 if s.ch != '/' {
 338                         // non-comment token
 339                         return false
 340                 }
 341                 s.next() // consume '/'
 342         }
 343
 344         return false
 345 }
 346
 347 func isLetter(ch rune) bool {
 348         return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
 349 }
 350
 351 func isDigit(ch rune) bool {
 352         return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
 353 }
 354
 355 // scanIdentifier reads the string of valid identifier characters at s.offset.
 356 // It must only be called when s.ch is known to be a valid letter.
 357 //
 358 // Be careful when making changes to this function: it is optimized and affects
 359 // scanning performance significantly.
 360 func (s *Scanner) scanIdentifier() string {
 361         offs := s.offset
 362
 363         // Optimize for the common case of an ASCII identifier.
 364         //
 365         // Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
 366         // avoids conversions to runes.
 367         //
 368         // In case we encounter a non-ASCII character, fall back on the slower path
 369         // of calling into s.next().
 370         for rdOffset, b := range s.src[s.rdOffset:] {
 371                 if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
 372                         // Avoid assigning a rune for the common case of an ascii character.
 373                         continue
 374                 }
 375                 s.rdOffset += rdOffset
 376                 if 0 < b && b < utf8.RuneSelf {
 377                         // Optimization: we've encountered an ASCII character that's not a letter
 378                         // or number. Avoid the call into s.next() and corresponding set up.
 379                         //
 380                         // Note that s.next() does some line accounting if s.ch is '\n', so this
 381                         // shortcut is only possible because we know that the preceding character
 382                         // is not '\n'.
 383                         s.ch = rune(b)
 384                         s.offset = s.rdOffset
 385                         s.rdOffset++
 386                         goto exit
 387                 }
 388                 // We know that the preceding character is valid for an identifier because
 389                 // scanIdentifier is only called when s.ch is a letter, so calling s.next()
 390                 // at s.rdOffset resets the scanner state.
 391                 s.next()
 392                 for isLetter(s.ch) || isDigit(s.ch) {
 393                         s.next()
 394                 }
 395                 goto exit
 396         }
 397         s.offset = len(s.src)
 398         s.rdOffset = len(s.src)
 399         s.ch = eof
 400
 401 exit:
 402         return string(s.src[offs:s.offset])
 403 }
 404
 405 func digitVal(ch rune) int {
 406         switch {
 407         case '0' <= ch && ch <= '9':
 408                 return int(ch - '0')
 409         case 'a' <= lower(ch) && lower(ch) <= 'f':
 410                 return int(lower(ch) - 'a' + 10)
 411         }
 412         return 16 // larger than any legal digit val
 413 }
 414
 415 func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
 416 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
 417 func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
 418
 419 // digits accepts the sequence { digit | '_' }.
 420 // If base <= 10, digits accepts any decimal digit but records
 421 // the offset (relative to the source start) of a digit >= base
 422 // in *invalid, if *invalid < 0.
 423 // digits returns a bitset describing whether the sequence contained
 424 // digits (bit 0 is set), or separators '_' (bit 1 is set).
 425 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
 426         if base <= 10 {
 427                 max := rune('0' + base)
 428                 for isDecimal(s.ch) || s.ch == '_' {
 429                         ds := 1
 430                         if s.ch == '_' {
 431                                 ds = 2
 432                         } else if s.ch >= max && *invalid < 0 {
 433                                 *invalid = s.offset // record invalid rune offset
 434                         }
 435                         digsep |= ds
 436                         s.next()
 437                 }
 438         } else {
 439                 for isHex(s.ch) || s.ch == '_' {
 440                         ds := 1
 441                         if s.ch == '_' {
 442                                 ds = 2
 443                         }
 444                         digsep |= ds
 445                         s.next()
 446                 }
 447         }
 448         return
 449 }
 450
 451 func (s *Scanner) scanNumber() (token.Token, string) {
 452         offs := s.offset
 453         tok := token.ILLEGAL
 454
 455         base := 10        // number base
 456         prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
 457         digsep := 0       // bit 0: digit present, bit 1: '_' present
 458         invalid := -1     // index of invalid digit in literal, or < 0
 459
 460         // integer part
 461         if s.ch != '.' {
 462                 tok = token.INT
 463                 if s.ch == '0' {
 464                         s.next()
 465                         switch lower(s.ch) {
 466                         case 'x':
 467                                 s.next()
 468                                 base, prefix = 16, 'x'
 469                         case 'o':
 470                                 s.next()
 471                                 base, prefix = 8, 'o'
 472                         case 'b':
 473                                 s.next()
 474                                 base, prefix = 2, 'b'
 475                         default:
 476                                 base, prefix = 8, '0'
 477                                 digsep = 1 // leading 0
 478                         }
 479                 }
 480                 digsep |= s.digits(base, &invalid)
 481         }
 482
 483         // fractional part
 484         if s.ch == '.' {
 485                 tok = token.FLOAT
 486                 if prefix == 'o' || prefix == 'b' {
 487                         s.error(s.offset, "invalid radix point in "+litname(prefix))
 488                 }
 489                 s.next()
 490                 digsep |= s.digits(base, &invalid)
 491         }
 492
 493         if digsep&1 == 0 {
 494                 s.error(s.offset, litname(prefix)+" has no digits")
 495         }
 496
 497         // exponent
 498         if e := lower(s.ch); e == 'e' || e == 'p' {
 499                 switch {
 500                 case e == 'e' && prefix != 0 && prefix != '0':
 501                         s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
 502                 case e == 'p' && prefix != 'x':
 503                         s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
 504                 }
 505                 s.next()
 506                 tok = token.FLOAT
 507                 if s.ch == '+' || s.ch == '-' {
 508                         s.next()
 509                 }
 510                 ds := s.digits(10, nil)
 511                 digsep |= ds
 512                 if ds&1 == 0 {
 513                         s.error(s.offset, "exponent has no digits")
 514                 }
 515         } else if prefix == 'x' && tok == token.FLOAT {
 516                 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
 517         }
 518
 519         // suffix 'i'
 520         if s.ch == 'i' {
 521                 tok = token.IMAG
 522                 s.next()
 523         }
 524
 525         lit := string(s.src[offs:s.offset])
 526         if tok == token.INT && invalid >= 0 {
 527                 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
 528         }
 529         if digsep&2 != 0 {
 530                 if i := invalidSep(lit); i >= 0 {
 531                         s.error(offs+i, "'_' must separate successive digits")
 532                 }
 533         }
 534
 535         return tok, lit
 536 }
 537
 538 func litname(prefix rune) string {
 539         switch prefix {
 540         case 'x':
 541                 return "hexadecimal literal"
 542         case 'o', '0':
 543                 return "octal literal"
 544         case 'b':
 545                 return "binary literal"
 546         }
 547         return "decimal literal"
 548 }
 549
 550 // invalidSep returns the index of the first invalid separator in x, or -1.
 551 func invalidSep(x string) int {
 552         x1 := ' ' // prefix char, we only care if it's 'x'
 553         d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
 554         i := 0
 555
 556         // a prefix counts as a digit
 557         if len(x) >= 2 && x[0] == '0' {
 558                 x1 = lower(rune(x[1]))
 559                 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
 560                         d = '0'
 561                         i = 2
 562                 }
 563         }
 564
 565         // mantissa and exponent
 566         for ; i < len(x); i++ {
 567                 p := d // previous digit
 568                 d = rune(x[i])
 569                 switch {
 570                 case d == '_':
 571                         if p != '0' {
 572                                 return i
 573                         }
 574                 case isDecimal(d) || x1 == 'x' && isHex(d):
 575                         d = '0'
 576                 default:
 577                         if p == '_' {
 578                                 return i - 1
 579                         }
 580                         d = '.'
 581                 }
 582         }
 583         if d == '_' {
 584                 return len(x) - 1
 585         }
 586
 587         return -1
 588 }
 589
 590 // scanEscape parses an escape sequence where rune is the accepted
 591 // escaped quote. In case of a syntax error, it stops at the offending
 592 // character (without consuming it) and returns false. Otherwise
 593 // it returns true.
 594 func (s *Scanner) scanEscape(quote rune) bool {
 595         offs := s.offset
 596
 597         var n int
 598         var base, max uint32
 599         switch s.ch {
 600         case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
 601                 s.next()
 602                 return true
 603         case '0', '1', '2', '3', '4', '5', '6', '7':
 604                 n, base, max = 3, 8, 255
 605         case 'x':
 606                 s.next()
 607                 n, base, max = 2, 16, 255
 608         case 'u':
 609                 s.next()
 610                 n, base, max = 4, 16, unicode.MaxRune
 611         case 'U':
 612                 s.next()
 613                 n, base, max = 8, 16, unicode.MaxRune
 614         default:
 615                 msg := "unknown escape sequence"
 616                 if s.ch < 0 {
 617                         msg = "escape sequence not terminated"
 618                 }
 619                 s.error(offs, msg)
 620                 return false
 621         }
 622
 623         var x uint32
 624         for n > 0 {
 625                 d := uint32(digitVal(s.ch))
 626                 if d >= base {
 627                         msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
 628                         if s.ch < 0 {
 629                                 msg = "escape sequence not terminated"
 630                         }
 631                         s.error(s.offset, msg)
 632                         return false
 633                 }
 634                 x = x*base + d
 635                 s.next()
 636                 n--
 637         }
 638
 639         if x > max || 0xD800 <= x && x < 0xE000 {
 640                 s.error(offs, "escape sequence is invalid Unicode code point")
 641                 return false
 642         }
 643
 644         return true
 645 }
 646
 647 func (s *Scanner) scanRune() string {
 648         // '\'' opening already consumed
 649         offs := s.offset - 1
 650
 651         valid := true
 652         n := 0
 653         for {
 654                 ch := s.ch
 655                 if ch == '\n' || ch < 0 {
 656                         // only report error if we don't have one already
 657                         if valid {
 658                                 s.error(offs, "rune literal not terminated")
 659                                 valid = false
 660                         }
 661                         break
 662                 }
 663                 s.next()
 664                 if ch == '\'' {
 665                         break
 666                 }
 667                 n++
 668                 if ch == '\\' {
 669                         if !s.scanEscape('\'') {
 670                                 valid = false
 671                         }
 672                         // continue to read to closing quote
 673                 }
 674         }
 675
 676         if valid && n != 1 {
 677                 s.error(offs, "illegal rune literal")
 678         }
 679
 680         return string(s.src[offs:s.offset])
 681 }
 682
 683 func (s *Scanner) scanString() string {
 684         // '"' opening already consumed
 685         offs := s.offset - 1
 686
 687         for {
 688                 ch := s.ch
 689                 if ch == '\n' || ch < 0 {
 690                         s.error(offs, "string literal not terminated")
 691                         break
 692                 }
 693                 s.next()
 694                 if ch == '"' {
 695                         break
 696                 }
 697                 if ch == '\\' {
 698                         s.scanEscape('"')
 699                 }
 700         }
 701
 702         return string(s.src[offs:s.offset])
 703 }
 704
 705 func stripCR(b []byte, comment bool) []byte {
 706         c := make([]byte, len(b))
 707         i := 0
 708         for j, ch := range b {
 709                 // In a /*-style comment, don't strip \r from *\r/ (incl.
 710                 // sequences of \r from *\r\r...\r/) since the resulting
 711                 // */ would terminate the comment too early unless the \r
 712                 // is immediately following the opening /* in which case
 713                 // it's ok because /*/ is not closed yet (issue #11151).
 714                 if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
 715                         c[i] = ch
 716                         i++
 717                 }
 718         }
 719         return c[:i]
 720 }
 721
 722 func (s *Scanner) scanRawString() string {
 723         // '`' opening already consumed
 724         offs := s.offset - 1
 725
 726         hasCR := false
 727         for {
 728                 ch := s.ch
 729                 if ch < 0 {
 730                         s.error(offs, "raw string literal not terminated")
 731                         break
 732                 }
 733                 s.next()
 734                 if ch == '`' {
 735                         break
 736                 }
 737                 if ch == '\r' {
 738                         hasCR = true
 739                 }
 740         }
 741
 742         lit := s.src[offs:s.offset]
 743         if hasCR {
 744                 lit = stripCR(lit, false)
 745         }
 746
 747         return string(lit)
 748 }
 749
 750 func (s *Scanner) skipWhitespace() {
 751         for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
 752                 s.next()
 753         }
 754 }
 755
 756 // Helper functions for scanning multi-byte tokens such as >> += >>= .
 757 // Different routines recognize different length tok_i based on matches
 758 // of ch_i. If a token ends in '=', the result is tok1 or tok3
 759 // respectively. Otherwise, the result is tok0 if there was no other
 760 // matching character, or tok2 if the matching character was ch2.
 761
 762 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
 763         if s.ch == '=' {
 764                 s.next()
 765                 return tok1
 766         }
 767         return tok0
 768 }
 769
 770 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
 771         if s.ch == '=' {
 772                 s.next()
 773                 return tok1
 774         }
 775         if s.ch == ch2 {
 776                 s.next()
 777                 return tok2
 778         }
 779         return tok0
 780 }
 781
 782 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
 783         if s.ch == '=' {
 784                 s.next()
 785                 return tok1
 786         }
 787         if s.ch == ch2 {
 788                 s.next()
 789                 if s.ch == '=' {
 790                         s.next()
 791                         return tok3
 792                 }
 793                 return tok2
 794         }
 795         return tok0
 796 }
 797
 798 // Scan scans the next token and returns the token position, the token,
 799 // and its literal string if applicable. The source end is indicated by
 800 // token.EOF.
 801 //
 802 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
 803 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
 804 // has the corresponding value.
 805 //
 806 // If the returned token is a keyword, the literal string is the keyword.
 807 //
 808 // If the returned token is token.SEMICOLON, the corresponding
 809 // literal string is ";" if the semicolon was present in the source,
 810 // and "\n" if the semicolon was inserted because of a newline or
 811 // at EOF.
 812 //
 813 // If the returned token is token.ILLEGAL, the literal string is the
 814 // offending character.
 815 //
 816 // In all other cases, Scan returns an empty literal string.
 817 //
 818 // For more tolerant parsing, Scan will return a valid token if
 819 // possible even if a syntax error was encountered. Thus, even
 820 // if the resulting token sequence contains no illegal tokens,
 821 // a client may not assume that no error occurred. Instead it
 822 // must check the scanner's ErrorCount or the number of calls
 823 // of the error handler, if there was one installed.
 824 //
 825 // Scan adds line information to the file added to the file
 826 // set with Init. Token positions are relative to that file
 827 // and thus relative to the file set.
 828 //
 829 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
 830 scanAgain:
 831         s.skipWhitespace()
 832
 833         // current token start
 834         pos = s.file.Pos(s.offset)
 835
 836         // determine token value
 837         insertSemi := false
 838         switch ch := s.ch; {
 839         case isLetter(ch):
 840                 lit = s.scanIdentifier()
 841                 if len(lit) > 1 {
 842                         // keywords are longer than one letter - avoid lookup otherwise
 843                         tok = token.Lookup(lit)
 844                         switch tok {
 845                         case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
 846                                 insertSemi = true
 847                         }
 848                 } else {
 849                         insertSemi = true
 850                         tok = token.IDENT
 851                 }
 852         case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
 853                 insertSemi = true
 854                 tok, lit = s.scanNumber()
 855         default:
 856                 s.next() // always make progress
 857                 switch ch {
 858                 case -1:
 859                         if s.insertSemi {
 860                                 s.insertSemi = false // EOF consumed
 861                                 return pos, token.SEMICOLON, "\n"
 862                         }
 863                         tok = token.EOF
 864                 case '\n':
 865                         // we only reach here if s.insertSemi was
 866                         // set in the first place and exited early
 867                         // from s.skipWhitespace()
 868                         s.insertSemi = false // newline consumed
 869                         return pos, token.SEMICOLON, "\n"
 870                 case '"':
 871                         insertSemi = true
 872                         tok = token.STRING
 873                         lit = s.scanString()
 874                 case '\'':
 875                         insertSemi = true
 876                         tok = token.CHAR
 877                         lit = s.scanRune()
 878                 case '`':
 879                         insertSemi = true
 880                         tok = token.STRING
 881                         lit = s.scanRawString()
 882                 case ':':
 883                         tok = s.switch2(token.COLON, token.DEFINE)
 884                 case '.':
 885                         // fractions starting with a '.' are handled by outer switch
 886                         tok = token.PERIOD
 887                         if s.ch == '.' && s.peek() == '.' {
 888                                 s.next()
 889                                 s.next() // consume last '.'
 890                                 tok = token.ELLIPSIS
 891                         }
 892                 case ',':
 893                         tok = token.COMMA
 894                 case ';':
 895                         tok = token.SEMICOLON
 896                         lit = ";"
 897                 case '(':
 898                         tok = token.LPAREN
 899                 case ')':
 900                         insertSemi = true
 901                         tok = token.RPAREN
 902                 case '[':
 903                         tok = token.LBRACK
 904                 case ']':
 905                         insertSemi = true
 906                         tok = token.RBRACK
 907                 case '{':
 908                         tok = token.LBRACE
 909                 case '}':
 910                         insertSemi = true
 911                         tok = token.RBRACE
 912                 case '+':
 913                         tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
 914                         if tok == token.INC {
 915                                 insertSemi = true
 916                         }
 917                 case '-':
 918                         tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
 919                         if tok == token.DEC {
 920                                 insertSemi = true
 921                         }
 922                 case '*':
 923                         tok = s.switch2(token.MUL, token.MUL_ASSIGN)
 924                 case '/':
 925                         if s.ch == '/' || s.ch == '*' {
 926                                 // comment
 927                                 if s.insertSemi && s.findLineEnd() {
 928                                         // reset position to the beginning of the comment
 929                                         s.ch = '/'
 930                                         s.offset = s.file.Offset(pos)
 931                                         s.rdOffset = s.offset + 1
 932                                         s.insertSemi = false // newline consumed
 933                                         return pos, token.SEMICOLON, "\n"
 934                                 }
 935                                 comment := s.scanComment()
 936                                 if s.mode&ScanComments == 0 {
 937                                         // skip comment
 938                                         s.insertSemi = false // newline consumed
 939                                         goto scanAgain
 940                                 }
 941                                 tok = token.COMMENT
 942                                 lit = comment
 943                         } else {
 944                                 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
 945                         }
 946                 case '%':
 947                         tok = s.switch2(token.REM, token.REM_ASSIGN)
 948                 case '^':
 949                         tok = s.switch2(token.XOR, token.XOR_ASSIGN)
 950                 case '<':
 951                         if s.ch == '-' {
 952                                 s.next()
 953                                 tok = token.ARROW
 954                         } else {
 955                                 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
 956                         }
 957                 case '>':
 958                         tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
 959                 case '=':
 960                         tok = s.switch2(token.ASSIGN, token.EQL)
 961                 case '!':
 962                         tok = s.switch2(token.NOT, token.NEQ)
 963                 case '&':
 964                         if s.ch == '^' {
 965                                 s.next()
 966                                 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
 967                         } else {
 968                                 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
 969                         }
 970                 case '|':
 971                         tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
 972                 case '~':
 973                         tok = token.TILDE
 974                 default:
 975                         // next reports unexpected BOMs - don't repeat
 976                         if ch != bom {
 977                                 s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
 978                         }
 979                         insertSemi = s.insertSemi // preserve insertSemi info
 980                         tok = token.ILLEGAL
 981                         lit = string(ch)
 982                 }
 983         }
 984         if s.mode&dontInsertSemis == 0 {
 985                 s.insertSemi = insertSemi
 986         }
 987
 988         return
 989 }