regexp: document and implement that invalid UTF-8 bytes are the same as U+FFFD

author Russ Cox <rsc@golang.org>

Thu, 7 Oct 2021 13:56:29 +0000 (09:56 -0400)

committer Russ Cox <rsc@golang.org>

Mon, 11 Oct 2021 15:28:50 +0000 (15:28 +0000)
author Russ Cox <rsc@golang.org>
Thu, 7 Oct 2021 13:56:29 +0000 (09:56 -0400)
committer Russ Cox <rsc@golang.org>
Mon, 11 Oct 2021 15:28:50 +0000 (15:28 +0000)
diff --git a/src/regexp/all_test.go b/src/regexp/all_test.go

index be7a2e7111876de6d6f7f6be2514556a587bfa87..c233cfa9eaac75fb1f33de6db2aa3b53887a9177 100644 (file)
--- a/src/regexp/all_test.go
+++ b/src/regexp/all_test.go
@@ -372,6 +372,9 @@ var literalPrefixTests = []MetaTest{
         {`^^0$$`, ``, ``, false},
         {`^$^$`, ``, ``, false},
         {`$$0^^`, ``, ``, false},
+       {`a\x{fffd}b`, ``, `a`, false},
+       {`\x{fffd}b`, ``, ``, false},
+       {"\ufffd", ``, ``, false},
  }
  
  func TestQuoteMeta(t *testing.T) {
diff --git a/src/regexp/find_test.go b/src/regexp/find_test.go

index 64c2239d905fdff2e26df72ba42b51cc1c85caca..2edbe9b86e61547d11c50040e8a39150378371ab 100644 (file)
--- a/src/regexp/find_test.go
+++ b/src/regexp/find_test.go
@@ -116,6 +116,13 @@ var findTests = []FindTest{
         {"\\`", "`", build(1, 0, 1)},
         {"[\\`]+", "`", build(1, 0, 1)},
  
+       {"\ufffd", "\xff", build(1, 0, 1)},
+       {"\ufffd", "hello\xffworld", build(1, 5, 6)},
+       {`.*`, "hello\xffworld", build(1, 0, 11)},
+       {`\x{fffd}`, "\xc2\x00", build(1, 0, 1)},
+       {"[\ufffd]", "\xff", build(1, 0, 1)},
+       {`[\x{fffd}]`, "\xc2\x00", build(1, 0, 1)},
+
         // long set of matches (longer than startSize)
         {
                 ".",
diff --git a/src/regexp/onepass.go b/src/regexp/onepass.go

index 2f3ce6f9f6cb877f9e4eda1d7c3d427ca9b1f741..bc47f4c4a830da065ca5053227a97b0a53588ad0 100644 (file)
--- a/src/regexp/onepass.go
+++ b/src/regexp/onepass.go
@@ -9,6 +9,7 @@ import (
         "sort"
         "strings"
         "unicode"
+       "unicode/utf8"
  )
  
  // "One-pass" regexp execution.
@@ -55,7 +56,7 @@ func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
  
         // Have prefix; gather characters.
         var buf strings.Builder
-       for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
+       for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
                 buf.WriteRune(i.Rune[0])
                 pc, i = i.Out, &p.Inst[i.Out]
         }
diff --git a/src/regexp/regexp.go b/src/regexp/regexp.go

index bfcf7910cf38041ed50c73ca79492a752c0801a5..af7259c9bfd2cbd0e2d9f3b1f587db97de612f35 100644 (file)
--- a/src/regexp/regexp.go
+++ b/src/regexp/regexp.go
@@ -20,6 +20,8 @@
  // or any book about automata theory.
  //
  // All characters are UTF-8-encoded code points.
+// Following utf8.DecodeRune, each byte of an invalid UTF-8 sequence
+// is treated as if it encoded utf8.RuneError (U+FFFD).
  //
  // There are 16 methods of Regexp that match a regular expression and identify
  // the matched text. Their names are matched by this regular expression:
@@ -276,7 +278,11 @@ func minInputLen(re *syntax.Regexp) int {
         case syntax.OpLiteral:
                 l := 0
                 for _, r := range re.Rune {
-                       l += utf8.RuneLen(r)
+                       if r == utf8.RuneError {
+                               l++
+                       } else {
+                               l += utf8.RuneLen(r)
+                       }
                 }
                 return l
         case syntax.OpCapture, syntax.OpPlus:
diff --git a/src/regexp/syntax/prog.go b/src/regexp/syntax/prog.go

index ae7a9a2fe0118a11f02451d32586193a10a08bce..8583f55e5421d88789cb7e036695d5244ec7a87b 100644 (file)
--- a/src/regexp/syntax/prog.go
+++ b/src/regexp/syntax/prog.go
@@ -8,6 +8,7 @@ import (
         "strconv"
         "strings"
         "unicode"
+       "unicode/utf8"
  )
  
  // Compiled program.
@@ -154,7 +155,7 @@ func (p *Prog) Prefix() (prefix string, complete bool) {
  
         // Have prefix; gather characters.
         var buf strings.Builder
-       for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
+       for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
                 buf.WriteRune(i.Rune[0])
                 i = p.skipNop(i.Out)
         }
author	Russ Cox <rsc@golang.org>
	Thu, 7 Oct 2021 13:56:29 +0000 (09:56 -0400)
committer	Russ Cox <rsc@golang.org>
	Mon, 11 Oct 2021 15:28:50 +0000 (15:28 +0000)
src/regexp/all_test.go		patch | blob | history
src/regexp/find_test.go		patch | blob | history
src/regexp/onepass.go		patch | blob | history
src/regexp/regexp.go		patch | blob | history
src/regexp/syntax/prog.go		patch | blob | history