]> Cypherpunks.ru repositories - gostls13.git/blob - src/syscall/wtf8_windows.go
cmd/compile/internal/inline: score call sites exposed by inlines
[gostls13.git] / src / syscall / wtf8_windows.go
1 // Copyright 2023 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Windows UTF-16 strings can contain unpaired surrogates, which can't be
6 // decoded into a valid UTF-8 string. This file defines a set of functions
7 // that can be used to encode and decode potentially ill-formed UTF-16 strings
8 // by using [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
9 //
10 // WTF-8 is a strict superset of UTF-8, i.e. any string that is
11 // well-formed in UTF-8 is also well-formed in WTF-8 and the content
12 // is unchanged. Also, the conversion never fails and is lossless.
13 //
14 // The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string
15 // is that the conversion is lossless even for ill-formed UTF-16 strings.
16 // This property makes it possible to read an ill-formed UTF-16 string, convert it
17 // to a Go string, and convert it back to the same original UTF-16 string.
18 //
19 // See go.dev/issues/59971 for more info.
20
21 package syscall
22
23 import (
24         "unicode/utf16"
25         "unicode/utf8"
26 )
27
// Code point boundaries and UTF-8 bit patterns used by the WTF-8
// conversion routines in this file.
const (
	// Surrogate boundaries. decodeWTF16 treats a code unit in
	// [surr1, surr2) followed by one in [surr2, surr3) as a valid
	// surrogate pair; anything below surr1 or at/above surr3 is an
	// ordinary rune.
	surr1 = 0xD800
	surr2 = 0xDC00
	surr3 = 0xE000

	// Marker bits and payload masks for hand-assembling and
	// hand-parsing 3-byte UTF-8 sequences.
	tx    = 0x80 // 1000 0000: continuation byte marker
	t3    = 0xE0 // 1110 0000: lead byte marker of a 3-byte sequence
	maskx = 0x3F // 0011 1111: payload bits of a continuation byte
	mask3 = 0x0F // 0000 1111: payload bits of a 3-byte lead byte

	// Largest 7-bit and 11-bit values. Presumably the upper bounds of
	// 1-byte and 2-byte UTF-8 encodings; not referenced by the
	// functions visible in this file.
	rune1Max = 0x7F
	rune2Max = 0x7FF
)
41
42 // encodeWTF16 returns the potentially ill-formed
43 // UTF-16 encoding of s.
44 func encodeWTF16(s string, buf []uint16) []uint16 {
45         for i := 0; i < len(s); {
46                 // Cannot use 'for range s' because it expects valid
47                 // UTF-8 runes.
48                 r, size := utf8.DecodeRuneInString(s[i:])
49                 if r == utf8.RuneError {
50                         // Check if s[i:] contains a valid WTF-8 encoded surrogate.
51                         if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF {
52                                 r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx)
53                                 buf = append(buf, uint16(r))
54                                 i += 3
55                                 continue
56                         }
57                 }
58                 i += size
59                 buf = utf16.AppendRune(buf, r)
60         }
61         return buf
62 }
63
64 // decodeWTF16 returns the WTF-8 encoding of
65 // the potentially ill-formed UTF-16 s.
66 func decodeWTF16(s []uint16, buf []byte) []byte {
67         for i := 0; i < len(s); i++ {
68                 var ar rune
69                 switch r := s[i]; {
70                 case r < surr1, surr3 <= r:
71                         // normal rune
72                         ar = rune(r)
73                 case surr1 <= r && r < surr2 && i+1 < len(s) &&
74                         surr2 <= s[i+1] && s[i+1] < surr3:
75                         // valid surrogate sequence
76                         ar = utf16.DecodeRune(rune(r), rune(s[i+1]))
77                         i++
78                 default:
79                         // WTF-8 fallback.
80                         // This only handles the 3-byte case of utf8.AppendRune,
81                         // as surrogates always fall in that case.
82                         ar = rune(r)
83                         if ar > utf8.MaxRune {
84                                 ar = utf8.RuneError
85                         }
86                         buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx)
87                         continue
88                 }
89                 buf = utf8.AppendRune(buf, ar)
90         }
91         return buf
92 }