src/mime/encodedword.go

   1 // Copyright 2015 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package mime
   6
   7 import (
   8         "bytes"
   9         "encoding/base64"
  10         "errors"
  11         "fmt"
  12         "io"
  13         "strings"
  14         "unicode"
  15         "unicode/utf8"
  16 )
  17
  18 // A WordEncoder is an RFC 2047 encoded-word encoder.
  19 type WordEncoder byte
  20
  21 const (
  22         // BEncoding represents Base64 encoding scheme as defined by RFC 2045.
  23         BEncoding = WordEncoder('b')
  24         // QEncoding represents the Q-encoding scheme as defined by RFC 2047.
  25         QEncoding = WordEncoder('q')
  26 )
  27
  28 var (
  29         errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
  30 )
  31
  32 // Encode returns the encoded-word form of s. If s is ASCII without special
  33 // characters, it is returned unchanged. The provided charset is the IANA
  34 // charset name of s. It is case insensitive.
  35 func (e WordEncoder) Encode(charset, s string) string {
  36         if !needsEncoding(s) {
  37                 return s
  38         }
  39         return e.encodeWord(charset, s)
  40 }
  41
  42 func needsEncoding(s string) bool {
  43         for _, b := range s {
  44                 if (b < ' ' || b > '~') && b != '\t' {
  45                         return true
  46                 }
  47         }
  48         return false
  49 }
  50
  51 // encodeWord encodes a string into an encoded-word.
  52 func (e WordEncoder) encodeWord(charset, s string) string {
  53         var buf strings.Builder
  54         // Could use a hint like len(s)*3, but that's not enough for cases
  55         // with word splits and too much for simpler inputs.
  56         // 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
  57         buf.Grow(48)
  58
  59         e.openWord(&buf, charset)
  60         if e == BEncoding {
  61                 e.bEncode(&buf, charset, s)
  62         } else {
  63                 e.qEncode(&buf, charset, s)
  64         }
  65         closeWord(&buf)
  66
  67         return buf.String()
  68 }
  69
  70 const (
  71         // The maximum length of an encoded-word is 75 characters.
  72         // See RFC 2047, section 2.
  73         maxEncodedWordLen = 75
  74         // maxContentLen is how much content can be encoded, ignoring the header and
  75         // 2-byte footer.
  76         maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
  77 )
  78
  79 var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
  80
  81 // bEncode encodes s using base64 encoding and writes it to buf.
  82 func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
  83         w := base64.NewEncoder(base64.StdEncoding, buf)
  84         // If the charset is not UTF-8 or if the content is short, do not bother
  85         // splitting the encoded-word.
  86         if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
  87                 io.WriteString(w, s)
  88                 w.Close()
  89                 return
  90         }
  91
  92         var currentLen, last, runeLen int
  93         for i := 0; i < len(s); i += runeLen {
  94                 // Multi-byte characters must not be split across encoded-words.
  95                 // See RFC 2047, section 5.3.
  96                 _, runeLen = utf8.DecodeRuneInString(s[i:])
  97
  98                 if currentLen+runeLen <= maxBase64Len {
  99                         currentLen += runeLen
 100                 } else {
 101                         io.WriteString(w, s[last:i])
 102                         w.Close()
 103                         e.splitWord(buf, charset)
 104                         last = i
 105                         currentLen = runeLen
 106                 }
 107         }
 108         io.WriteString(w, s[last:])
 109         w.Close()
 110 }
 111
 112 // qEncode encodes s using Q encoding and writes it to buf. It splits the
 113 // encoded-words when necessary.
 114 func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
 115         // We only split encoded-words when the charset is UTF-8.
 116         if !isUTF8(charset) {
 117                 writeQString(buf, s)
 118                 return
 119         }
 120
 121         var currentLen, runeLen int
 122         for i := 0; i < len(s); i += runeLen {
 123                 b := s[i]
 124                 // Multi-byte characters must not be split across encoded-words.
 125                 // See RFC 2047, section 5.3.
 126                 var encLen int
 127                 if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
 128                         runeLen, encLen = 1, 1
 129                 } else {
 130                         _, runeLen = utf8.DecodeRuneInString(s[i:])
 131                         encLen = 3 * runeLen
 132                 }
 133
 134                 if currentLen+encLen > maxContentLen {
 135                         e.splitWord(buf, charset)
 136                         currentLen = 0
 137                 }
 138                 writeQString(buf, s[i:i+runeLen])
 139                 currentLen += encLen
 140         }
 141 }
 142
 143 // writeQString encodes s using Q encoding and writes it to buf.
 144 func writeQString(buf *strings.Builder, s string) {
 145         for i := 0; i < len(s); i++ {
 146                 switch b := s[i]; {
 147                 case b == ' ':
 148                         buf.WriteByte('_')
 149                 case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
 150                         buf.WriteByte(b)
 151                 default:
 152                         buf.WriteByte('=')
 153                         buf.WriteByte(upperhex[b>>4])
 154                         buf.WriteByte(upperhex[b&0x0f])
 155                 }
 156         }
 157 }
 158
 159 // openWord writes the beginning of an encoded-word into buf.
 160 func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
 161         buf.WriteString("=?")
 162         buf.WriteString(charset)
 163         buf.WriteByte('?')
 164         buf.WriteByte(byte(e))
 165         buf.WriteByte('?')
 166 }
 167
 168 // closeWord writes the end of an encoded-word into buf.
 169 func closeWord(buf *strings.Builder) {
 170         buf.WriteString("?=")
 171 }
 172
 173 // splitWord closes the current encoded-word and opens a new one.
 174 func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
 175         closeWord(buf)
 176         buf.WriteByte(' ')
 177         e.openWord(buf, charset)
 178 }
 179
 180 func isUTF8(charset string) bool {
 181         return strings.EqualFold(charset, "UTF-8")
 182 }
 183
 184 const upperhex = "0123456789ABCDEF"
 185
 186 // A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
 187 type WordDecoder struct {
 188         // CharsetReader, if non-nil, defines a function to generate
 189         // charset-conversion readers, converting from the provided
 190         // charset into UTF-8.
 191         // Charsets are always lower-case. utf-8, iso-8859-1 and us-ascii charsets
 192         // are handled by default.
 193         // One of the CharsetReader's result values must be non-nil.
 194         CharsetReader func(charset string, input io.Reader) (io.Reader, error)
 195 }
 196
 197 // Decode decodes an RFC 2047 encoded-word.
 198 func (d *WordDecoder) Decode(word string) (string, error) {
 199         // See https://tools.ietf.org/html/rfc2047#section-2 for details.
 200         // Our decoder is permissive, we accept empty encoded-text.
 201         if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
 202                 return "", errInvalidWord
 203         }
 204         word = word[2 : len(word)-2]
 205
 206         // split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
 207         charset, text, _ := strings.Cut(word, "?")
 208         if charset == "" {
 209                 return "", errInvalidWord
 210         }
 211         encoding, text, _ := strings.Cut(text, "?")
 212         if len(encoding) != 1 {
 213                 return "", errInvalidWord
 214         }
 215
 216         content, err := decode(encoding[0], text)
 217         if err != nil {
 218                 return "", err
 219         }
 220
 221         var buf strings.Builder
 222         if err := d.convert(&buf, charset, content); err != nil {
 223                 return "", err
 224         }
 225         return buf.String(), nil
 226 }
 227
 228 // DecodeHeader decodes all encoded-words of the given string. It returns an
 229 // error if and only if CharsetReader of d returns an error.
 230 func (d *WordDecoder) DecodeHeader(header string) (string, error) {
 231         // If there is no encoded-word, returns before creating a buffer.
 232         i := strings.Index(header, "=?")
 233         if i == -1 {
 234                 return header, nil
 235         }
 236
 237         var buf strings.Builder
 238
 239         buf.WriteString(header[:i])
 240         header = header[i:]
 241
 242         betweenWords := false
 243         for {
 244                 start := strings.Index(header, "=?")
 245                 if start == -1 {
 246                         break
 247                 }
 248                 cur := start + len("=?")
 249
 250                 i := strings.Index(header[cur:], "?")
 251                 if i == -1 {
 252                         break
 253                 }
 254                 charset := header[cur : cur+i]
 255                 cur += i + len("?")
 256
 257                 if len(header) < cur+len("Q??=") {
 258                         break
 259                 }
 260                 encoding := header[cur]
 261                 cur++
 262
 263                 if header[cur] != '?' {
 264                         break
 265                 }
 266                 cur++
 267
 268                 j := strings.Index(header[cur:], "?=")
 269                 if j == -1 {
 270                         break
 271                 }
 272                 text := header[cur : cur+j]
 273                 end := cur + j + len("?=")
 274
 275                 content, err := decode(encoding, text)
 276                 if err != nil {
 277                         betweenWords = false
 278                         buf.WriteString(header[:start+2])
 279                         header = header[start+2:]
 280                         continue
 281                 }
 282
 283                 // Write characters before the encoded-word. White-space and newline
 284                 // characters separating two encoded-words must be deleted.
 285                 if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
 286                         buf.WriteString(header[:start])
 287                 }
 288
 289                 if err := d.convert(&buf, charset, content); err != nil {
 290                         return "", err
 291                 }
 292
 293                 header = header[end:]
 294                 betweenWords = true
 295         }
 296
 297         if len(header) > 0 {
 298                 buf.WriteString(header)
 299         }
 300
 301         return buf.String(), nil
 302 }
 303
 304 func decode(encoding byte, text string) ([]byte, error) {
 305         switch encoding {
 306         case 'B', 'b':
 307                 return base64.StdEncoding.DecodeString(text)
 308         case 'Q', 'q':
 309                 return qDecode(text)
 310         default:
 311                 return nil, errInvalidWord
 312         }
 313 }
 314
 315 func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
 316         switch {
 317         case strings.EqualFold("utf-8", charset):
 318                 buf.Write(content)
 319         case strings.EqualFold("iso-8859-1", charset):
 320                 for _, c := range content {
 321                         buf.WriteRune(rune(c))
 322                 }
 323         case strings.EqualFold("us-ascii", charset):
 324                 for _, c := range content {
 325                         if c >= utf8.RuneSelf {
 326                                 buf.WriteRune(unicode.ReplacementChar)
 327                         } else {
 328                                 buf.WriteByte(c)
 329                         }
 330                 }
 331         default:
 332                 if d.CharsetReader == nil {
 333                         return fmt.Errorf("mime: unhandled charset %q", charset)
 334                 }
 335                 r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
 336                 if err != nil {
 337                         return err
 338                 }
 339                 if _, err = io.Copy(buf, r); err != nil {
 340                         return err
 341                 }
 342         }
 343         return nil
 344 }
 345
 346 // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
 347 // one byte of non-whitespace.
 348 func hasNonWhitespace(s string) bool {
 349         for _, b := range s {
 350                 switch b {
 351                 // Encoded-words can only be separated by linear white spaces which does
 352                 // not include vertical tabs (\v).
 353                 case ' ', '\t', '\n', '\r':
 354                 default:
 355                         return true
 356                 }
 357         }
 358         return false
 359 }
 360
 361 // qDecode decodes a Q encoded string.
 362 func qDecode(s string) ([]byte, error) {
 363         dec := make([]byte, len(s))
 364         n := 0
 365         for i := 0; i < len(s); i++ {
 366                 switch c := s[i]; {
 367                 case c == '_':
 368                         dec[n] = ' '
 369                 case c == '=':
 370                         if i+2 >= len(s) {
 371                                 return nil, errInvalidWord
 372                         }
 373                         b, err := readHexByte(s[i+1], s[i+2])
 374                         if err != nil {
 375                                 return nil, err
 376                         }
 377                         dec[n] = b
 378                         i += 2
 379                 case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
 380                         dec[n] = c
 381                 default:
 382                         return nil, errInvalidWord
 383                 }
 384                 n++
 385         }
 386
 387         return dec[:n], nil
 388 }
 389
 390 // readHexByte returns the byte from its quoted-printable representation.
 391 func readHexByte(a, b byte) (byte, error) {
 392         var hb, lb byte
 393         var err error
 394         if hb, err = fromHex(a); err != nil {
 395                 return 0, err
 396         }
 397         if lb, err = fromHex(b); err != nil {
 398                 return 0, err
 399         }
 400         return hb<<4 | lb, nil
 401 }
 402
 403 func fromHex(b byte) (byte, error) {
 404         switch {
 405         case b >= '0' && b <= '9':
 406                 return b - '0', nil
 407         case b >= 'A' && b <= 'F':
 408                 return b - 'A' + 10, nil
 409         // Accept badly encoded bytes.
 410         case b >= 'a' && b <= 'f':
 411                 return b - 'a' + 10, nil
 412         }
 413         return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
 414 }