|
| 1 | +package jsonparser |
| 2 | + |
| 3 | +import ( |
| 4 | + "bytes" |
| 5 | + "unicode/utf8" |
| 6 | +) |
| 7 | + |
| 8 | +// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7 |
| 9 | + |
// supplementaryPlanesOffset is the first code point beyond the Basic Multilingual Plane; every code point encoded
// as a UTF16 surrogate pair is at or above this value.
const supplementaryPlanesOffset = 0x10000

// highSurrogateOffset is the first UTF16 "high surrogate" code unit (the range is 0xD800-0xDBFF).
const highSurrogateOffset = 0xD800

// lowSurrogateOffset is the first UTF16 "low surrogate" code unit (the range is 0xDC00-0xDFFF).
const lowSurrogateOffset = 0xDC00

// combineUTF16Surrogates merges a UTF16 surrogate pair into the single code point it encodes: the high surrogate
// supplies the upper 10 bits and the low surrogate the lower 10 bits of the distance above 0x10000.
// No validation is performed here; the caller must pass a genuine high/low surrogate pair.
func combineUTF16Surrogates(high, low rune) rune {
	return supplementaryPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
}
| 16 | + |
// badHex is the sentinel h2I returns for a byte that is not a hexadecimal digit.
const badHex = -1

// h2I maps one ASCII hexadecimal digit (upper or lower case) to its numeric value in the range 0-15.
// Any other byte yields badHex.
func h2I(c byte) int {
	if '0' <= c && c <= '9' {
		return int(c - '0')
	}
	if 'a' <= c && c <= 'f' {
		return int(c-'a') + 10
	}
	if 'A' <= c && c <= 'F' {
		return int(c-'A') + 10
	}
	return badHex
}
| 30 | + |
| 31 | +// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. In JSON, these can either come alone or as part |
| 32 | +// of "UTF16 surrogate pairs" that must be handled together; this function only handles one at a time |
| 33 | +func decodeSingleUnicodeEscape(in []byte) (rune, bool) { |
| 34 | + // We need at least 6 characters total |
| 35 | + if len(in) < 6 { |
| 36 | + return utf8.RuneError, false |
| 37 | + } |
| 38 | + |
| 39 | + // Convert hex to decimal |
| 40 | + h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5]) |
| 41 | + if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex { |
| 42 | + return utf8.RuneError, false |
| 43 | + } |
| 44 | + |
| 45 | + // Compose the hex digits |
| 46 | + return rune(h1<<12 + h2<<8 + h3<<4 + h4), true |
| 47 | +} |
| 48 | + |
| 49 | +func decodeUnicodeEscape(in []byte) (rune, int) { |
| 50 | + if r, ok := decodeSingleUnicodeEscape(in); !ok { |
| 51 | + // Invalid Unicode escape |
| 52 | + return utf8.RuneError, -1 |
| 53 | + } else if r < highSurrogateOffset { |
| 54 | + // Valid Unicode escape in Basic Multilingual Plane |
| 55 | + return r, 6 |
| 56 | + } else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain |
| 57 | + // UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate" |
| 58 | + return utf8.RuneError, -1 |
| 59 | + } else { |
| 60 | + // Valid UTF16 surrogate pair |
| 61 | + return combineUTF16Surrogates(r, r2), 12 |
| 62 | + } |
| 63 | + |
| 64 | +} |
| 65 | + |
| 66 | +// unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns |
| 67 | +// how many characters were consumed from 'in' and emitted into 'out'. |
| 68 | +// If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error. |
| 69 | +func unescapeToUTF8(in, out []byte) (inLen int, outLen int) { |
| 70 | + if len(in) < 2 || in[0] != '\\' { |
| 71 | + // Invalid escape due to insufficient characters for any escape or no initial backslash |
| 72 | + return -1, -1 |
| 73 | + } |
| 74 | + |
| 75 | + // https://tools.ietf.org/html/rfc7159#section-7 |
| 76 | + switch e := in[1]; e { |
| 77 | + case '"', '\\', 'n', 't', 'r', '/', 'b', 'f': |
| 78 | + // Valid basic 2-character escapes |
| 79 | + out[0] = e |
| 80 | + return 2, 1 |
| 81 | + case 'u': |
| 82 | + // Unicode escape |
| 83 | + if r, inLen := decodeUnicodeEscape(in); inLen == -1 { |
| 84 | + // Invalid Unicode escape |
| 85 | + return -1, -1 |
| 86 | + } else { |
| 87 | + // Valid Unicode escape; re-encode as UTF8 |
| 88 | + outLen := utf8.EncodeRune(out, r) |
| 89 | + return inLen, outLen |
| 90 | + } |
| 91 | + } |
| 92 | + |
| 93 | + return -1, -1 |
| 94 | +} |
| 95 | + |
| 96 | +// unescape unescapes the string contained in 'in' and returns it as a slice. |
| 97 | +// If 'in' contains no escaped characters: |
| 98 | +// Returns 'in'. |
| 99 | +// Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)): |
| 100 | +// 'out' is used to build the unescaped string and is returned with no extra allocation |
| 101 | +// Else: |
| 102 | +// A new slice is allocated and returned. |
| 103 | +func unescape(in, out []byte) ([]byte, error) { |
| 104 | + firstBackslash := bytes.IndexByte(in, '\\') |
| 105 | + if firstBackslash == -1 { |
| 106 | + return in, nil |
| 107 | + } |
| 108 | + |
| 109 | + // Get a buffer of sufficient size (allocate if needed) |
| 110 | + if cap(out) < len(in) { |
| 111 | + out = make([]byte, len(in)) |
| 112 | + } else { |
| 113 | + out = out[0:len(in)] |
| 114 | + } |
| 115 | + |
| 116 | + // Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice) |
| 117 | + copy(out, in[:firstBackslash]) |
| 118 | + in = in[firstBackslash:] |
| 119 | + buf := out[firstBackslash:] |
| 120 | + |
| 121 | + for len(in) > 0 { |
| 122 | + // Unescape the next escaped character |
| 123 | + inLen, bufLen := unescapeToUTF8(in, buf) |
| 124 | + if inLen == -1 { |
| 125 | + return nil, MalformedStringEscapeError |
| 126 | + } |
| 127 | + |
| 128 | + in = in[inLen:] |
| 129 | + buf = buf[bufLen:] |
| 130 | + |
| 131 | + // Copy everything up until the next backslash |
| 132 | + nextBackslash := bytes.IndexByte(in, '\\') |
| 133 | + if nextBackslash == -1 { |
| 134 | + copy(buf, in) |
| 135 | + buf = buf[len(in):] |
| 136 | + break |
| 137 | + } else { |
| 138 | + copy(buf, in[:nextBackslash]) |
| 139 | + buf = buf[nextBackslash:] |
| 140 | + in = in[nextBackslash:] |
| 141 | + } |
| 142 | + } |
| 143 | + |
| 144 | + // Trim the out buffer to the amount that was actually emitted |
| 145 | + return out[:len(out)-len(buf)], nil |
| 146 | +} |
0 commit comments