Skip to content

Commit b88eaff

Browse files
committed
Initial work on handling escaped strings in JSON
1 parent e6e4e5f commit b88eaff

File tree

4 files changed

+223
-14
lines changed

4 files changed

+223
-14
lines changed

parser.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,20 @@ import (
1111

1212
// Errors
1313
var (
14-
KeyPathNotFoundError = errors.New("Key path not found")
15-
UnknownValueTypeError = errors.New("Unknown value type")
16-
MalformedJsonError = errors.New("Malformed JSON error")
17-
MalformedStringError = errors.New("Value is string, but can't find closing '\"' symbol")
18-
MalformedArrayError = errors.New("Value is array, but can't find closing ']' symbol")
19-
MalformedObjectError = errors.New("Value looks like object, but can't find closing '}' symbol")
20-
MalformedValueError = errors.New("Value looks like Number/Boolean/None, but can't find its end: ',' or '}' symbol")
14+
KeyPathNotFoundError = errors.New("Key path not found")
15+
UnknownValueTypeError = errors.New("Unknown value type")
16+
MalformedJsonError = errors.New("Malformed JSON error")
17+
MalformedStringError = errors.New("Value is string, but can't find closing '\"' symbol")
18+
MalformedArrayError = errors.New("Value is array, but can't find closing ']' symbol")
19+
MalformedObjectError = errors.New("Value looks like object, but can't find closing '}' symbol")
20+
MalformedValueError = errors.New("Value looks like Number/Boolean/None, but can't find its end: ',' or '}' symbol")
21+
MalformedStringEscapeError = errors.New("Encountered an invalid escape sequence in a string")
2122
)
2223

24+
// How much stack space to allocate for unescaping JSON strings; if a string longer
25+
// than this needs to be unescaped, it will result in a heap allocation
26+
const unescapeStackBufSize = 64
27+
2328
func tokenEnd(data []byte) int {
2429
for i, c := range data {
2530
switch c {
@@ -31,7 +36,6 @@ func tokenEnd(data []byte) int {
3136
return -1
3237
}
3338

34-
3539
// Find position of next character which is not ' ', ',', '}' or ']'
3640
func nextToken(data []byte, skipComma bool) int {
3741
for i, c := range data {
@@ -133,10 +137,10 @@ func searchKeys(data []byte, keys ...string) int {
133137
i += valueOffset
134138

135139
// if string is a Key, and key level match
136-
if data[i] == ':'{
140+
if data[i] == ':' {
137141
key := unsafeBytesToString(data[keyBegin:keyEnd])
138142

139-
if keyLevel == level-1 && // If key nesting level match current object nested level
143+
if keyLevel == level-1 && // If key nesting level match current object nested level
140144
keys[level-1] == key {
141145
keyLevel++
142146
// If we found all keys in path
@@ -392,9 +396,10 @@ func GetString(data []byte, keys ...string) (val string, err error) {
392396
return string(v), nil
393397
}
394398

395-
s, err := strconv.Unquote(`"` + unsafeBytesToString(v) + `"`)
399+
var stackbuf [unescapeStackBufSize]byte // stack-allocated array for allocation-free unescaping of small strings
400+
out, err := unescape(v, stackbuf[:])
396401

397-
return s, err
402+
return string(out), err
398403
}
399404

400405
// GetFloat returns the value retrieved by `Get`, cast to a float64 if possible.

parser_test.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,19 +326,26 @@ var getFloatTests = []Test{
326326

327327
var getStringTests = []Test{
328328
Test{
329-
desc: `Translate unicode symbols`,
329+
desc: `Translate Unicode symbols`,
330330
json: `{"c": "test"}`,
331331
path: []string{"c"},
332332
isFound: true,
333333
data: `test`,
334334
},
335335
Test{
336-
desc: `Translate unicode symbols`,
336+
desc: `Translate Unicode symbols`,
337337
json: `{"c": "15\u00b0C"}`,
338338
path: []string{"c"},
339339
isFound: true,
340340
data: `15°C`,
341341
},
342+
Test{
343+
desc: `Translate supplementary Unicode symbols`,
344+
json: `{"c": "\uD83D\uDE03"}`, // Smiley face (UTF16 surrogate pair)
345+
path: []string{"c"},
346+
isFound: true,
347+
data: "\U0001F603", // Smiley face
348+
},
342349
Test{
343350
desc: `Translate escape symbols`,
344351
json: `{"c": "\\\""}`,

parserescapes.go

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
package jsonparser
2+
3+
import (
4+
"bytes"
5+
"unicode/utf8"
6+
)
7+
8+
// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7
9+
10+
// supplementaryPlanesOffset is the first code point outside the Basic Multilingual Plane;
// every code point encoded by a UTF-16 surrogate pair is at or above this value.
const supplementaryPlanesOffset = 0x10000

// highSurrogateOffset is the first UTF-16 high (leading) surrogate code unit.
const highSurrogateOffset = 0xD800

// lowSurrogateOffset is the first UTF-16 low (trailing) surrogate code unit.
const lowSurrogateOffset = 0xDC00

// combineUTF16Surrogates combines a UTF-16 surrogate pair into the supplementary-plane
// code point it encodes. 'high' must lie in [0xD800, 0xDBFF] and 'low' in [0xDC00, 0xDFFF];
// the function does not validate its arguments.
func combineUTF16Surrogates(high, low rune) rune {
	// Each surrogate carries 10 payload bits; the pair encodes (code point - 0x10000).
	return supplementaryPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
}
16+
17+
// badHex is the sentinel returned by h2I for a byte that is not a hexadecimal digit.
const badHex = -1

// h2I converts a single ASCII hexadecimal digit (either case) to its numeric
// value in the range 0..15. Any other byte yields badHex.
func h2I(c byte) int {
	if '0' <= c && c <= '9' {
		return int(c - '0')
	}
	if 'a' <= c && c <= 'f' {
		return int(c-'a') + 10
	}
	if 'A' <= c && c <= 'F' {
		return int(c-'A') + 10
	}
	return badHex
}
30+
31+
// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. In JSON, these can either come alone or as part
32+
// of "UTF16 surrogate pairs" that must be handled together; this function only handles one at a time
33+
func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
34+
// We need at least 6 characters total
35+
if len(in) < 6 {
36+
return utf8.RuneError, false
37+
}
38+
39+
// Convert hex to decimal
40+
h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5])
41+
if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
42+
return utf8.RuneError, false
43+
}
44+
45+
// Compose the hex digits
46+
return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
47+
}
48+
49+
func decodeUnicodeEscape(in []byte) (rune, int) {
50+
if r, ok := decodeSingleUnicodeEscape(in); !ok {
51+
// Invalid Unicode escape
52+
return utf8.RuneError, -1
53+
} else if r < highSurrogateOffset {
54+
// Valid Unicode escape in Basic Multilingual Plane
55+
return r, 6
56+
} else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain
57+
// UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate"
58+
return utf8.RuneError, -1
59+
} else {
60+
// Valid UTF16 surrogate pair
61+
return combineUTF16Surrogates(r, r2), 12
62+
}
63+
64+
}
65+
66+
// unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns
67+
// how many characters were consumed from 'in' and emitted into 'out'.
68+
// If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error.
69+
func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
70+
if len(in) < 2 || in[0] != '\\' {
71+
// Invalid escape due to insufficient characters for any escape or no initial backslash
72+
return -1, -1
73+
}
74+
75+
// https://tools.ietf.org/html/rfc7159#section-7
76+
switch e := in[1]; e {
77+
case '"', '\\', 'n', 't', 'r', '/', 'b', 'f':
78+
// Valid basic 2-character escapes
79+
out[0] = e
80+
return 2, 1
81+
case 'u':
82+
// Unicode escape
83+
if r, inLen := decodeUnicodeEscape(in); inLen == -1 {
84+
// Invalid Unicode escape
85+
return -1, -1
86+
} else {
87+
// Valid Unicode escape; re-encode as UTF8
88+
outLen := utf8.EncodeRune(out, r)
89+
return inLen, outLen
90+
}
91+
}
92+
93+
return -1, -1
94+
}
95+
96+
// unescape unescapes the string contained in 'in' and returns it as a slice.
97+
// If 'in' contains no escaped characters:
98+
// Returns 'in'.
99+
// Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)):
100+
// 'out' is used to build the unescaped string and is returned with no extra allocation
101+
// Else:
102+
// A new slice is allocated and returned.
103+
func unescape(in, out []byte) ([]byte, error) {
104+
firstBackslash := bytes.IndexByte(in, '\\')
105+
if firstBackslash == -1 {
106+
return in, nil
107+
}
108+
109+
// Get a buffer of sufficient size (allocate if needed)
110+
if cap(out) < len(in) {
111+
out = make([]byte, len(in))
112+
} else {
113+
out = out[0:len(in)]
114+
}
115+
116+
// Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice)
117+
copy(out, in[:firstBackslash])
118+
in = in[firstBackslash:]
119+
buf := out[firstBackslash:]
120+
121+
for len(in) > 0 {
122+
// Unescape the next escaped character
123+
inLen, bufLen := unescapeToUTF8(in, buf)
124+
if inLen == -1 {
125+
return nil, MalformedStringEscapeError
126+
}
127+
128+
in = in[inLen:]
129+
buf = buf[bufLen:]
130+
131+
// Copy everything up until the next backslash
132+
nextBackslash := bytes.IndexByte(in, '\\')
133+
if nextBackslash == -1 {
134+
copy(buf, in)
135+
buf = buf[len(in):]
136+
break
137+
} else {
138+
copy(buf, in[:nextBackslash])
139+
buf = buf[nextBackslash:]
140+
in = in[nextBackslash:]
141+
}
142+
}
143+
144+
// Trim the out buffer to the amount that was actually emitted
145+
return out[:len(out)-len(buf)], nil
146+
}

parserescapes_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package jsonparser
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestH2I(t *testing.T) {
8+
hexChars := []byte{'0', '9', 'A', 'F', 'a', 'f', 'x', '\000'}
9+
hexValues := []int{0, 9, 10, 15, 10, 15, -1, -1}
10+
11+
for i, c := range hexChars {
12+
if v := h2I(c); v != hexValues[i] {
13+
t.Errorf("h2I('%c') returned wrong value (obtained %d, expected %d)", c, v, hexValues[i])
14+
}
15+
}
16+
}
17+
18+
func TestDecodeSingleUnicodeEscape(t *testing.T) {
19+
escapeSequences := []string{
20+
`\"`,
21+
`\\`,
22+
`\n`,
23+
`\t`,
24+
`\r`,
25+
`\/`,
26+
`\b`,
27+
`\f`,
28+
}
29+
30+
runeValues := []struct {
31+
r rune
32+
ok bool
33+
}{
34+
{'"', true},
35+
{'\\', true},
36+
{'\n', true},
37+
{'\t', true},
38+
{'/', true},
39+
{'\b', true},
40+
{'\f', true},
41+
}
42+
43+
for i, esc := range escapeSequences {
44+
expected := runeValues[i]
45+
if r, ok := decodeSingleUnicodeEscape([]byte(esc)); ok != expected.ok {
46+
t.Errorf("decodeSingleUnicodeEscape(%s) returned 'ok' mismatch: expected %t, obtained %t", esc, expected.ok, ok)
47+
} else if r != expected.r {
48+
t.Errorf("decodeSingleUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", esc, expected.r, expected.r, r, r)
49+
}
50+
}
51+
}

0 commit comments

Comments
 (0)