Skip to content

Commit 4c8249a

Browse files
committed
Merge branch 'pendo-io-escaping-cleaned-up'
2 parents baeb346 + 5abfa85 commit 4c8249a

File tree

6 files changed

+432
-34
lines changed

6 files changed

+432
-34
lines changed

bytes_safe.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ func parseFloat(b *[]byte) (float64, error) {
1717
}
1818

1919
// bytesToString returns the bytes referenced by b as a string.
// The conversion copies the data, so later writes to *b do not affect the
// returned string.
func bytesToString(b *[]byte) string {
	data := *b
	return string(data)
}

bytes_unsafe.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,5 @@ func parseFloat(b *[]byte) (float64, error) {
2727
// bytesToString reinterprets the slice header referenced by b as a string
// header and returns the resulting string without copying the bytes.
//
// This is a workaround until golang/go#2632 is resolved.
// See: https://github.com/golang/go/issues/2632
func bytesToString(b *[]byte) string {
	asString := (*string)(unsafe.Pointer(b))
	return *asString
}

escape.go

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
package jsonparser
2+
3+
import (
4+
"bytes"
5+
"unicode/utf8"
6+
)
7+
8+
// JSON Unicode handling: see https://tools.ietf.org/html/rfc7159#section-7
const (
	// supplementalPlanesOffset is added to the combined surrogate bits to
	// obtain the final code point.
	supplementalPlanesOffset = 0x10000
	// highSurrogateOffset is subtracted from a high surrogate to get its payload bits.
	highSurrogateOffset = 0xD800
	// lowSurrogateOffset is subtracted from a low surrogate to get its payload bits.
	lowSurrogateOffset = 0xDC00
)

// combineUTF16Surrogates merges a UTF-16 surrogate pair into a single rune.
// The high surrogate contributes the upper bits (shifted left by 10) and the
// low surrogate the lower bits. Neither argument is range-checked here.
func combineUTF16Surrogates(high, low rune) rune {
	upperBits := (high - highSurrogateOffset) << 10
	lowerBits := low - lowSurrogateOffset
	return supplementalPlanesOffset + upperBits + lowerBits
}
17+
18+
// badHex is the value h2I returns for a byte that is not a hexadecimal digit.
const badHex = -1

// h2I converts a single ASCII hexadecimal digit (0-9, A-F, a-f) to its
// numeric value in the range 0..15. Any other byte yields badHex.
func h2I(c byte) int {
	if '0' <= c && c <= '9' {
		return int(c - '0')
	}
	if 'A' <= c && c <= 'F' {
		return int(c-'A') + 10
	}
	if 'a' <= c && c <= 'f' {
		return int(c-'a') + 10
	}
	return badHex
}
31+
32+
// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and
33+
// is not checked.
34+
// In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together.
35+
// This function only handles one; decodeUnicodeEscape handles this more complex case.
36+
func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
37+
// We need at least 6 characters total
38+
if len(in) < 6 {
39+
return utf8.RuneError, false
40+
}
41+
42+
// Convert hex to decimal
43+
h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5])
44+
if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
45+
return utf8.RuneError, false
46+
}
47+
48+
// Compose the hex digits
49+
return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
50+
}
51+
52+
func decodeUnicodeEscape(in []byte) (rune, int) {
53+
if r, ok := decodeSingleUnicodeEscape(in); !ok {
54+
// Invalid Unicode escape
55+
return utf8.RuneError, -1
56+
} else if r < highSurrogateOffset {
57+
// Valid Unicode escape in Basic Multilingual Plane
58+
return r, 6
59+
} else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain
60+
// UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate"
61+
return utf8.RuneError, -1
62+
} else if r2 < lowSurrogateOffset {
63+
// Invalid UTF16 "low surrogate"
64+
return utf8.RuneError, -1
65+
} else {
66+
// Valid UTF16 surrogate pair
67+
return combineUTF16Surrogates(r, r2), 12
68+
}
69+
70+
}
71+
72+
// unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns
73+
// how many characters were consumed from 'in' and emitted into 'out'.
74+
// If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error.
75+
func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
76+
if len(in) < 2 || in[0] != '\\' {
77+
// Invalid escape due to insufficient characters for any escape or no initial backslash
78+
return -1, -1
79+
}
80+
81+
// https://tools.ietf.org/html/rfc7159#section-7
82+
switch e := in[1]; e {
83+
case '"', '\\', 'n', 't', 'r', '/', 'b', 'f':
84+
// Valid basic 2-character escapes
85+
out[0] = e
86+
return 2, 1
87+
case 'u':
88+
// Unicode escape
89+
if r, inLen := decodeUnicodeEscape(in); inLen == -1 {
90+
// Invalid Unicode escape
91+
return -1, -1
92+
} else {
93+
// Valid Unicode escape; re-encode as UTF8
94+
outLen := utf8.EncodeRune(out, r)
95+
return inLen, outLen
96+
}
97+
}
98+
99+
return -1, -1
100+
}
101+
102+
// unescape unescapes the string contained in 'in' and returns it as a slice.
103+
// If 'in' contains no escaped characters:
104+
// Returns 'in'.
105+
// Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)):
106+
// 'out' is used to build the unescaped string and is returned with no extra allocation
107+
// Else:
108+
// A new slice is allocated and returned.
109+
func Unescape(in, out []byte) ([]byte, error) {
110+
firstBackslash := bytes.IndexByte(in, '\\')
111+
if firstBackslash == -1 {
112+
return in, nil
113+
}
114+
115+
// Get a buffer of sufficient size (allocate if needed)
116+
if cap(out) < len(in) {
117+
out = make([]byte, len(in))
118+
} else {
119+
out = out[0:len(in)]
120+
}
121+
122+
// Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice)
123+
copy(out, in[:firstBackslash])
124+
in = in[firstBackslash:]
125+
buf := out[firstBackslash:]
126+
127+
for len(in) > 0 {
128+
// Unescape the next escaped character
129+
inLen, bufLen := unescapeToUTF8(in, buf)
130+
if inLen == -1 {
131+
return nil, MalformedStringEscapeError
132+
}
133+
134+
in = in[inLen:]
135+
buf = buf[bufLen:]
136+
137+
// Copy everything up until the next backslash
138+
nextBackslash := bytes.IndexByte(in, '\\')
139+
if nextBackslash == -1 {
140+
copy(buf, in)
141+
buf = buf[len(in):]
142+
break
143+
} else {
144+
copy(buf, in[:nextBackslash])
145+
buf = buf[nextBackslash:]
146+
in = in[nextBackslash:]
147+
}
148+
}
149+
150+
// Trim the out buffer to the amount that was actually emitted
151+
return out[:len(out)-len(buf)], nil
152+
}

escape_test.go

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
package jsonparser
2+
3+
import (
4+
"bytes"
5+
"testing"
6+
)
7+
8+
func TestH2I(t *testing.T) {
9+
hexChars := []byte{'0', '9', 'A', 'F', 'a', 'f', 'x', '\000'}
10+
hexValues := []int{0, 9, 10, 15, 10, 15, -1, -1}
11+
12+
for i, c := range hexChars {
13+
if v := h2I(c); v != hexValues[i] {
14+
t.Errorf("h2I('%c') returned wrong value (obtained %d, expected %d)", c, v, hexValues[i])
15+
}
16+
}
17+
}
18+
19+
// escapedUnicodeRuneTest describes one input for the Unicode escape decoders
// together with the expected outcome.
type escapedUnicodeRuneTest struct {
	in    string // escaped input
	isErr bool   // whether decoding is expected to fail
	out   rune   // expected rune (meaningful only when isErr is false)
	len   int    // expected number of input bytes consumed (meaningful only when isErr is false)
}

var (
	// commonUnicodeEscapeTests are cases shared by the single-escape and the
	// surrogate-aware decoder tests.
	commonUnicodeEscapeTests = []escapedUnicodeRuneTest{
		// Well-formed escapes
		{in: `\u0041`, out: 'A', len: 6},
		{in: `\u0000`, out: 0, len: 6},
		{in: `\u00b0`, out: '°', len: 6},
		{in: `\u00B0`, out: '°', len: 6},

		{in: `\x1234`, out: 0x1234, len: 6}, // These functions do not check the \u prefix

		// Truncated or malformed escapes
		{in: ``, isErr: true},
		{in: `\`, isErr: true},
		{in: `\u`, isErr: true},
		{in: `\u1`, isErr: true},
		{in: `\u11`, isErr: true},
		{in: `\u111`, isErr: true},
		{in: `\u123X`, isErr: true},
	}

	// singleUnicodeEscapeTests: the single-escape decoder accepts these values
	// as-is.
	singleUnicodeEscapeTests = append([]escapedUnicodeRuneTest{
		{in: `\uD83D`, out: 0xD83D, len: 6},
		{in: `\uDE03`, out: 0xDE03, len: 6},
		{in: `\uFFFF`, out: 0xFFFF, len: 6},
	}, commonUnicodeEscapeTests...)

	// multiUnicodeEscapeTests: the surrogate-aware decoder is expected to
	// reject the same values when they appear alone, and to combine valid pairs.
	multiUnicodeEscapeTests = append([]escapedUnicodeRuneTest{
		{in: `\uD83D`, isErr: true},
		{in: `\uDE03`, isErr: true},
		{in: `\uFFFF`, isErr: true},

		{in: `\uD83D\uDE03`, out: '\U0001F603', len: 12},
		{in: `\uD800\uDC00`, out: '\U00010000', len: 12},

		{in: `\uD800\`, isErr: true},
		{in: `\uD800\u`, isErr: true},
		{in: `\uD800\uD`, isErr: true},
		{in: `\uD800\uDC`, isErr: true},
		{in: `\uD800\uDC0`, isErr: true},
		{in: `\uD800\uDBFF`, isErr: true}, // invalid low surrogate
	}, commonUnicodeEscapeTests...)
)
64+
65+
func TestDecodeSingleUnicodeEscape(t *testing.T) {
66+
for _, test := range singleUnicodeEscapeTests {
67+
r, ok := decodeSingleUnicodeEscape([]byte(test.in))
68+
isErr := !ok
69+
70+
if isErr != test.isErr {
71+
t.Errorf("decodeSingleUnicodeEscape(%s) returned isErr mismatch: expected %t, obtained %t", test.in, test.isErr, isErr)
72+
} else if isErr {
73+
continue
74+
} else if r != test.out {
75+
t.Errorf("decodeSingleUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", test.in, test.out, test.out, r, r)
76+
}
77+
}
78+
}
79+
80+
func TestDecodeUnicodeEscape(t *testing.T) {
81+
for _, test := range multiUnicodeEscapeTests {
82+
r, len := decodeUnicodeEscape([]byte(test.in))
83+
isErr := (len == -1)
84+
85+
if isErr != test.isErr {
86+
t.Errorf("decodeUnicodeEscape(%s) returned isErr mismatch: expected %t, obtained %t", test.in, test.isErr, isErr)
87+
} else if isErr {
88+
continue
89+
} else if len != test.len {
90+
t.Errorf("decodeUnicodeEscape(%s) returned length mismatch: expected %d, obtained %d", test.in, test.len, len)
91+
} else if r != test.out {
92+
t.Errorf("decodeUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", test.in, test.out, test.out, r, r)
93+
}
94+
}
95+
}
96+
97+
// unescapeTest describes one Unescape input and its expected outcome.
type unescapeTest struct {
	in       string // escaped input
	out      string // expected unescaped output
	canAlloc bool   // whether Unescape may allocate (depending on buffer size); true iff 'in' contains escape sequence(s)
	isErr    bool   // whether Unescape is expected to return an error
}

// unescapeTests is the table driving TestUnescape.
var unescapeTests = []unescapeTest{
	// No escapes
	{in: ``, out: ``, canAlloc: false},
	{in: `a`, out: `a`, canAlloc: false},
	{in: `abcde`, out: `abcde`, canAlloc: false},

	// Valid escapes
	{in: `ab\\de`, out: `ab\de`, canAlloc: true},
	{in: `ab\"de`, out: `ab"de`, canAlloc: true},
	{in: `ab \u00B0 de`, out: `ab ° de`, canAlloc: true},
	{in: `ab \uD83D\uDE03 de`, out: "ab \U0001F603 de", canAlloc: true},
	{in: `\u0000\u0000\u0000\u0000\u0000`, out: "\u0000\u0000\u0000\u0000\u0000", canAlloc: true},
	{in: `\u0000 \u0000 \u0000 \u0000 \u0000`, out: "\u0000 \u0000 \u0000 \u0000 \u0000", canAlloc: true},
	{in: ` \u0000 \u0000 \u0000 \u0000 \u0000 `, out: " \u0000 \u0000 \u0000 \u0000 \u0000 ", canAlloc: true},

	// Invalid escapes
	{in: `\uD800`, isErr: true},
	{in: `\uFFFF`, isErr: true},
	{in: `abcde\`, isErr: true},
	{in: `abcde\x`, isErr: true},
	{in: `abcde\u`, isErr: true},
	{in: `abcde\u1`, isErr: true},
	{in: `abcde\u12`, isErr: true},
	{in: `abcde\u123`, isErr: true},
	{in: `abcde\uD800`, isErr: true},
	{in: `ab\uD800de`, isErr: true},
	{in: `\uD800abcde`, isErr: true},
}
129+
130+
// isSameMemory reports whether a and b begin at the same memory address,
// regardless of their lengths or capacities. Slices that share a backing
// array but start at different offsets are NOT considered the same.
//
// If either slice has zero capacity there is no element to take the address
// of; the slices are then considered the same only if both have zero capacity.
func isSameMemory(a, b []byte) bool {
	if cap(a) == 0 || cap(b) == 0 {
		return cap(a) == cap(b)
	}

	// Reslice to length 1 so the first element is addressable even when the
	// slice length is 0 (capacity is known to be at least 1 here), then compare
	// addresses directly. This does not write to either slice.
	return &a[:1][0] == &b[:1][0]
}
145+
146+
func TestUnescape(t *testing.T) {
147+
for _, test := range unescapeTests {
148+
type bufferTestCase struct {
149+
buf []byte
150+
isTooSmall bool
151+
}
152+
153+
var bufs []bufferTestCase
154+
155+
if len(test.in) == 0 {
156+
// If the input string is length 0, only a buffer of size 0 is a meaningful test
157+
bufs = []bufferTestCase{{nil, false}}
158+
} else {
159+
// For non-empty input strings, we can try several buffer sizes (0, len-1, len)
160+
bufs = []bufferTestCase{
161+
{nil, true},
162+
{make([]byte, 0, len(test.in)-1), true},
163+
{make([]byte, 0, len(test.in)), false},
164+
}
165+
}
166+
167+
for _, buftest := range bufs {
168+
in := []byte(test.in)
169+
buf := buftest.buf
170+
171+
out, err := Unescape(in, buf)
172+
isErr := (err != nil)
173+
isAlloc := !isSameMemory(out, in) && !isSameMemory(out, buf)
174+
175+
if isErr != test.isErr {
176+
t.Errorf("Unescape(`%s`, bufsize=%d) returned isErr mismatch: expected %t, obtained %t", test.in, cap(buf), test.isErr, isErr)
177+
break
178+
} else if isErr {
179+
continue
180+
} else if !bytes.Equal(out, []byte(test.out)) {
181+
t.Errorf("Unescape(`%s`, bufsize=%d) returned unescaped mismatch: expected `%s` (%v, len %d), obtained `%s` (%v, len %d)", test.in, cap(buf), test.out, []byte(test.out), len(test.out), string(out), out, len(out))
182+
break
183+
} else if isAlloc != (test.canAlloc && buftest.isTooSmall) {
184+
t.Errorf("Unescape(`%s`, bufsize=%d) returned isAlloc mismatch: expected %t, obtained %t", test.in, cap(buf), buftest.isTooSmall, isAlloc)
185+
break
186+
}
187+
}
188+
}
189+
}

0 commit comments

Comments
 (0)