Skip to content

Commit 244adfe

Browse files
committed
Fixed bugs and completed test cases
1 parent b88eaff commit 244adfe

File tree

2 files changed

+202
-30
lines changed

2 files changed

+202
-30
lines changed

parserescapes.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@ package jsonparser
22

33
import (
44
"bytes"
5+
"fmt"
56
"unicode/utf8"
67
)
78

89
// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7
910

10-
const highSurrogateOffset = 0xDB00
11+
// Constants used to turn a UTF-16 surrogate pair into a single code point.
//
// supplementalPlanesOffset is the base added to every decoded pair (0x10000),
// highSurrogateOffset is subtracted from the first (high) unit and
// lowSurrogateOffset is subtracted from the second (low) unit.
const (
	supplementalPlanesOffset = 0x10000
	highSurrogateOffset      = 0xD800
	lowSurrogateOffset       = 0xDC00
)

// combineUTF16Surrogates merges a high and a low UTF-16 surrogate into the
// code point they encode: the high unit contributes the upper 10 bits, the
// low unit the lower 10 bits, and the result is rebased by
// supplementalPlanesOffset.
//
// The inputs are not validated; callers must pass a high unit that is at
// least highSurrogateOffset and a low unit that is at least
// lowSurrogateOffset for the result to be meaningful.
func combineUTF16Surrogates(high, low rune) rune {
	// Note: << binds tighter than + in Go, so only the high part is shifted.
	return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
}
1618

1719
const badHex = -1
@@ -28,8 +30,12 @@ func h2I(c byte) int {
2830
return badHex
2931
}
3032

31-
// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. In JSON, these can either come alone or as part
32-
// of "UTF16 surrogate pairs" that must be handled together; this function only handles one at a time
33+
// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and
34+
// is not checked.
35+
// In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together.
36+
// This function decodes only one escape at a time; decodeUnicodeEscape handles the surrogate-pair case.
37+
var _ = fmt.Println
38+
3339
func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
3440
// We need at least 6 characters total
3541
if len(in) < 6 {
@@ -125,6 +131,8 @@ func unescape(in, out []byte) ([]byte, error) {
125131
return nil, MalformedStringEscapeError
126132
}
127133

134+
//fmt.Printf("Decoded rune from UTF: inLen: %d, outLen: %d, rune UTF8: %x\n", inLen, bufLen, buf[:bufLen])
135+
128136
in = in[inLen:]
129137
buf = buf[bufLen:]
130138

parserescapes_test.go

Lines changed: 190 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package jsonparser
22

33
import (
4+
"bytes"
5+
"fmt"
46
"testing"
57
)
68

@@ -15,37 +17,199 @@ func TestH2I(t *testing.T) {
1517
}
1618
}
1719

20+
// escapedUnicodeRuneTest is one table-driven case for the \uXXXX decoders.
type escapedUnicodeRuneTest struct {
	in    string // raw escape text passed to the decoder
	isErr bool   // true when the decoder is expected to report failure
	out   rune   // expected rune when decoding succeeds
	len   int    // expected length returned by decodeUnicodeEscape on success
}
26+
27+
var commonUnicodeEscapeTests = []escapedUnicodeRuneTest{
28+
{in: `\u0041`, out: 'A', len: 6},
29+
{in: `\u0000`, out: 0, len: 6},
30+
{in: `\u00b0`, out: '°', len: 6},
31+
{in: `\u00B0`, out: '°', len: 6},
32+
33+
{in: `\x1234`, out: 0x1234, len: 6}, // These functions do not check the \u prefix
34+
35+
{in: ``, isErr: true},
36+
{in: `\`, isErr: true},
37+
{in: `\u`, isErr: true},
38+
{in: `\u1`, isErr: true},
39+
{in: `\u11`, isErr: true},
40+
{in: `\u111`, isErr: true},
41+
{in: `\u123X`, isErr: true},
42+
}
43+
44+
var singleUnicodeEscapeTests = append([]escapedUnicodeRuneTest{
45+
{in: `\uD83D`, out: 0xD83D, len: 6},
46+
{in: `\uDE03`, out: 0xDE03, len: 6},
47+
{in: `\uFFFF`, out: 0xFFFF, len: 6},
48+
}, commonUnicodeEscapeTests...)
49+
50+
var multiUnicodeEscapeTests = append([]escapedUnicodeRuneTest{
51+
{in: `\uD83D`, isErr: true},
52+
{in: `\uDE03`, isErr: true},
53+
{in: `\uFFFF`, isErr: true},
54+
55+
{in: `\uD83D\uDE03`, out: '\U0001F603', len: 12},
56+
{in: `\uD800\uDC00`, out: '\U00010000', len: 12},
57+
58+
{in: `\uD800\`, isErr: true},
59+
{in: `\uD800\u`, isErr: true},
60+
{in: `\uD800\uD`, isErr: true},
61+
{in: `\uD800\uDC`, isErr: true},
62+
{in: `\uD800\uDC0`, isErr: true},
63+
}, commonUnicodeEscapeTests...)
64+
1865
func TestDecodeSingleUnicodeEscape(t *testing.T) {
19-
escapeSequences := []string{
20-
`\"`,
21-
`\\`,
22-
`\n`,
23-
`\t`,
24-
`\r`,
25-
`\/`,
26-
`\b`,
27-
`\f`,
66+
for _, test := range singleUnicodeEscapeTests {
67+
r, ok := decodeSingleUnicodeEscape([]byte(test.in))
68+
isErr := !ok
69+
70+
if isErr != test.isErr {
71+
t.Errorf("decodeSingleUnicodeEscape(%s) returned isErr mismatch: expected %t, obtained %t", test.in, test.isErr, isErr)
72+
} else if isErr {
73+
continue
74+
} else if r != test.out {
75+
t.Errorf("decodeSingleUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", test.in, test.out, test.out, r, r)
76+
}
2877
}
78+
}
79+
80+
func TestDecodeUnicodeEscape(t *testing.T) {
81+
for _, test := range multiUnicodeEscapeTests {
82+
r, len := decodeUnicodeEscape([]byte(test.in))
83+
isErr := (len == -1)
2984

30-
runeValues := []struct {
31-
r rune
32-
ok bool
33-
}{
34-
{'"', true},
35-
{'\\', true},
36-
{'\n', true},
37-
{'\t', true},
38-
{'/', true},
39-
{'\b', true},
40-
{'\f', true},
85+
if isErr != test.isErr {
86+
t.Errorf("decodeUnicodeEscape(%s) returned isErr mismatch: expected %t, obtained %t", test.in, test.isErr, isErr)
87+
} else if isErr {
88+
continue
89+
} else if len != test.len {
90+
t.Errorf("decodeUnicodeEscape(%s) returned length mismatch: expected %d, obtained %d", test.in, test.len, len)
91+
} else if r != test.out {
92+
t.Errorf("decodeUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", test.in, test.out, test.out, r, r)
93+
}
4194
}
95+
}
96+
97+
// unescapeTest is one table-driven case for unescape.
type unescapeTest struct {
	in       string // escaped input text
	out      string // expected unescaped result on success
	canAlloc bool   // true when an allocation is expected if the supplied buffer is too small
	isErr    bool   // true when unescape is expected to return an error
}
42103

43-
for i, esc := range escapeSequences {
44-
expected := runeValues[i]
45-
if r, ok := decodeSingleUnicodeEscape([]byte(esc)); ok != expected.ok {
46-
t.Errorf("decodeSingleUnicodeEscape(%s) returned 'ok' mismatch: expected %t, obtained %t", esc, expected.ok, ok)
47-
} else if r != expected.r {
48-
t.Errorf("decodeSingleUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", esc, expected.r, expected.r, r, r)
104+
var unescapeTests = []unescapeTest{
105+
{in: ``, out: ``, canAlloc: false},
106+
{in: `a`, out: `a`, canAlloc: false},
107+
{in: `abcde`, out: `abcde`, canAlloc: false},
108+
109+
{in: `ab\\de`, out: `ab\de`, canAlloc: true},
110+
{in: `ab\"de`, out: `ab"de`, canAlloc: true},
111+
{in: `ab \u00B0 de`, out: `ab ° de`, canAlloc: true},
112+
{in: `ab \uD83D\uDE03 de`, out: "ab \U0001F603 de", canAlloc: true},
113+
{in: `\u0000\u0000\u0000\u0000\u0000`, out: "\u0000\u0000\u0000\u0000\u0000", canAlloc: true},
114+
{in: `\u0000 \u0000 \u0000 \u0000 \u0000`, out: "\u0000 \u0000 \u0000 \u0000 \u0000", canAlloc: true},
115+
{in: ` \u0000 \u0000 \u0000 \u0000 \u0000 `, out: " \u0000 \u0000 \u0000 \u0000 \u0000 ", canAlloc: true},
116+
117+
{in: `\uD800`, isErr: true},
118+
{in: `\uFFFF`, isErr: true},
119+
{in: `abcde\`, isErr: true},
120+
{in: `abcde\x`, isErr: true},
121+
{in: `abcde\u`, isErr: true},
122+
{in: `abcde\u1`, isErr: true},
123+
{in: `abcde\u12`, isErr: true},
124+
{in: `abcde\u123`, isErr: true},
125+
{in: `abcde\uD800`, isErr: true},
126+
{in: `ab\uD800de`, isErr: true},
127+
{in: `\uD800abcde`, isErr: true},
128+
}
129+
130+
// isSameMemory reports whether a and b start at the same memory address,
// i.e. whether their first backing element is the same byte. Lengths and
// capacities may differ. Slices that share a backing array but start at
// different offsets are NOT considered the same.
//
// If either slice has zero capacity there is no element to compare; in that
// case the result is true only when both have zero capacity.
//
// The slices are only inspected, never written to.
func isSameMemory(a, b []byte) bool {
	if cap(a) == 0 || cap(b) == 0 {
		return cap(a) == cap(b)
	}

	// Reslicing to length 1 makes the first backing element addressable even
	// when len is 0 (cap > 0 is guaranteed above); comparing the addresses
	// detects aliasing without touching the data.
	return &a[:1][0] == &b[:1][0]
}
145+
146+
func TestUnescape(t *testing.T) {
147+
148+
for _, test := range unescapeTests {
149+
type bufferTestCase struct {
150+
buf []byte
151+
isTooSmall bool
152+
}
153+
154+
var bufs []bufferTestCase
155+
156+
if len(test.in) == 0 {
157+
// If the input string is length 0, only a buffer of size 0 is a meaningful test
158+
bufs = []bufferTestCase{{nil, false}}
159+
} else {
160+
// For non-empty input strings, we can try several buffer sizes (0, len-1, len)
161+
bufs = []bufferTestCase{
162+
{nil, true},
163+
{make([]byte, 0, len(test.in)-1), true},
164+
{make([]byte, 0, len(test.in)), false},
165+
}
166+
}
167+
168+
for _, buftest := range bufs {
169+
in := []byte(test.in)
170+
buf := buftest.buf
171+
172+
out, err := unescape(in, buf)
173+
isErr := (err != nil)
174+
isAlloc := !isSameMemory(out, in) && !isSameMemory(out, buf)
175+
176+
if isErr != test.isErr {
177+
t.Errorf("unescape(`%s`, bufsize=%d) returned isErr mismatch: expected %t, obtained %t", test.in, cap(buf), test.isErr, isErr)
178+
break
179+
} else if isErr {
180+
continue
181+
} else if !bytes.Equal(out, []byte(test.out)) {
182+
t.Errorf("unescape(`%s`, bufsize=%d) returned unescaped mismatch: expected `%s` (%v, len %d), obtained `%s` (%v, len %d)", test.in, cap(buf), test.out, []byte(test.out), len(test.out), string(out), out, len(out))
183+
break
184+
} else if isAlloc != (test.canAlloc && buftest.isTooSmall) {
185+
t.Errorf("unescape(`%s`, bufsize=%d) returned isAlloc mismatch: expected %t, obtained %t", test.in, cap(buf), buftest.isTooSmall, isAlloc)
186+
break
187+
}
49188
}
50189
}
51190
}
191+
192+
//
193+
//escapeSequences := []string{
194+
//`\"`,
195+
//`\\`,
196+
//`\n`,
197+
//`\t`,
198+
//`\r`,
199+
//`\/`,
200+
//`\b`,
201+
//`\f`,
202+
//}
203+
//
204+
//runeValues := []struct {
205+
//r rune
206+
//ok bool
207+
//}{
208+
//{'"', true},
209+
//{'\\', true},
210+
//{'\n', true},
211+
//{'\t', true},
212+
//{'/', true},
213+
//{'\b', true},
214+
//{'\f', true},
215+
//}

0 commit comments

Comments
 (0)