Skip to content

Added ParsePrimitiveValue convenience function for users #38

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
152 changes: 152 additions & 0 deletions escape.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
package jsonparser

import (
"bytes"
"unicode/utf8"
)

// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7

// Constants used to convert UTF-16 surrogate pairs into code points.
const (
	// supplementalPlanesOffset is the first code point above the Basic Multilingual Plane.
	supplementalPlanesOffset = 0x10000
	// highSurrogateOffset is the smallest UTF-16 high (leading) surrogate value.
	highSurrogateOffset = 0xD800
	// lowSurrogateOffset is the smallest UTF-16 low (trailing) surrogate value.
	lowSurrogateOffset = 0xDC00
)

func combineUTF16Surrogates(high, low rune) rune {
return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
}

// badHex is the sentinel h2I returns for a byte that is not a hexadecimal digit.
const badHex = -1

// h2I converts a single ASCII hexadecimal digit (upper or lower case) to its
// numeric value 0-15. Any other byte yields badHex.
func h2I(c byte) int {
	if '0' <= c && c <= '9' {
		return int(c - '0')
	}
	if 'A' <= c && c <= 'F' {
		return int(c-'A') + 10
	}
	if 'a' <= c && c <= 'f' {
		return int(c-'a') + 10
	}
	return badHex
}

// decodeSingleUnicodeEscape decodes one six-byte \uXXXX escape at the start of 'in'.
// Only the four hex digits at in[2:6] are examined; the two prefix bytes are assumed
// to be `\u` and are NOT verified.
// It returns the 16-bit value and true on success, or (utf8.RuneError, false) when
// 'in' is shorter than six bytes or any of the four digits is not hexadecimal.
// In JSON an escape may be half of a UTF-16 surrogate pair; this function decodes
// just the one escape and leaves pairing to decodeUnicodeEscape.
func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
	// Two prefix bytes plus four hex digits.
	const escapeLen = 6
	if len(in) < escapeLen {
		return utf8.RuneError, false
	}

	// Accumulate the digits most-significant first, four bits at a time.
	var value rune
	for _, c := range in[2:escapeLen] {
		digit := h2I(c)
		if digit == badHex {
			return utf8.RuneError, false
		}
		value = value<<4 | rune(digit)
	}
	return value, true
}

// decodeUnicodeEscape decodes the Unicode escape at the start of 'in', handling both a
// lone \uXXXX escape and a UTF-16 surrogate pair written as \uXXXX\uXXXX.
//
// It returns the decoded rune and the number of input bytes consumed (6 for a single
// escape, 12 for a surrogate pair). On any error it returns (utf8.RuneError, -1).
//
// Like decodeSingleUnicodeEscape, the `\u` prefix of the FIRST escape is assumed to be
// present and is not checked. The prefix of the second escape of a pair is checked,
// because nothing else has looked at those bytes yet.
//
// Errors are reported for: a malformed or truncated escape, a high surrogate that is not
// followed by a low-surrogate escape, and a low surrogate that appears on its own.
func decodeUnicodeEscape(in []byte) (rune, int) {
	// Inclusive upper bounds of the two surrogate ranges; the lower bounds are the
	// package-level highSurrogateOffset and lowSurrogateOffset.
	const (
		highSurrogateMax = 0xDBFF
		lowSurrogateMax  = 0xDFFF
	)

	r, ok := decodeSingleUnicodeEscape(in)
	if !ok {
		// Invalid Unicode escape
		return utf8.RuneError, -1
	}

	if r < highSurrogateOffset || r > lowSurrogateMax {
		// Ordinary Basic Multilingual Plane code point. This includes U+E000..U+FFFF,
		// which lie above the surrogate range but still fit in a single escape.
		return r, 6
	}

	if r > highSurrogateMax {
		// A low surrogate cannot start a pair.
		return utf8.RuneError, -1
	}

	// r is a high surrogate: a complete `\uXXXX` low-surrogate escape must follow.
	// Slicing at 6 is safe because the successful decode above implies len(in) >= 6.
	rest := in[6:]
	if len(rest) < 2 || rest[0] != '\\' || rest[1] != 'u' {
		return utf8.RuneError, -1
	}

	low, ok := decodeSingleUnicodeEscape(rest)
	if !ok || low < lowSurrogateOffset || low > lowSurrogateMax {
		// Missing, malformed or out-of-range low surrogate
		return utf8.RuneError, -1
	}

	// Valid UTF-16 surrogate pair
	return combineUTF16Surrogates(r, low), 12
}

// unescapeToUTF8 unescapes the single escape sequence at the start of 'in', writing the
// decoded bytes to the start of 'out'.
//
// It returns the number of bytes consumed from 'in' and the number of bytes written to
// 'out'. If 'in' does not begin with a valid escape sequence, (-1, -1) is returned.
//
// 'out' must be large enough for the decoded result: one byte for a two-character
// escape, and whatever utf8.EncodeRune needs for a Unicode escape (at most utf8.UTFMax).
func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
	if len(in) < 2 || in[0] != '\\' {
		// Too short to hold any escape, or no leading backslash
		return -1, -1
	}

	// https://tools.ietf.org/html/rfc7159#section-7
	var decoded byte
	switch e := in[1]; e {
	case '"', '\\', '/':
		// These escapes stand for the character itself.
		decoded = e
	case 'b':
		decoded = '\b'
	case 'f':
		decoded = '\f'
	case 'n':
		decoded = '\n'
	case 'r':
		decoded = '\r'
	case 't':
		decoded = '\t'
	case 'u':
		// Unicode escape (possibly a surrogate pair); re-encode the rune as UTF-8.
		r, consumed := decodeUnicodeEscape(in)
		if consumed == -1 {
			return -1, -1
		}
		return consumed, utf8.EncodeRune(out, r)
	default:
		// Not an escape character defined by JSON
		return -1, -1
	}

	out[0] = decoded
	return 2, 1
}

// Unescape returns the contents of 'in' with every backslash escape sequence decoded.
//
// If 'in' contains no backslash, 'in' itself is returned and 'out' is not touched.
// Otherwise the result is built in 'out' when cap(out) >= len(in), with no allocation;
// when 'out' is smaller than that, a new slice of len(in) bytes is allocated instead.
// The returned slice is trimmed to the number of bytes actually produced.
//
// If any escape sequence is invalid, (nil, MalformedStringEscapeError) is returned.
func Unescape(in, out []byte) ([]byte, error) {
	escapeAt := bytes.IndexByte(in, '\\')
	if escapeAt == -1 {
		return in, nil
	}

	// Obtain a working buffer of exactly len(in) bytes, reusing 'out' when it is big enough.
	if cap(out) >= len(in) {
		out = out[:len(in)]
	} else {
		out = make([]byte, len(in))
	}

	// Everything before the first backslash is copied through verbatim.
	written := copy(out, in[:escapeAt])
	pending := in[escapeAt:]

	// Invariant at the top of each iteration: 'pending' starts with a backslash.
	for len(pending) > 0 {
		consumed, emitted := unescapeToUTF8(pending, out[written:])
		if consumed == -1 {
			return nil, MalformedStringEscapeError
		}
		pending = pending[consumed:]
		written += emitted

		// Copy the literal run up to the next backslash, or to the end of the input.
		literalLen := bytes.IndexByte(pending, '\\')
		if literalLen == -1 {
			literalLen = len(pending)
		}
		written += copy(out[written:], pending[:literalLen])
		pending = pending[literalLen:]
	}

	return out[:written], nil
}
189 changes: 189 additions & 0 deletions escape_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
package jsonparser

import (
"bytes"
"testing"
)

// TestH2I checks h2I against the boundary digits of each hexadecimal range and
// against two bytes that are not hexadecimal digits.
func TestH2I(t *testing.T) {
	cases := []struct {
		char byte
		want int
	}{
		{'0', 0},
		{'9', 9},
		{'A', 10},
		{'F', 15},
		{'a', 10},
		{'f', 15},
		{'x', -1},
		{'\000', -1},
	}

	for _, tc := range cases {
		got := h2I(tc.char)
		if got == tc.want {
			continue
		}
		t.Errorf("h2I('%c') returned wrong value (obtained %d, expected %d)", tc.char, got, tc.want)
	}
}

// escapedUnicodeRuneTest is one table entry for the Unicode-escape decoder tests.
type escapedUnicodeRuneTest struct {
	in    string // escaped input handed to the decoder
	isErr bool   // whether decoding is expected to fail
	out   rune   // expected rune when decoding succeeds
	len   int    // expected number of input bytes consumed when decoding succeeds
}

// commonUnicodeEscapeTests holds cases whose expected result is the same for
// decodeSingleUnicodeEscape and decodeUnicodeEscape.
var commonUnicodeEscapeTests = []escapedUnicodeRuneTest{
	{in: `\u0041`, out: 'A', len: 6},
	{in: `\u0000`, out: 0, len: 6},
	{in: `\u00b0`, out: '°', len: 6},
	{in: `\u00B0`, out: '°', len: 6},

	{in: `\x1234`, out: 0x1234, len: 6}, // These functions do not check the \u prefix

	{in: ``, isErr: true},
	{in: `\`, isErr: true},
	{in: `\u`, isErr: true},
	{in: `\u1`, isErr: true},
	{in: `\u11`, isErr: true},
	{in: `\u111`, isErr: true},
	{in: `\u123X`, isErr: true},
}

// singleUnicodeEscapeTests adds cases specific to decodeSingleUnicodeEscape, which
// decodes surrogate halves as plain 16-bit values.
var singleUnicodeEscapeTests = append([]escapedUnicodeRuneTest{
	{in: `\uD83D`, out: 0xD83D, len: 6},
	{in: `\uDE03`, out: 0xDE03, len: 6},
	{in: `\uFFFF`, out: 0xFFFF, len: 6},
}, commonUnicodeEscapeTests...)

// multiUnicodeEscapeTests adds cases specific to decodeUnicodeEscape, which must
// pair surrogates and reject unpaired ones.
//
// U+FFFF is deliberately not listed as an error: RFC 7159 section 7 allows any Basic
// Multilingual Plane character (U+0000 through U+FFFF) to be written as a single
// \uXXXX escape, so only values inside the surrogate range may be rejected when they
// appear alone.
var multiUnicodeEscapeTests = append([]escapedUnicodeRuneTest{
	{in: `\uD83D`, isErr: true}, // lone high surrogate
	{in: `\uDE03`, isErr: true}, // lone low surrogate
	{in: `\uDBFF`, isErr: true}, // lone high surrogate at the top of its range

	{in: `\uD83D\uDE03`, out: '\U0001F603', len: 12},
	{in: `\uD800\uDC00`, out: '\U00010000', len: 12},

	{in: `\uD800\`, isErr: true},
	{in: `\uD800\u`, isErr: true},
	{in: `\uD800\uD`, isErr: true},
	{in: `\uD800\uDC`, isErr: true},
	{in: `\uD800\uDC0`, isErr: true},
	{in: `\uD800\uDBFF`, isErr: true}, // invalid low surrogate
}, commonUnicodeEscapeTests...)

// TestDecodeSingleUnicodeEscape runs decodeSingleUnicodeEscape over
// singleUnicodeEscapeTests, checking first the success/failure status and then,
// for successful decodes, the resulting rune.
func TestDecodeSingleUnicodeEscape(t *testing.T) {
	for _, test := range singleUnicodeEscapeTests {
		r, ok := decodeSingleUnicodeEscape([]byte(test.in))
		failed := !ok

		switch {
		case failed != test.isErr:
			t.Errorf("decodeSingleUnicodeEscape(%s) returned isErr mismatch: expected %t, obtained %t", test.in, test.isErr, failed)
		case failed:
			// Expected failure: there is no rune to compare.
		case r != test.out:
			t.Errorf("decodeSingleUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", test.in, test.out, test.out, r, r)
		}
	}
}

// TestDecodeUnicodeEscape runs decodeUnicodeEscape over multiUnicodeEscapeTests,
// checking in order the success/failure status, the number of bytes consumed,
// and the decoded rune. A returned length of -1 is treated as failure.
func TestDecodeUnicodeEscape(t *testing.T) {
	for _, test := range multiUnicodeEscapeTests {
		r, consumed := decodeUnicodeEscape([]byte(test.in))
		failed := consumed == -1

		switch {
		case failed != test.isErr:
			t.Errorf("decodeUnicodeEscape(%s) returned isErr mismatch: expected %t, obtained %t", test.in, test.isErr, failed)
		case failed:
			// Expected failure: length and rune are not meaningful.
		case consumed != test.len:
			t.Errorf("decodeUnicodeEscape(%s) returned length mismatch: expected %d, obtained %d", test.in, test.len, consumed)
		case r != test.out:
			t.Errorf("decodeUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", test.in, test.out, test.out, r, r)
		}
	}
}

// unescapeTest is one table entry for TestUnescape.
type unescapeTest struct {
	in       string // escaped string
	out      string // expected unescaped string
	canAlloc bool   // can unescape cause an allocation (depending on buffer size)? true iff 'in' contains escape sequence(s)
	isErr    bool   // should this operation result in an error
}

// unescapeTests lists inputs without escapes, inputs with valid escapes, and
// inputs with malformed or unpaired escapes.
//
// U+FFFF is deliberately not listed as an error: RFC 7159 section 7 allows any Basic
// Multilingual Plane character (U+0000 through U+FFFF) to be written as a single
// \uXXXX escape.
var unescapeTests = []unescapeTest{
	{in: ``, out: ``, canAlloc: false},
	{in: `a`, out: `a`, canAlloc: false},
	{in: `abcde`, out: `abcde`, canAlloc: false},

	{in: `ab\\de`, out: `ab\de`, canAlloc: true},
	{in: `ab\"de`, out: `ab"de`, canAlloc: true},
	{in: `ab \u00B0 de`, out: `ab ° de`, canAlloc: true},
	{in: `ab \uD83D\uDE03 de`, out: "ab \U0001F603 de", canAlloc: true},
	{in: `\u0000\u0000\u0000\u0000\u0000`, out: "\u0000\u0000\u0000\u0000\u0000", canAlloc: true},
	{in: `\u0000 \u0000 \u0000 \u0000 \u0000`, out: "\u0000 \u0000 \u0000 \u0000 \u0000", canAlloc: true},
	{in: ` \u0000 \u0000 \u0000 \u0000 \u0000 `, out: " \u0000 \u0000 \u0000 \u0000 \u0000 ", canAlloc: true},

	{in: `\uD800`, isErr: true},
	{in: `\uDBFF`, isErr: true}, // lone high surrogate at the top of its range
	{in: `abcde\`, isErr: true},
	{in: `abcde\x`, isErr: true},
	{in: `abcde\u`, isErr: true},
	{in: `abcde\u1`, isErr: true},
	{in: `abcde\u12`, isErr: true},
	{in: `abcde\u123`, isErr: true},
	{in: `abcde\uD800`, isErr: true},
	{in: `ab\uD800de`, isErr: true},
	{in: `\uD800abcde`, isErr: true},
}

// isSameMemory reports whether the first byte of a's backing storage and the first
// byte of b's backing storage are the same memory location, which is the case when
// both slices start at the same address (lengths and capacities may differ).
//
// If either slice has zero capacity there is no byte to probe, and the result is
// true only when both capacities are zero. Otherwise the probe briefly increments
// a's first byte, checks whether b's first byte changed with it, and restores the
// original value before returning.
func isSameMemory(a, b []byte) bool {
	if cap(a) == 0 || cap(b) == 0 {
		return cap(a) == cap(b)
	}

	// Reslice to one byte so the first element is addressable even when len is 0.
	headA, headB := a[:1], b[:1]
	if headA[0] != headB[0] {
		// Different values cannot be the same byte.
		return false
	}

	headA[0]++
	shared := headA[0] == headB[0]
	headA[0]--
	return shared
}

// TestUnescape runs every unescapeTests entry through Unescape with several
// caller-supplied buffer capacities and checks, in order: the error status, the
// unescaped bytes, and whether the result was freshly allocated.
//
// An allocation is expected only when the input contains escapes (test.canAlloc)
// and the supplied buffer is smaller than the input (isTooSmall). The first
// mismatch for an input stops the remaining buffer sizes for that input.
func TestUnescape(t *testing.T) {
	type bufferTestCase struct {
		buf        []byte
		isTooSmall bool
	}

	for _, test := range unescapeTests {
		var bufs []bufferTestCase

		if len(test.in) == 0 {
			// If the input string is length 0, only a buffer of size 0 is a meaningful test
			bufs = []bufferTestCase{{nil, false}}
		} else {
			// For non-empty input strings, we can try several buffer sizes (0, len-1, len)
			bufs = []bufferTestCase{
				{nil, true},
				{make([]byte, 0, len(test.in)-1), true},
				{make([]byte, 0, len(test.in)), false},
			}
		}

		for _, buftest := range bufs {
			in := []byte(test.in)
			buf := buftest.buf

			out, err := Unescape(in, buf)
			isErr := err != nil
			// The result is "allocated" when it lives in neither the input nor the supplied buffer.
			isAlloc := !isSameMemory(out, in) && !isSameMemory(out, buf)
			wantAlloc := test.canAlloc && buftest.isTooSmall

			if isErr != test.isErr {
				t.Errorf("Unescape(`%s`, bufsize=%d) returned isErr mismatch: expected %t, obtained %t", test.in, cap(buf), test.isErr, isErr)
				break
			}
			if isErr {
				continue
			}
			if !bytes.Equal(out, []byte(test.out)) {
				t.Errorf("Unescape(`%s`, bufsize=%d) returned unescaped mismatch: expected `%s` (%v, len %d), obtained `%s` (%v, len %d)", test.in, cap(buf), test.out, []byte(test.out), len(test.out), string(out), out, len(out))
				break
			}
			if isAlloc != wantAlloc {
				// Report the value that was actually compared against, not just isTooSmall.
				t.Errorf("Unescape(`%s`, bufsize=%d) returned isAlloc mismatch: expected %t, obtained %t", test.in, cap(buf), wantAlloc, isAlloc)
				break
			}
		}
	}
}
Loading