Initial work on handling escaped strings in JSON

daboyuka · daboyuka · commit b88eaff40e19 · 2016-04-07T12:05:07.000-04:00
diff --git a/parser.go b/parser.go
@@ -11,15 +11,20 @@ import (
 
 // Errors
 var (
-	KeyPathNotFoundError = errors.New("Key path not found")
-	UnknownValueTypeError = errors.New("Unknown value type")
-	MalformedJsonError = errors.New("Malformed JSON error")
-	MalformedStringError = errors.New("Value is string, but can't find closing '\"' symbol")
-	MalformedArrayError = errors.New("Value is array, but can't find closing ']' symbol")
-	MalformedObjectError = errors.New("Value looks like object, but can't find closing '}' symbol")
-	MalformedValueError = errors.New("Value looks like Number/Boolean/None, but can't find its end: ',' or '}' symbol")
+	KeyPathNotFoundError       = errors.New("Key path not found")
+	UnknownValueTypeError      = errors.New("Unknown value type")
+	MalformedJsonError         = errors.New("Malformed JSON error")
+	MalformedStringError       = errors.New("Value is string, but can't find closing '\"' symbol")
+	MalformedArrayError        = errors.New("Value is array, but can't find closing ']' symbol")
+	MalformedObjectError       = errors.New("Value looks like object, but can't find closing '}' symbol")
+	MalformedValueError        = errors.New("Value looks like Number/Boolean/None, but can't find its end: ',' or '}' symbol")
+	MalformedStringEscapeError = errors.New("Encountered an invalid escape sequence in a string")
 )
 
+// How much stack space to allocate for unescaping JSON strings; if a string longer
+// than this needs to be unescaped, it will result in a heap allocation
+const unescapeStackBufSize = 64
+
 func tokenEnd(data []byte) int {
 	for i, c := range data {
 		switch c {
@@ -31,7 +36,6 @@ func tokenEnd(data []byte) int {
 	return -1
 }
 
-
 // Find position of next character which is not ' ', ',', '}' or ']'
 func nextToken(data []byte, skipComma bool) int {
 	for i, c := range data {
@@ -133,10 +137,10 @@ func searchKeys(data []byte, keys ...string) int {
 			i += valueOffset
 
 			// if string is a Key, and key level match
-			if data[i] == ':'{
+			if data[i] == ':' {
 				key := unsafeBytesToString(data[keyBegin:keyEnd])
 
-			 	if keyLevel == level-1 && // If key nesting level match current object nested level
+				if keyLevel == level-1 && // If key nesting level match current object nested level
 					keys[level-1] == key {
 					keyLevel++
 					// If we found all keys in path
@@ -392,9 +396,10 @@ func GetString(data []byte, keys ...string) (val string, err error) {
 		return string(v), nil
 	}
 
-	s, err := strconv.Unquote(`"` + unsafeBytesToString(v) + `"`)
+	var stackbuf [unescapeStackBufSize]byte // stack-allocated array for allocation-free unescaping of small strings
+	out, err := unescape(v, stackbuf[:])
 
-	return s, err
+	return string(out), err
 }
 
 // GetFloat returns the value retrieved by `Get`, cast to a float64 if possible.
diff --git a/parser_test.go b/parser_test.go
@@ -326,19 +326,26 @@ var getFloatTests = []Test{
 
 var getStringTests = []Test{
 	Test{
-		desc:    `Translate unicode symbols`,
+		desc:    `Translate Unicode symbols`,
 		json:    `{"c": "test"}`,
 		path:    []string{"c"},
 		isFound: true,
 		data:    `test`,
 	},
 	Test{
-		desc:    `Translate unicode symbols`,
+		desc:    `Translate Unicode symbols`,
 		json:    `{"c": "15\u00b0C"}`,
 		path:    []string{"c"},
 		isFound: true,
 		data:    `15°C`,
 	},
+	Test{
+		desc:    `Translate supplementary Unicode symbols`,
+		json:    `{"c": "\uD83D\uDE03"}`, // Smiley face (UTF16 surrogate pair)
+		path:    []string{"c"},
+		isFound: true,
+		data:    "\U0001F603", // Smiley face
+	},
 	Test{
 		desc:    `Translate escape symbols`,
 		json:    `{"c": "\\\""}`,
diff --git a/parserescapes.go b/parserescapes.go
@@ -0,0 +1,146 @@
+package jsonparser
+
+import (
+	"bytes"
+	"unicode/utf8"
+)
+
+// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7
+
+// UTF-16 surrogate pair constants. A code point above U+FFFF is written in JSON as two
+// \uXXXX escapes: a high surrogate in U+D800..U+DBFF followed by a low surrogate in
+// U+DC00..U+DFFF.
+const (
+	// highSurrogateOffset is the first code unit of the high (leading) surrogate range.
+	highSurrogateOffset = 0xD800
+	// lowSurrogateOffset is the first code unit of the low (trailing) surrogate range.
+	lowSurrogateOffset = 0xDC00
+	// supplementaryPlanesOffset is the first code point that needs a surrogate pair.
+	supplementaryPlanesOffset = 0x10000
+)
+
+// combineUTF16Surrogates merges a UTF-16 surrogate pair into the code point it encodes.
+// Each surrogate contributes 10 bits (high surrogate in the upper 10, low surrogate in the
+// lower 10), and the resulting 20-bit value is offset by 0x10000.
+// The caller is responsible for passing a high surrogate in 'high' and a low surrogate in
+// 'low'; no range validation is performed here.
+func combineUTF16Surrogates(high, low rune) rune {
+	return supplementaryPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
+}
+
+// badHex is the sentinel h2I returns for a byte that is not a hexadecimal digit.
+const badHex = -1
+
+// h2I converts one ASCII hexadecimal digit (either letter case) to its numeric value
+// in the range 0..15. Any other byte yields badHex.
+func h2I(c byte) int {
+	if '0' <= c && c <= '9' {
+		return int(c - '0')
+	}
+	if 'A' <= c && c <= 'F' {
+		return 10 + int(c-'A')
+	}
+	if 'a' <= c && c <= 'f' {
+		return 10 + int(c-'a')
+	}
+	return badHex
+}
+
+// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence found at the start of 'in'.
+// In JSON, these can either come alone or as part of "UTF-16 surrogate pairs" that must be
+// handled together; this function only handles one escape at a time and returns the raw
+// 16-bit value without interpreting surrogates.
+//
+// It returns (value, true) on success. It returns (utf8.RuneError, false) if 'in' is shorter
+// than 6 bytes, does not begin with the two bytes `\u`, or contains a non-hexadecimal
+// character among the four digits.
+func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
+	// A complete escape is 6 bytes: backslash, 'u', and four hex digits. The prefix must be
+	// verified here because callers also invoke this on the bytes that follow a high
+	// surrogate, where nothing else has checked that another escape actually starts.
+	if len(in) < 6 || in[0] != '\\' || in[1] != 'u' {
+		return utf8.RuneError, false
+	}
+
+	// Convert each hex digit to its numeric value
+	h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5])
+	if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
+		return utf8.RuneError, false
+	}
+
+	// Compose the four 4-bit digits, most significant first
+	return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
+}
+
+// decodeUnicodeEscape decodes the Unicode escape at the start of 'in', which is either a single
+// \uXXXX escape for a non-surrogate code point or a UTF-16 surrogate pair written as two
+// consecutive \uXXXX escapes.
+//
+// It returns the decoded rune and the number of input bytes consumed (6 or 12). On any error
+// (malformed escape, a low surrogate with no preceding high surrogate, or a high surrogate not
+// followed by a valid low surrogate escape) it returns (utf8.RuneError, -1).
+func decodeUnicodeEscape(in []byte) (rune, int) {
+	// Bounds of the UTF-16 surrogate code unit ranges: high surrogates occupy
+	// U+D800..U+DBFF and low surrogates occupy U+DC00..U+DFFF.
+	const (
+		highSurrogateMin = 0xD800
+		lowSurrogateMin  = 0xDC00
+		lowSurrogateMax  = 0xDFFF
+	)
+
+	r, ok := decodeSingleUnicodeEscape(in)
+	if !ok {
+		// Invalid Unicode escape
+		return utf8.RuneError, -1
+	}
+
+	if r < highSurrogateMin || r > lowSurrogateMax {
+		// Valid Unicode escape in the Basic Multilingual Plane (on either side of the surrogate range)
+		return r, 6
+	}
+
+	if r >= lowSurrogateMin {
+		// UTF-16 "low surrogate" appearing first, without a preceding "high surrogate"
+		return utf8.RuneError, -1
+	}
+
+	// Note: the successful decodeSingleUnicodeEscape above guarantees len(in) >= 6, so in[6:] cannot panic
+	r2, ok := decodeSingleUnicodeEscape(in[6:])
+	if !ok || r2 < lowSurrogateMin || r2 > lowSurrogateMax {
+		// UTF-16 "high surrogate" without the mandatory valid following escape for the "low surrogate"
+		return utf8.RuneError, -1
+	}
+
+	// Valid UTF-16 surrogate pair
+	return combineUTF16Surrogates(r, r2), 12
+}
+
+// unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns
+// how many bytes were consumed from 'in' and how many bytes were emitted into 'out'.
+// If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) is returned to
+// signal the error.
+//
+// 'out' must have room for the decoded bytes: 1 byte for a two-character escape, and up to
+// utf8.UTFMax bytes for a Unicode escape. The decoded form is never longer than the escape.
+func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
+	if len(in) < 2 || in[0] != '\\' {
+		// Invalid escape due to insufficient characters for any escape or no initial backslash
+		return -1, -1
+	}
+
+	// https://tools.ietf.org/html/rfc7159#section-7
+	switch e := in[1]; e {
+	case '"', '\\', '/':
+		// These escapes stand for the character itself
+		out[0] = e
+	case 'b':
+		out[0] = '\b'
+	case 'f':
+		out[0] = '\f'
+	case 'n':
+		out[0] = '\n'
+	case 'r':
+		out[0] = '\r'
+	case 't':
+		out[0] = '\t'
+	case 'u':
+		// Unicode escape (possibly a surrogate pair)
+		r, consumed := decodeUnicodeEscape(in)
+		if consumed == -1 {
+			// Invalid Unicode escape
+			return -1, -1
+		}
+		// Valid Unicode escape; re-encode as UTF-8
+		return consumed, utf8.EncodeRune(out, r)
+	default:
+		// Not an escape character permitted by JSON
+		return -1, -1
+	}
+
+	// All basic escapes consume 2 input bytes and emit 1 output byte
+	return 2, 1
+}
+
+// unescape decodes every JSON escape sequence in 'in' and returns the result as a slice.
+//
+// When 'in' has no backslash at all, 'in' itself is returned untouched.
+// Otherwise the decoded bytes are built in 'out' if cap(out) >= len(in), so no allocation
+// happens; if 'out' is too small, a fresh slice of len(in) bytes is allocated instead.
+// An invalid escape sequence yields (nil, MalformedStringEscapeError).
+func unescape(in, out []byte) ([]byte, error) {
+	escapeAt := bytes.IndexByte(in, '\\')
+	if escapeAt == -1 {
+		return in, nil
+	}
+
+	// Obtain a destination exactly len(in) bytes long, reusing the caller's buffer when it fits
+	if len(in) <= cap(out) {
+		out = out[:len(in)]
+	} else {
+		out = make([]byte, len(in))
+	}
+
+	// written counts the bytes emitted into out so far; start with the literal prefix
+	written := copy(out, in[:escapeAt])
+	rest := in[escapeAt:]
+
+	for len(rest) > 0 {
+		// rest begins at a backslash here: decode that one escape sequence
+		consumed, emitted := unescapeToUTF8(rest, out[written:])
+		if consumed == -1 {
+			return nil, MalformedStringEscapeError
+		}
+		rest = rest[consumed:]
+		written += emitted
+
+		// Carry over the literal run that precedes the next backslash (or the whole tail)
+		escapeAt = bytes.IndexByte(rest, '\\')
+		if escapeAt == -1 {
+			written += copy(out[written:], rest)
+			break
+		}
+		written += copy(out[written:], rest[:escapeAt])
+		rest = rest[escapeAt:]
+	}
+
+	// Only the first 'written' bytes of out hold decoded data
+	return out[:written], nil
+}
diff --git a/parserescapes_test.go b/parserescapes_test.go
@@ -0,0 +1,51 @@
+package jsonparser
+
+import (
+	"testing"
+)
+
+// TestH2I checks h2I on the boundary digits of each accepted range ('0'-'9', 'A'-'F',
+// 'a'-'f') and on two bytes that are not hexadecimal digits.
+func TestH2I(t *testing.T) {
+	cases := []struct {
+		c    byte
+		want int
+	}{
+		{'0', 0},
+		{'9', 9},
+		{'A', 10},
+		{'F', 15},
+		{'a', 10},
+		{'f', 15},
+		{'x', -1},
+		{'\000', -1},
+	}
+
+	for _, tc := range cases {
+		got := h2I(tc.c)
+		if got != tc.want {
+			t.Errorf("h2I('%c') returned wrong value (obtained %d, expected %d)", tc.c, got, tc.want)
+		}
+	}
+}
+
+// TestDecodeSingleUnicodeEscape checks decodeSingleUnicodeEscape on well-formed \uXXXX
+// escapes (including a lone surrogate half, which this function returns as a raw value)
+// and on inputs it must reject: truncated escapes, non-hex digits, and the two-character
+// escapes that are not Unicode escapes at all.
+//
+// Input and expectation live in the same table entry so the two can never get out of step.
+func TestDecodeSingleUnicodeEscape(t *testing.T) {
+	// 0xFFFD is utf8.RuneError, the rune returned alongside ok == false
+	const runeError = 0xFFFD
+
+	cases := []struct {
+		esc string
+		r   rune
+		ok  bool
+	}{
+		// Valid escapes
+		{`\u0041`, 'A', true},
+		{`\u00b0`, 0x00B0, true},
+		{`\u00B0`, 0x00B0, true},
+		{`\uFFFF`, 0xFFFF, true},
+		{`\uD83D`, 0xD83D, true},        // surrogate half: returned raw, not interpreted
+		{`\u0041trailing`, 'A', true}, // only the first 6 bytes are examined
+
+		// Truncated or malformed Unicode escapes
+		{``, runeError, false},
+		{`\u`, runeError, false},
+		{`\u004`, runeError, false},
+		{`\u00zz`, runeError, false},
+		{`\ug000`, runeError, false},
+
+		// Two-character escapes are too short to be Unicode escapes
+		{`\"`, runeError, false},
+		{`\\`, runeError, false},
+		{`\n`, runeError, false},
+		{`\t`, runeError, false},
+		{`\r`, runeError, false},
+		{`\/`, runeError, false},
+		{`\b`, runeError, false},
+		{`\f`, runeError, false},
+	}
+
+	for _, expected := range cases {
+		esc := expected.esc
+		if r, ok := decodeSingleUnicodeEscape([]byte(esc)); ok != expected.ok {
+			t.Errorf("decodeSingleUnicodeEscape(%s) returned 'ok' mismatch: expected %t, obtained %t", esc, expected.ok, ok)
+		} else if r != expected.r {
+			t.Errorf("decodeSingleUnicodeEscape(%s) returned rune mismatch: expected %x (%c), obtained %x (%c)", esc, expected.r, expected.r, r, r)
+		}
+	}
+}