Skip to content

Commit 2493a9f

Browse files
committed
Improve performance of writing raw UTF-8 encoded byte arrays
The output escape table covers just 7 bits, meaning that a raw UTF-8 byte cannot be used to index into the table without a branch test for negative bytes (i.e. bytes larger than 0x7F). This extra check occurs in a tight loop and can be avoided if the lookup table covers all 8-bit indices. This commit introduces ad-hoc logic in `UTF8JsonGenerator#writeUTF8String` to create an extended copy of `_outputEscapes` if necessary, writing the copy back into the field to avoid having to compute it again (unless it is changed). This ad-hoc strategy was chosen because it is the least disruptive to existing code: a larger-scale change around `CharacterEscapes` would impact the public API or otherwise risk subtle breakage.
1 parent 2128a70 commit 2493a9f

File tree

1 file changed

+29
-15
lines changed

1 file changed

+29
-15
lines changed

src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -647,11 +647,16 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException
647647
_flushBuffer();
648648
}
649649
_outputBuffer[_outputTail++] = _quoteChar;
650+
651+
// When writing raw UTF-8 encoded bytes, it is beneficial if the escaping table can directly be indexed into
652+
// using the byte value.
653+
final int[] extendedOutputEscapes = _extendOutputEscapesTo8Bits();
654+
650655
// One or multiple segments?
651656
if (len <= _outputMaxContiguous) {
652-
_writeUTF8Segment(text, offset, len);
657+
_writeUTF8Segment(text, offset, len, extendedOutputEscapes);
653658
} else {
654-
_writeUTF8Segments(text, offset, len);
659+
_writeUTF8Segments(text, offset, len, extendedOutputEscapes);
655660
}
656661
if (_outputTail >= _outputEnd) {
657662
_flushBuffer();
@@ -1846,28 +1851,26 @@ private final int _handleLongCustomEscape(byte[] outputBuffer, int outputPtr, in
18461851
* to fit in the output buffer after escaping; as such, we just need to
18471852
* chunk writes.
18481853
*/
1849-
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen)
1854+
private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen, final int[] extendedOutputEscapes)
18501855
throws IOException, JsonGenerationException
18511856
{
18521857
do {
18531858
int len = Math.min(_outputMaxContiguous, totalLen);
1854-
_writeUTF8Segment(utf8, offset, len);
1859+
_writeUTF8Segment(utf8, offset, len, extendedOutputEscapes);
18551860
offset += len;
18561861
totalLen -= len;
18571862
} while (totalLen > 0);
18581863
}
18591864

1860-
private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len)
1865+
private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len, final int[] extendedOutputEscapes)
18611866
throws IOException, JsonGenerationException
18621867
{
18631868
// fast loop to see if escaping is needed; don't copy, just look
1864-
final int[] escCodes = _outputEscapes;
1865-
18661869
for (int ptr = offset, end = offset + len; ptr < end; ) {
18671870
// 28-Feb-2011, tatu: escape codes just cover 7-bit range, so:
1868-
int ch = utf8[ptr++];
1869-
if ((ch >= 0) && escCodes[ch] != 0) {
1870-
_writeUTF8Segment2(utf8, offset, len);
1871+
int ch = utf8[ptr++] & 0xFF;
1872+
if (extendedOutputEscapes[ch] != 0) {
1873+
_writeUTF8Segment2(utf8, offset, len, extendedOutputEscapes);
18711874
return;
18721875
}
18731876
}
@@ -1880,7 +1883,7 @@ private final void _writeUTF8Segment(byte[] utf8, final int offset, final int le
18801883
_outputTail += len;
18811884
}
18821885

1883-
private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
1886+
private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len, final int[] extendedOutputEscapes)
18841887
throws IOException, JsonGenerationException
18851888
{
18861889
int outputPtr = _outputTail;
@@ -1892,17 +1895,16 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
18921895
}
18931896

18941897
final byte[] outputBuffer = _outputBuffer;
1895-
final int[] escCodes = _outputEscapes;
18961898
len += offset; // so 'len' becomes 'end'
18971899

18981900
while (offset < len) {
18991901
byte b = utf8[offset++];
1900-
int ch = b;
1901-
if (ch < 0 || escCodes[ch] == 0) {
1902+
int ch = b & 0xFF;
1903+
int escape = extendedOutputEscapes[ch];
1904+
if (escape == 0) {
19021905
outputBuffer[outputPtr++] = b;
19031906
continue;
19041907
}
1905-
int escape = escCodes[ch];
19061908
if (escape > 0) { // 2-char escape, fine
19071909
outputBuffer[outputPtr++] = BYTE_BACKSLASH;
19081910
outputBuffer[outputPtr++] = (byte) escape;
@@ -1914,6 +1916,18 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
19141916
_outputTail = outputPtr;
19151917
}
19161918

1919+
private int[] _extendOutputEscapesTo8Bits() {
1920+
final int[] escapes = _outputEscapes;
1921+
if (escapes.length >= 0xFF) {
1922+
return escapes;
1923+
}
1924+
1925+
final int[] extended = new int[0xFF];
1926+
System.arraycopy(escapes, 0, extended, 0, escapes.length);
1927+
_outputEscapes = extended;
1928+
return extended;
1929+
}
1930+
19171931
/*
19181932
/**********************************************************
19191933
/* Internal methods, low-level writing, base64 encoded

0 commit comments

Comments
 (0)