Improve performance of writing raw UTF-8 encoded byte arrays

JoostK · JoostK · commit 2493a9f6a605 · 2024-10-20T14:07:24.000+02:00
The output escape table covers just 7 bits, meaning that a raw UTF-8 byte cannot
be used to index into the table without a branch test for negative bytes (i.e. bytes
larger than 0x7F when read as unsigned). This extra check occurs in a tight loop and
could be avoided if the lookup table were to cover all 8-bit indices.

This commit introduces ad-hoc logic in `UTF8JsonGenerator#writeUTF8String` to create
an extended copy of `_outputEscapes` if necessary, writing the copy back into the field
to avoid having to compute it again (unless the field is later replaced with a shorter
table). This ad-hoc strategy was chosen as it is the least disruptive to existing code;
a larger-scale change around `CharacterEscapes` would impact the public API or otherwise
introduce subtle chances for breakage.
diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8JsonGenerator.java
@@ -647,11 +647,16 @@ public void writeUTF8String(byte[] text, int offset, int len) throws IOException
             _flushBuffer();
         }
         _outputBuffer[_outputTail++] = _quoteChar;
+
+        // When writing raw UTF-8 encoded bytes, it is beneficial if the escaping table can directly be indexed into
+        // using the byte value.
+        final int[] extendedOutputEscapes = _extendOutputEscapesTo8Bits();
+
         // One or multiple segments?
         if (len <= _outputMaxContiguous) {
-            _writeUTF8Segment(text, offset, len);
+            _writeUTF8Segment(text, offset, len, extendedOutputEscapes);
         } else {
-            _writeUTF8Segments(text, offset, len);
+            _writeUTF8Segments(text, offset, len, extendedOutputEscapes);
         }
         if (_outputTail >= _outputEnd) {
             _flushBuffer();
@@ -1846,28 +1851,26 @@ private final int _handleLongCustomEscape(byte[] outputBuffer, int outputPtr, in
      * to fit in the output buffer after escaping; as such, we just need to
      * chunk writes.
      */
-    private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen)
+    private final void _writeUTF8Segments(byte[] utf8, int offset, int totalLen, final int[] extendedOutputEscapes)
         throws IOException, JsonGenerationException
     {
         do {
             int len = Math.min(_outputMaxContiguous, totalLen);
-            _writeUTF8Segment(utf8, offset, len);
+            _writeUTF8Segment(utf8, offset, len, extendedOutputEscapes); // forward the caller's table unchanged to each chunk
             offset += len;
             totalLen -= len;
         } while (totalLen > 0);
     }
 
-    private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len)
+    private final void _writeUTF8Segment(byte[] utf8, final int offset, final int len, final int[] extendedOutputEscapes)
         throws IOException, JsonGenerationException
     {
         // fast loop to see if escaping is needed; don't copy, just look
-        final int[] escCodes = _outputEscapes;
-
         for (int ptr = offset, end = offset + len; ptr < end; ) {
             // 28-Feb-2011, tatu: escape codes just cover 7-bit range, so:
-            int ch = utf8[ptr++];
-            if ((ch >= 0) && escCodes[ch] != 0) {
-                _writeUTF8Segment2(utf8, offset, len);
+            int ch = utf8[ptr++] & 0xFF;
+            if (extendedOutputEscapes[ch] != 0) {
+                _writeUTF8Segment2(utf8, offset, len, extendedOutputEscapes);
                 return;
             }
         }
@@ -1880,7 +1883,7 @@ private final void _writeUTF8Segment(byte[] utf8, final int offset, final int le
         _outputTail += len;
     }
 
-    private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
+    private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len, final int[] extendedOutputEscapes)
         throws IOException, JsonGenerationException
     {
         int outputPtr = _outputTail;
@@ -1892,17 +1895,16 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
         }
 
         final byte[] outputBuffer = _outputBuffer;
-        final int[] escCodes = _outputEscapes;
         len += offset; // so 'len' becomes 'end'
 
         while (offset < len) {
             byte b = utf8[offset++];
-            int ch = b;
-            if (ch < 0 || escCodes[ch] == 0) {
+            int ch = b & 0xFF;
+            int escape = extendedOutputEscapes[ch];
+            if (escape == 0) {
                 outputBuffer[outputPtr++] = b;
                 continue;
             }
-            int escape = escCodes[ch];
             if (escape > 0) { // 2-char escape, fine
                 outputBuffer[outputPtr++] = BYTE_BACKSLASH;
                 outputBuffer[outputPtr++] = (byte) escape;
@@ -1914,6 +1916,18 @@ private final void _writeUTF8Segment2(final byte[] utf8, int offset, int len)
         _outputTail = outputPtr;
     }
 
+    // Returns _outputEscapes widened to 256 entries so any unsigned byte (0x00-0xFF) can index it directly.
+    private int[] _extendOutputEscapesTo8Bits() {
+        final int[] escapes = _outputEscapes;
+        if (escapes.length >= 0x100) { // already covers index 0xFF; reuse as-is
+            return escapes;
+        }
+        final int[] extended = new int[0x100]; // 256 slots (not 0xFF = 255); new slots stay 0 = written unescaped
+        System.arraycopy(escapes, 0, extended, 0, escapes.length);
+        _outputEscapes = extended; // cache so later calls take the early return above
+        return extended;
+    }
+
     /*
     /**********************************************************
     /* Internal methods, low-level writing, base64 encoded