Skip to content

Commit 8cab937

Browse files
committed
Merge branch '2.18'
2 parents 7a28627 + 4d47aae commit 8cab937

File tree

8 files changed

+110
-32
lines changed

8 files changed

+110
-32
lines changed

release-notes/CREDITS-2.x

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,15 @@ Antonin Janec (@xtonic)
435435
* Contributed #1218: Simplify Unicode surrogate pair conversion for generation
436436
(2.17.0)
437437

438+
Ian Roberts (@ianroberts)
439+
* Reported #223: `UTF8JsonGenerator` writes supplementary characters as a
440+
surrogate pair: should use 4-byte encoding
441+
(2.18.0)
442+
443+
Radovan Netuka (@rnetuka)
444+
* Contributed fix for #223: `UTF8JsonGenerator` writes supplementary characters as a
445+
surrogate pair: should use 4-byte encoding
446+
438447
Jared Stehler (@jaredstehler)
439448
* Reported, contributed fix for #1274: `NUL`-corrupted keys, values on JSON serialization
440449
(2.18.0)

release-notes/VERSION-2.x

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ a pure JSON library.
1616

1717
2.18.0 (not yet released)
1818

19+
#223: `UTF8JsonGenerator` writes supplementary characters as a surrogate pair:
20+
should use 4-byte encoding
21+
(reported by Ian R)
22+
(fix contributed by Radovan N)
1923
#1230: Improve performance of `float` and `double` parsing from `TextBuffer`
2024
(implemented by @pjfanning)
2125
#1251: `InternCache` replace synchronized with `ReentrantLock` - the cache

src/main/java/tools/jackson/core/json/JsonWriteFeature.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,25 @@ public enum JsonWriteFeature
107107
*/
108108
ESCAPE_FORWARD_SLASHES(true),
109109

110+
/**
111+
* Feature that specifies how characters outside "Basic Multilingual Plane" (BMP) -- ones encoded
112+
* as 4-byte UTF-8 sequences but represented in JVM memory as 2 16-bit "surrogate" {@code chars} --
113+
* should be encoded as UTF-8 by {@link JsonGenerator}.
114+
* If enabled, surrogate pairs are combined and flushed as a
115+
* single, 4-byte UTF-8 character.
116+
* If disabled, each {@code char} of pair is written separately: that is, as 2
117+
* separate 6-byte JSON Unicode escapes (backslash, {@code u} and 4 hex digits) with values in Surrogate character ranges
118+
* ({@code 0xD800} - {@code 0xDBFF} and {@code 0xDC00} - {@code 0xDFFF}).
119+
* <p>
120+
* Note that this feature only has effect for {@link JsonGenerator}s that directly encode
121+
* {@code byte}-based output, as UTF-8 (target {@link java.io.OutputStream}, {@code byte[]}
122+
* and so on); it will not (can not) change handling of
123+
* {@code char}-based output (like {@link java.io.Writer} or {@link java.lang.String}).
124+
* <p>
125+
* Feature is enabled by default in Jackson 3.0 (was disabled in 2.x).
126+
*/
127+
COMBINE_UNICODE_SURROGATES_IN_UTF8(true),
128+
110129
;
111130

112131
final private boolean _defaultState;

src/main/java/tools/jackson/core/json/UTF8JsonGenerator.java

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1545,6 +1545,16 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
15451545
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
15461546
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
15471547
} else {
1548+
// 3- or 4-byte character
1549+
if (_isSurrogateChar(ch)) {
1550+
final boolean combineSurrogates = JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_formatWriteFeatures);
1551+
if (combineSurrogates && offset < end) {
1552+
char highSurrogate = (char) ch;
1553+
char lowSurrogate = cbuf[offset++];
1554+
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
1555+
continue;
1556+
}
1557+
}
15481558
outputPtr = _outputMultiByteChar(ch, outputPtr);
15491559
}
15501560
}
@@ -1583,6 +1593,16 @@ private final void _writeStringSegment2(final String text, int offset, final int
15831593
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
15841594
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
15851595
} else {
1596+
// 3- or 4-byte character
1597+
if (_isSurrogateChar(ch)) {
1598+
final boolean combineSurrogates = JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_formatWriteFeatures);
1599+
if (combineSurrogates && offset < end) {
1600+
char highSurrogate = (char) ch;
1601+
char lowSurrogate = text.charAt(offset++);
1602+
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
1603+
continue;
1604+
}
1605+
}
15861606
outputPtr = _outputMultiByteChar(ch, outputPtr);
15871607
}
15881608
}
@@ -2177,6 +2197,19 @@ protected final void _outputSurrogates(int surr1, int surr2) throws JacksonExcep
21772197
bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
21782198
}
21792199

2200+
// @since 2.18
2201+
private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
2202+
final int unicode = 0x10000 + ((highSurrogate & 0x03FF) << 10)
2203+
+ (lowSurrogate & 0x03FF);
2204+
2205+
_outputBuffer[outputPtr++] = (byte) (0xF0 + ((unicode >> 18) & 0x07));
2206+
_outputBuffer[outputPtr++] = (byte) (0x80 + ((unicode >> 12) & 0x3F));
2207+
_outputBuffer[outputPtr++] = (byte) (0x80 + ((unicode >> 6) & 0x3F));
2208+
_outputBuffer[outputPtr++] = (byte) (0x80 + (unicode & 0x3F));
2209+
2210+
return outputPtr;
2211+
}
2212+
21802213
/**
21812214
*
21822215
* @param ch
@@ -2262,5 +2295,10 @@ protected final void _flushBuffer() throws JacksonException
22622295
private byte[] getHexBytes() {
22632296
return _cfgWriteHexUppercase ? HEX_BYTES_UPPER : HEX_BYTES_LOWER;
22642297
}
2298+
2299+
// @since 2.18
2300+
private boolean _isSurrogateChar(int ch) {
2301+
return (ch & 0xD800) == 0xD800;
2302+
}
22652303
}
22662304

src/test/java/tools/jackson/core/json/StreamWriteFeaturesTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
import java.math.BigDecimal;
55
import java.math.BigInteger;
66

7+
import org.junit.jupiter.api.Test;
8+
79
import tools.jackson.core.*;
810
import tools.jackson.core.exc.StreamWriteException;
911

10-
import org.junit.jupiter.api.Test;
11-
1212
import static org.junit.jupiter.api.Assertions.*;
1313

1414
/**

src/test/java/tools/jackson/core/json/StringGenerationFromReaderTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@ class StringGenerationFromReaderTest
2424
"Longer text & other stuff:\twith some\r\n\r\n random linefeeds etc added in to cause some \"special\" handling \\\\ to occur...\n"
2525
};
2626

27-
private final JsonFactory FACTORY = newStreamFactory();
27+
// 17-Sep-2024, tatu: [core#223] change to surrogates, let's use old behavior
28+
// for now for simpler testing
29+
private final JsonFactory FACTORY = streamFactoryBuilder()
30+
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
31+
.build();
2832

2933
@Test
3034
void basicEscaping() throws Exception

src/test/java/tools/jackson/core/json/StringGenerationTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@ class StringGenerationTest
2424
"Longer text & other stuff:\twith some\r\n\r\n random linefeeds etc added in to cause some \"special\" handling \\\\ to occur...\n"
2525
};
2626

27-
private final JsonFactory FACTORY = new JsonFactory();
27+
// 17-Sep-2024, tatu: [core#223] change to surrogates, let's use old behavior
28+
// for now for simpler testing
29+
private final JsonFactory FACTORY = streamFactoryBuilder()
30+
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
31+
.build();
2832

2933
@Test
3034
void basicEscaping() throws Exception
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,26 @@
1-
package tools.jackson.failing;
1+
package tools.jackson.core.json;
22

33
import java.io.ByteArrayOutputStream;
44
import java.io.StringWriter;
55
import java.io.Writer;
66

7-
import tools.jackson.core.*;
8-
import tools.jackson.core.json.JsonFactory;
9-
107
import org.junit.jupiter.api.Test;
118

9+
import tools.jackson.core.*;
10+
1211
import static org.junit.jupiter.api.Assertions.assertEquals;
12+
import static org.junit.jupiter.api.Assertions.assertTrue;
1313

1414
class Surrogate223Test extends JUnit5TestBase
1515
{
16-
private final JsonFactory JSON_F = newStreamFactory();
16+
private final JsonFactory DEFAULT_JSON_F = newStreamFactory();
17+
18+
// for [core#223]
19+
@Test
20+
void surrogatesDefaultSetting() throws Exception {
21+
// default in 3.x should be enabled (was disabled in 2.x):
22+
assertTrue(DEFAULT_JSON_F.isEnabled(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8));
23+
}
1724

1825
// for [core#223]
1926
@Test
@@ -24,36 +31,41 @@ void surrogatesByteBacked() throws Exception
2431
final String toQuote = new String(Character.toChars(0x1F602));
2532
assertEquals(2, toQuote.length()); // just sanity check
2633

27-
// default should be disabled:
28-
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));
29-
3034
out = new ByteArrayOutputStream();
31-
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);
35+
36+
JsonFactory f = JsonFactory.builder()
37+
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
38+
.build();
39+
g = f.createGenerator(ObjectWriteContext.empty(), out);
3240
g.writeStartArray();
3341
g.writeString(toQuote);
3442
g.writeEndArray();
3543
g.close();
3644
assertEquals(2 + 2 + 4, out.size()); // brackets, quotes, 4-byte encoding
3745

3846
// Also parse back to ensure correctness
39-
JsonParser p = JSON_F.createParser(ObjectReadContext.empty(), out.toByteArray());
47+
JsonParser p = f.createParser(ObjectReadContext.empty(), out.toByteArray());
4048
assertToken(JsonToken.START_ARRAY, p.nextToken());
4149
assertToken(JsonToken.VALUE_STRING, p.nextToken());
50+
assertEquals(toQuote, p.getText());
4251
assertToken(JsonToken.END_ARRAY, p.nextToken());
4352
p.close();
4453

4554
// but may revert back to original behavior
4655
out = new ByteArrayOutputStream();
47-
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);
48-
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
56+
f = JsonFactory.builder()
57+
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
58+
.build();
59+
60+
g = f.createGenerator(ObjectWriteContext.empty(), out);
4961
g.writeStartArray();
5062
g.writeString(toQuote);
5163
g.writeEndArray();
5264
g.close();
5365
assertEquals(2 + 2 + 12, out.size()); // brackets, quotes, 2 x 6 byte JSON escape
5466
}
5567

56-
// for [core#223]
68+
// for [core#223]: no change for character-backed (cannot do anything)
5769
@Test
5870
void surrogatesCharBacked() throws Exception
5971
{
@@ -62,32 +74,20 @@ void surrogatesCharBacked() throws Exception
6274
final String toQuote = new String(Character.toChars(0x1F602));
6375
assertEquals(2, toQuote.length()); // just sanity check
6476

65-
// default should be disabled:
66-
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));
67-
6877
out = new StringWriter();
69-
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);
78+
g = DEFAULT_JSON_F.createGenerator(ObjectWriteContext.empty(), out);
7079
g.writeStartArray();
7180
g.writeString(toQuote);
7281
g.writeEndArray();
7382
g.close();
7483
assertEquals(2 + 2 + 2, out.toString().length()); // brackets, quotes, 2 chars as is
7584

7685
// Also parse back to ensure correctness
77-
JsonParser p = JSON_F.createParser(ObjectReadContext.empty(), out.toString());
86+
JsonParser p = DEFAULT_JSON_F.createParser(ObjectReadContext.empty(), out.toString());
7887
assertToken(JsonToken.START_ARRAY, p.nextToken());
7988
assertToken(JsonToken.VALUE_STRING, p.nextToken());
89+
assertEquals(toQuote, p.getText());
8090
assertToken(JsonToken.END_ARRAY, p.nextToken());
8191
p.close();
82-
83-
// but may revert back to original behavior
84-
out = new StringWriter();
85-
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);
86-
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
87-
g.writeStartArray();
88-
g.writeString(toQuote);
89-
g.writeEndArray();
90-
g.close();
91-
assertEquals(2 + 2 + 12, out.toString().length()); // brackets, quotes, 2 x 6 byte JSON escape
9292
}
9393
}

0 commit comments

Comments
 (0)