Skip to content

Commit 92bd79d

Browse files
authored
Merge branch 'master' into feature/FasterXML#220-do-not-use-Arrays.copyOf
2 parents d2319fe + 2ba911b commit 92bd79d

File tree

5 files changed

+112
-22
lines changed

5 files changed

+112
-22
lines changed

.github/workflows/main.yml

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,13 +16,12 @@ permissions:
1616

1717
jobs:
1818
build:
19-
runs-on: ${{ matrix.os }}
19+
runs-on: 'ubuntu-22.04'
2020
strategy:
2121
fail-fast: false
2222
matrix:
2323
# With Woodstox 7.0 can finally test versions past 11
2424
java_version: ['8', '11', '17', '21']
25-
os: ['ubuntu-20.04']
2625
env:
2726
JAVA_OPTS: "-XX:+TieredCompilation -XX:TieredStopAtLevel=1"
2827
steps:

release-notes/CREDITS

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -116,3 +116,8 @@ Philipp Nanz (@philippn)
116116
(7.1.0)
117117
* Contributed #211: Disable `resolveEntityReferences` by default for newly created SAX parsers
118118
(7.1.0)
119+
120+
Winfried Gerlach (@winfriedgerlach)
121+
122+
* Contributed #220: Switch to lookup tables in hotspots `isNameChar()`/`isNameStartChar()`
123+
(7.1.1)

release-notes/VERSION

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@ Project: woodstox
99
#213: SAX: `Locator#getSystemId` and `Locator#getPublicId` are not
1010
available during `startDocument` event
1111
(fix contributed by Philipp N)
12+
#220: Switch to lookup tables in hotspots `isNameChar()`/`isNameStartChar()`
13+
(contributed by @winfriedgerlach)
1214

1315
7.1.0 (22-Oct-2024)
1416

src/main/java/com/ctc/wstx/io/WstxInputData.java

Lines changed: 27 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@
1717

1818
import com.ctc.wstx.util.XmlChars;
1919

20+
import java.util.stream.IntStream;
21+
2022
/**
2123
* Base class used by readers (specifically, by
2224
* {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
@@ -50,6 +52,25 @@ public class WstxInputData
5052
*/
5153
public final static int MAX_UNICODE_CHAR = 0x10FFFF;
5254

55+
// @since 7.1.1
56+
// Lookup table indexed by 7-bit ASCII code (0-127); isNameStartChar() reads it
// for every character below 128. Only 'a'-'z', 'A'-'Z' and '_' are set to true
// here; all other entries keep the boolean array default of false.
private static final boolean[] ASCII_NAME_START_CHARS = new boolean[128];
57+
static {
58+
IntStream.rangeClosed('a', 'z').forEach(i -> ASCII_NAME_START_CHARS[i] = true);
59+
IntStream.rangeClosed('A', 'Z').forEach(i -> ASCII_NAME_START_CHARS[i] = true);
60+
ASCII_NAME_START_CHARS['_'] = true;
61+
}
62+
63+
// @since 7.1.1
64+
// Lookup table indexed by 7-bit ASCII code (0-127); isNameChar() reads it for
// every character below 128. Set to true for 'a'-'z', 'A'-'Z', '0'-'9', '.',
// '-' and '_'; all other entries keep the boolean array default of false.
private static final boolean[] ASCII_NAME_CHARS = new boolean[128];
65+
static {
66+
IntStream.rangeClosed('a', 'z').forEach(i -> ASCII_NAME_CHARS[i] = true);
67+
IntStream.rangeClosed('A', 'Z').forEach(i -> ASCII_NAME_CHARS[i] = true);
68+
IntStream.rangeClosed('0', '9').forEach(i -> ASCII_NAME_CHARS[i] = true);
69+
ASCII_NAME_CHARS['.'] = true;
70+
ASCII_NAME_CHARS['-'] = true;
71+
ASCII_NAME_CHARS['_'] = true;
72+
}
73+
5374
/*
5475
////////////////////////////////////////////////////
5576
// Configuration
@@ -153,14 +174,9 @@ protected final boolean isNameStartChar(char c)
153174
/* First, let's handle 7-bit ascii range (identical between xml
154175
* 1.0 and 1.1)
155176
*/
156-
if (c <= 0x7A) { // 'z' or earlier
157-
if (c >= 0x61) { // 'a' - 'z' are ok
158-
return true;
159-
}
160-
if (c < 0x41) { // before 'A' just white space
161-
return false;
162-
}
163-
return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
177+
if (c < 128) {
178+
// this is performance critical, so we use a lookup table instead of if-branches
179+
return ASCII_NAME_START_CHARS[c];
164180
}
165181
/* Ok, otherwise need to use a big honking bit sets... which
166182
* differ between 1.0 and 1.1
@@ -178,18 +194,9 @@ protected final boolean isNameStartChar(char c)
178194
/*
 * Tells whether the given character may appear inside an XML name.
 *
 * This is a rendered diff: lines following a "NNN-" gutter marker are the
 * removed branch-based ASCII check, lines following a "NNN+" marker are its
 * replacement, which answers for characters below 128 straight from the
 * ASCII_NAME_CHARS lookup table. Characters at or above 128 are delegated to
 * XmlChars, using the XML 1.1 rules when mXml11 is set and the 1.0 rules
 * otherwise.
 */
protected final boolean isNameChar(char c)
179195
{
180196
// First, let's handle 7-bit ascii range
181-
if (c <= 0x7A) { // 'z' or earlier
182-
if (c >= 0x61) { // 'a' - 'z' are ok
183-
return true;
184-
}
185-
if (c <= 0x5A) {
186-
if (c >= 0x41) { // 'A' - 'Z' ok too
187-
return true;
188-
}
189-
// As are 0-9, '.' and '-'
190-
return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
191-
}
192-
return (c == 0x5F); // '_' is ok too
197+
if (c < 128) {
198+
// this is performance critical, so we use a lookup table instead of if-branches
199+
return ASCII_NAME_CHARS[c];
193200
}
194201
return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
195202
}
Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,77 @@
1+
package com.ctc.wstx.io;
2+
3+
import com.ctc.wstx.util.XmlChars;
4+
import junit.framework.TestCase;
5+
import org.junit.Test;
6+
7+
import java.util.stream.IntStream;
8+
9+
/**
 * Regression tests checking that the lookup-table based
 * {@code isNameStartChar()} / {@code isNameChar()} of {@link WstxInputData}
 * return the same answers as the earlier branch-based implementations, which
 * are kept as private reference copies in this class.
 *
 * NOTE(review): the class extends the JUnit 3 style {@code TestCase} while its
 * methods also carry the JUnit 4 {@code @Test} annotation; presumably only one
 * of the two mechanisms is needed -- confirm against the project's test runner.
 */
public class WstxInputDataTest extends TestCase {
10+
11+
@Test
12+
// Compares isNameStartChar() with the reference copy, in XML 1.0 and XML 1.1 mode.
public void testIsNameStartCharBehavesSameAsBranchyVersion() {
13+
WstxInputData wstxInputDataXml10 = new WstxInputData();
14+
WstxInputData wstxInputDataXml11 = new WstxInputData();
15+
wstxInputDataXml11.mXml11 = true;
16+
17+
// Covers all 7-bit ASCII characters (0-127) plus a few values on either side.
// The upper bound is exclusive, so the values above ASCII are 128..137.
// The negative values wrap when cast to char (-10..-1 become 0xFFF6..0xFFFF),
// so they exercise the non-ASCII path rather than anything "below" zero.
18+
IntStream.range(-10, 138).forEach(i -> {
19+
char c = (char) i;
20+
assertEquals(isNameStartCharBranchy(c, false), wstxInputDataXml10.isNameStartChar(c));
21+
assertEquals(isNameStartCharBranchy(c, true), wstxInputDataXml11.isNameStartChar(c));
22+
});
23+
}
24+
25+
// previous implementation with branches, kept as the reference for the test above;
// the mXml11 parameter selects between the XML 1.1 and XML 1.0 XmlChars checks
26+
private final boolean isNameStartCharBranchy(char c, boolean mXml11) {
27+
/* First, let's handle 7-bit ascii range (identical between xml
28+
* 1.0 and 1.1)
29+
*/
30+
if (c <= 0x7A) { // 'z' or earlier
31+
if (c >= 0x61) { // 'a' - 'z' are ok
32+
return true;
33+
}
34+
if (c < 0x41) { // below 'A': control chars, white space, punctuation and digits; none may start a name
35+
return false;
36+
}
37+
return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
38+
}
39+
/* Ok, otherwise need to use a big honking bit sets... which
40+
* differ between 1.0 and 1.1
41+
*/
42+
return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
43+
}
44+
45+
@Test
46+
// Compares isNameChar() with the reference copy, in XML 1.0 and XML 1.1 mode.
public void testIsNameCharBehavesSameAsBranchyVersion() {
47+
WstxInputData wstxInputDataXml10 = new WstxInputData();
48+
WstxInputData wstxInputDataXml11 = new WstxInputData();
49+
wstxInputDataXml11.mXml11 = true;
50+
51+
// Covers all 7-bit ASCII characters (0-127) plus a few values on either side.
// The upper bound is exclusive, so the values above ASCII are 128..137.
// The negative values wrap when cast to char (-10..-1 become 0xFFF6..0xFFFF),
// so they exercise the non-ASCII path rather than anything "below" zero.
52+
IntStream.range(-10, 138).forEach(i -> {
53+
char c = (char) i;
54+
assertEquals(isNameCharBranchy(c, false), wstxInputDataXml10.isNameChar(c));
55+
assertEquals(isNameCharBranchy(c, true), wstxInputDataXml11.isNameChar(c));
56+
});
57+
}
58+
59+
// previous implementation with branches, kept as the reference for the test above;
// the mXml11 parameter selects between the XML 1.1 and XML 1.0 XmlChars checks
60+
private final boolean isNameCharBranchy(char c, boolean mXml11) {
61+
// First, let's handle 7-bit ascii range
62+
if (c <= 0x7A) { // 'z' or earlier
63+
if (c >= 0x61) { // 'a' - 'z' are ok
64+
return true;
65+
}
66+
if (c <= 0x5A) {
67+
if (c >= 0x41) { // 'A' - 'Z' ok too
68+
return true;
69+
}
70+
// As are 0-9, '.' and '-'
71+
return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
72+
}
73+
return (c == 0x5F); // '_' is ok too
74+
}
75+
return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
76+
}
77+
}

0 commit comments

Comments
 (0)