Skip to content

Commit b0a292e

Browse files
(#220) Switch to lookup tables in hotspots isNameChar()/isNameStartChar() (#221)
1 parent 012a512 commit b0a292e

File tree

3 files changed

+104
-20
lines changed

3 files changed

+104
-20
lines changed

release-notes/VERSION

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ Project: woodstox
99
#213: SAX: `Locator#getSystemId` and `Locator#getPublicId` are not
1010
available during `startDocument` event
1111
(fix contributed by Philipp N)
12+
#221: Switch to lookup tables in hotspots `isNameChar()`/`isNameStartChar()`
13+
(contributed by @winfriedgerlach)
1214

1315
7.1.0 (22-Oct-2024)
1416

src/main/java/com/ctc/wstx/io/WstxInputData.java

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
import com.ctc.wstx.util.XmlChars;
1919

20+
import java.util.stream.IntStream;
21+
2022
/**
2123
* Base class used by readers (specifically, by
2224
* {@link com.ctc.wstx.sr.StreamScanner}, and its sub-classes)
@@ -50,6 +52,23 @@ public class WstxInputData
5052
*/
5153
public final static int MAX_UNICODE_CHAR = 0x10FFFF;
5254

55+
private static final boolean[] asciiNameStartChars = new boolean[128];
56+
static {
57+
IntStream.rangeClosed('a', 'z').forEach(i -> asciiNameStartChars[i] = true);
58+
IntStream.rangeClosed('A', 'Z').forEach(i -> asciiNameStartChars[i] = true);
59+
asciiNameStartChars['_'] = true;
60+
}
61+
62+
private static final boolean[] asciiNameChars = new boolean[128];
63+
static {
64+
IntStream.rangeClosed('a', 'z').forEach(i -> asciiNameChars[i] = true);
65+
IntStream.rangeClosed('A', 'Z').forEach(i -> asciiNameChars[i] = true);
66+
IntStream.rangeClosed('0', '9').forEach(i -> asciiNameChars[i] = true);
67+
asciiNameChars['.'] = true;
68+
asciiNameChars['-'] = true;
69+
asciiNameChars['_'] = true;
70+
}
71+
5372
/*
5473
////////////////////////////////////////////////////
5574
// Configuration
@@ -153,14 +172,9 @@ protected final boolean isNameStartChar(char c)
153172
/* First, let's handle 7-bit ascii range (identical between xml
154173
* 1.0 and 1.1)
155174
*/
156-
if (c <= 0x7A) { // 'z' or earlier
157-
if (c >= 0x61) { // 'a' - 'z' are ok
158-
return true;
159-
}
160-
if (c < 0x41) { // before 'A' just white space
161-
return false;
162-
}
163-
return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
175+
if (c < 128) {
176+
// this is performance critical, so we use a lookup table instead of if-branches
177+
return asciiNameStartChars[c];
164178
}
165179
/* Ok, otherwise need to use big honking bit sets... which
166180
* differ between 1.0 and 1.1
@@ -178,18 +192,9 @@ protected final boolean isNameStartChar(char c)
178192
protected final boolean isNameChar(char c)
179193
{
180194
// First, let's handle 7-bit ascii range
181-
if (c <= 0x7A) { // 'z' or earlier
182-
if (c >= 0x61) { // 'a' - 'z' are ok
183-
return true;
184-
}
185-
if (c <= 0x5A) {
186-
if (c >= 0x41) { // 'A' - 'Z' ok too
187-
return true;
188-
}
189-
// As are 0-9, '.' and '-'
190-
return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
191-
}
192-
return (c == 0x5F); // '_' is ok too
195+
if (c < 128) {
196+
// this is performance critical, so we use a lookup table instead of if-branches
197+
return asciiNameChars[c];
193198
}
194199
return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
195200
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
package com.ctc.wstx.io;
2+
3+
import com.ctc.wstx.util.XmlChars;
4+
import junit.framework.TestCase;
5+
import org.junit.Test;
6+
7+
import java.util.stream.IntStream;
8+
9+
public class WstxInputDataTest extends TestCase {
10+
11+
@Test
12+
public void testIsNameStartCharBehavesSameAsBranchyVersion() {
13+
WstxInputData wstxInputDataXml10 = new WstxInputData();
14+
WstxInputData wstxInputDataXml11 = new WstxInputData();
15+
wstxInputDataXml11.mXml11 = true;
16+
17+
// include all 7-bit ASCII characters plus some beyond each end (negative ints wrap to 0xFFF6..0xFFFF when cast to char)
18+
IntStream.range(-10, 138).forEach(i -> {
19+
char c = (char) i;
20+
assertEquals(isNameStartCharBranchy(c, false), wstxInputDataXml10.isNameStartChar(c));
21+
assertEquals(isNameStartCharBranchy(c, true), wstxInputDataXml11.isNameStartChar(c));
22+
});
23+
}
24+
25+
// previous implementation with branches
26+
private final boolean isNameStartCharBranchy(char c, boolean mXml11) {
27+
/* First, let's handle 7-bit ascii range (identical between xml
28+
* 1.0 and 1.1)
29+
*/
30+
if (c <= 0x7A) { // 'z' or earlier
31+
if (c >= 0x61) { // 'a' - 'z' are ok
32+
return true;
33+
}
34+
if (c < 0x41) { // before 'A': digits, punctuation and white space, none of which may start a name
35+
return false;
36+
}
37+
return (c <= 0x5A) || (c == '_'); // 'A' - 'Z' and '_' are ok
38+
}
39+
/* Ok, otherwise need to use big honking bit sets... which
40+
* differ between 1.0 and 1.1
41+
*/
42+
return mXml11 ? XmlChars.is11NameStartChar(c) : XmlChars.is10NameStartChar(c);
43+
}
44+
45+
@Test
46+
public void testIsNameCharBehavesSameAsBranchyVersion() {
47+
WstxInputData wstxInputDataXml10 = new WstxInputData();
48+
WstxInputData wstxInputDataXml11 = new WstxInputData();
49+
wstxInputDataXml11.mXml11 = true;
50+
51+
// include all 7-bit ASCII characters plus some beyond each end (negative ints wrap to 0xFFF6..0xFFFF when cast to char)
52+
IntStream.range(-10, 138).forEach(i -> {
53+
char c = (char) i;
54+
assertEquals(isNameCharBranchy(c, false), wstxInputDataXml10.isNameChar(c));
55+
assertEquals(isNameCharBranchy(c, true), wstxInputDataXml11.isNameChar(c));
56+
});
57+
}
58+
59+
// previous implementation with branches
60+
private final boolean isNameCharBranchy(char c, boolean mXml11) {
61+
// First, let's handle 7-bit ascii range
62+
if (c <= 0x7A) { // 'z' or earlier
63+
if (c >= 0x61) { // 'a' - 'z' are ok
64+
return true;
65+
}
66+
if (c <= 0x5A) {
67+
if (c >= 0x41) { // 'A' - 'Z' ok too
68+
return true;
69+
}
70+
// As are 0-9, '.' and '-'
71+
return (c >= 0x30 && c <= 0x39) || (c == '.') || (c == '-');
72+
}
73+
return (c == 0x5F); // '_' is ok too
74+
}
75+
return mXml11 ? XmlChars.is11NameChar(c) : XmlChars.is10NameChar(c);
76+
}
77+
}

0 commit comments

Comments
 (0)