Skip to content

Commit 431ad54

Browse files
committed
Skip NBSP when reading characters, just like other whitespace characters
Adds a test that spaces and NBSP get the right character offsets in the segmenter annotator
1 parent 7c84960 commit 431ad54

File tree

3 files changed

+18
-3
lines changed

3 files changed

+18
-3
lines changed

itest/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotatorITest.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,18 @@ public void testPipeline() {
3838
new int[]{0, 1, 3, 5, 7, 8},
3939
new int[]{1, 3, 5, 7, 8, 9});
4040

41+
// test that it does something reasonable with spaces
42+
testOne("我在 加州 工作 ",
43+
new String[]{"我", "在", "加州", "工作"},
44+
new int[]{0, 1, 3, 6},
45+
new int[]{1, 2, 5, 8});
46+
47+
// test that it does something reasonable with NBSP
48+
testOne("我在 加州 工作 ",
49+
new String[]{"我", "在", "加州", "工作"},
50+
new int[]{0, 1, 3, 6},
51+
new int[]{1, 2, 5, 8});
52+
4153
// All of the tools should now produce () instead of -LRB- -RRB-
4254
testOne("你马上回来(北京)吗?",
4355
new String[]{"你", "马上", "回来", "(", "北京", ")", "吗", "?"},

src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -689,6 +689,9 @@ public List<String> segmentString(String sentence, DocumentReaderAndWriter<IN> r
689689
if (segmented.length() == 0) {
690690
return Collections.emptyList();
691691
} else {
692+
// \\p{Zs} would also catch Unicode space separators that \\s misses,
693+
// but hopefully the upstream segmentation handled
694+
// unusual whitespace such as NBSP already
692695
return Arrays.asList(segmented.split("\\s"));
693696
}
694697
}

src/edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,7 @@ public List<CoreLabel> apply(String line) {
160160
// TODO: can double chars be whitespace / isocontrol?
161161
int codePoint = Character.codePointAt(line, index);
162162
CoreLabel wi = new CoreLabel();
163-
if ( ! Character.isWhitespace(codePoint) && ! Character.isISOControl(codePoint)) {
163+
if (!Character.isWhitespace(codePoint) && !Character.isISOControl(codePoint) && !Character.isSpaceChar(codePoint)) {
164164
boolean surrogate = Character.isSupplementaryCodePoint(codePoint);
165165
String wordString;
166166
if (surrogate) {
@@ -171,7 +171,7 @@ public List<CoreLabel> apply(String line) {
171171
wi.set(CoreAnnotations.CharAnnotation.class, intern(wordString));
172172

173173
// non-breaking space is skipped as well
174-
while (Character.isWhitespace(origLine.charAt(origIndex)) || Character.isISOControl(origLine.charAt(origIndex)) || (origLine.charAt(origIndex) == '\u00A0')) {
174+
while (Character.isWhitespace(origLine.charAt(origIndex)) || Character.isISOControl(origLine.charAt(origIndex)) || Character.isSpaceChar(origLine.charAt(origIndex))) {
175175
origIndex++;
176176
}
177177

@@ -197,7 +197,7 @@ public List<CoreLabel> apply(String line) {
197197
wi.set(CoreAnnotations.AnswerAnnotation.class, "1");
198198
wi.set(CoreAnnotations.SpaceBeforeAnnotation.class, "1");
199199
wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "1");
200-
} else if (Character.isWhitespace(line.charAt(index - 1)) || Character.isISOControl(line.charAt(index - 1))) {
200+
} else if (Character.isWhitespace(line.charAt(index - 1)) || Character.isISOControl(line.charAt(index - 1)) || Character.isSpaceChar(line.charAt(index - 1))) {
201201
wi.set(CoreAnnotations.AnswerAnnotation.class, "1");
202202
wi.set(CoreAnnotations.SpaceBeforeAnnotation.class, "1");
203203
wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "1");

0 commit comments

Comments
 (0)