Skip to content

Commit 2f52505

Browse files
committed
Fix issue with umlaut at the start of a token
1 parent 679baf3 commit 2f52505

File tree

2 files changed

+12
-1
lines changed

2 files changed

+12
-1
lines changed

itest/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessorITest.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,4 +65,15 @@ public void testUmlauts() {
6565
List<String> umlautGoldTokens = Arrays.asList("Welcher", "der", "Befunde", "ist", furry, "eine", "Gehirnerkrankung", "typisch", "?");
6666
testExample(umlautExample, umlautGoldTokens);
6767
}
68+
69+
/**
70+
* Test that an umlaut at the start of a word doesn't crash.
* <p>
* The input places the umlaut mark directly after a space, so the second
* expected token begins with the mark rather than with a base letter.
* The expected tokens are the two halves of the input on either side of
* that space, with the mark kept at the front of the second half.
71+
*/
72+
public void testUmlautSpaces() {
73+
// NOTE(review): the character right after the space appears to be a
// combining diaeresis (U+0308) -- confirm against the raw file bytes.
String antik = "Antik ̈orper";
74+
// Sanity check on the literal itself: 5 letters + space + mark + 5 letters.
// Fails fast if an editor or encoding change alters the string.
assertEquals(12, antik.length());
75+
76+
// Expected tokens are chars 0-4 and chars 6-11 of the input;
// index 5 (the space) belongs to neither token.
List<String> goldTokens = Arrays.asList(antik.substring(0, 5), antik.substring(6, 12));
77+
// testExample is defined outside this hunk; presumably it tokenizes the
// input and compares the result against goldTokens -- verify in the full file.
testExample(antik, goldTokens);
78+
}
6879
}

src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public static String condenseUmlauts(String value) {
7575
ns = new StringBuilder(value.length());
7676
ns.append(value.substring(0, i));
7777
}
78-
final char prev = ns.charAt(ns.length() - 1);
78+
final char prev = ns.length() == 0 ? ' ' : ns.charAt(ns.length() - 1);
7979
if (prev == 'a') {
8080
ns.setCharAt(ns.length() - 1, 'ä');
8181
} else if (prev == 'A') {

0 commit comments

Comments
 (0)