Skip to content

Commit cfb5028

Browse files
committed
Merge branch 'dev' of https://github.com/stanfordnlp/CoreNLP into dev
2 parents fdf69c4 + bd93593 commit cfb5028

File tree

3 files changed

+14
-3
lines changed

3 files changed

+14
-3
lines changed

doc/corenlp/README.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ LICENSE
4141
CHANGES
4242
---------------------------------
4343

44-
2021-10-14 4.3.1 Fix some issues with Hungarian and Italian
45-
pipelines.
44+
2021-10-14 4.3.1 Minor bugfixes to German umlaut tokenization,
45+
CDC tokenizer offset, Hungarian model package
4646

4747
2021-09-26 4.3.0 Add trained tokenizer from corenlp-it, add
4848
Italian and Hungarian pipelines using data

itest/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessorITest.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,4 +65,15 @@ public void testUmlauts() {
6565
List<String> umlautGoldTokens = Arrays.asList("Welcher", "der", "Befunde", "ist", furry, "eine", "Gehirnerkrankung", "typisch", "?");
6666
testExample(umlautExample, umlautGoldTokens);
6767
}
68+
69+
/**
70+
* Test that an umlaut at the start of a word doesn't crash
71+
*/
72+
public void testUmlautSpaces() {
73+
String antik = "Antik ̈orper";
74+
assertEquals(12, antik.length());
75+
76+
List<String> goldTokens = Arrays.asList(antik.substring(0, 5), antik.substring(6, 12));
77+
testExample(antik, goldTokens);
78+
}
6879
}

src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ public static String condenseUmlauts(String value) {
7575
ns = new StringBuilder(value.length());
7676
ns.append(value.substring(0, i));
7777
}
78-
final char prev = ns.charAt(ns.length() - 1);
78+
final char prev = ns.length() == 0 ? ' ' : ns.charAt(ns.length() - 1);
7979
if (prev == 'a') {
8080
ns.setCharAt(ns.length() - 1, 'ä');
8181
} else if (prev == 'A') {

0 commit comments

Comments
 (0)