Skip to content

Commit 5360afa

Browse files
committed
Add an option (the `tokenize.ssplit` property, default true) to turn off sentence splitting (ssplit) in the tokenizer annotator. Not sure this is useful, but at least it allows for more fine-grained testing of the WordToSentenceProcessor.
1 parent 3e828eb commit 5360afa

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,11 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
256256
this.cleanxmlAnnotator = null;
257257
}
258258

259-
this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
259+
if (PropertiesUtils.getBool(props, STANFORD_TOKENIZE + "." + STANFORD_SSPLIT, true)) {
260+
this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
261+
} else {
262+
this.ssplitAnnotator = null;
263+
}
260264
}
261265

262266
/**
@@ -445,7 +449,9 @@ public void annotate(Annotation annotation) {
445449
if (this.cleanxmlAnnotator != null) {
446450
this.cleanxmlAnnotator.annotate(annotation);
447451
}
448-
this.ssplitAnnotator.annotate(annotation);
452+
if (this.ssplitAnnotator != null) {
453+
this.ssplitAnnotator.annotate(annotation);
454+
}
449455
}
450456

451457
@Override

test/src/edu/stanford/nlp/process/WordToSentenceProcessorTest.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
public class WordToSentenceProcessorTest extends TestCase {
1919

2020
private static final TokenizerAnnotator onelineTokenizer =
21-
new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.language", "en", "ssplit.isOneSentence", "true"), null);
21+
new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.language", "en", "tokenize.ssplit", "false"), null);
2222
private static final TokenizerAnnotator udNL =
23-
new TokenizerAnnotator(false, "en", "invertible,tokenizeNLs=true");
23+
new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.language", "en", "tokenize.ssplit", "false"), "invertible,tokenizeNLs=true");
2424
private static final TokenizerAnnotator wsNL =
2525
new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.whitespace", "true", "invertible", "true", "tokenizeNLs", "true"));
2626

@@ -208,9 +208,8 @@ public void testBlankLines() {
208208
public void testExclamationPoint() {
209209
Annotation annotation = new Annotation("Foo!!");
210210
onelineTokenizer.annotate(annotation);
211-
// the TokenizerAnnotator will add ids by default
212211
List<CoreLabel> list = annotation.get(CoreAnnotations.TokensAnnotation.class);
213-
assertEquals("Wrong double bang", "[Foo-1, !!-2]", list.toString());
212+
assertEquals("Wrong double bang", "[Foo, !!]", list.toString());
214213
}
215214

216215
public void testChinese() {

0 commit comments

Comments
 (0)