Add an option (the `tokenize.ssplit` property, default true) to turn off ssplit in the tokenizer annotator. Not sure this is useful, but at least it allows for more fine-grained testing of the WordToSentenceProcessor.

AngledLuffa · AngledLuffa · commit 5360afa4ad89 · 2022-04-07T01:01:32.000-07:00
diff --git a/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java b/src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
@@ -256,7 +256,11 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
       this.cleanxmlAnnotator = null;
     }
 
-    this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
+    if (PropertiesUtils.getBool(props, STANFORD_TOKENIZE + "." + STANFORD_SSPLIT, true)) {
+      this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
+    } else {
+      this.ssplitAnnotator = null;
+    }
   }
 
   /**
@@ -445,7 +449,9 @@ public void annotate(Annotation annotation) {
     if (this.cleanxmlAnnotator != null) {
       this.cleanxmlAnnotator.annotate(annotation);
     }
-    this.ssplitAnnotator.annotate(annotation);
+    if (this.ssplitAnnotator != null) {
+      this.ssplitAnnotator.annotate(annotation);
+    }
   }
 
   @Override
diff --git a/test/src/edu/stanford/nlp/process/WordToSentenceProcessorTest.java b/test/src/edu/stanford/nlp/process/WordToSentenceProcessorTest.java
@@ -18,9 +18,9 @@
 public class WordToSentenceProcessorTest extends TestCase {
 
   private static final TokenizerAnnotator onelineTokenizer =
-    new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.language", "en", "ssplit.isOneSentence", "true"), null);
+    new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.language", "en", "tokenize.ssplit", "false"), null);
   private static final TokenizerAnnotator udNL =
-    new TokenizerAnnotator(false, "en", "invertible,tokenizeNLs=true");
+    new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.language", "en", "tokenize.ssplit", "false"), "invertible,tokenizeNLs=true");
   private static final TokenizerAnnotator wsNL =
     new TokenizerAnnotator(false, PropertiesUtils.asProperties("tokenize.whitespace", "true", "invertible", "true", "tokenizeNLs", "true"));
 
@@ -208,9 +208,8 @@ public void testBlankLines() {
   public void testExclamationPoint() {
     Annotation annotation = new Annotation("Foo!!");
     onelineTokenizer.annotate(annotation);
-    // the TokenizerAnnotator will add ids by default
     List<CoreLabel> list = annotation.get(CoreAnnotations.TokensAnnotation.class);
-    assertEquals("Wrong double bang", "[Foo-1, !!-2]", list.toString());
+    assertEquals("Wrong double bang", "[Foo, !!]", list.toString());
   }
 
   public void testChinese() {

Original file line number	Diff line number	Diff line change
`@@ -256,7 +256,11 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {`
`256`	`256`	`this.cleanxmlAnnotator = null;`
`257`	`257`	`}`
`258`	`258`
`259`		`- this.ssplitAnnotator = new WordsToSentencesAnnotator(props);`
	`259`	`+ if (PropertiesUtils.getBool(props, STANFORD_TOKENIZE + "." + STANFORD_SSPLIT, true)) {`
	`260`	`+ this.ssplitAnnotator = new WordsToSentencesAnnotator(props);`
	`261`	`+ } else {`
	`262`	`+ this.ssplitAnnotator = null;`
	`263`	`+ }`
`260`	`264`	`}`
`261`	`265`
`262`	`266`	`/**`
`@@ -445,7 +449,9 @@ public void annotate(Annotation annotation) {`
`445`	`449`	`if (this.cleanxmlAnnotator != null) {`
`446`	`450`	`this.cleanxmlAnnotator.annotate(annotation);`
`447`	`451`	`}`
`448`		`- this.ssplitAnnotator.annotate(annotation);`
	`452`	`+ if (this.ssplitAnnotator != null) {`
	`453`	`+ this.ssplitAnnotator.annotate(annotation);`
	`454`	`+ }`
`449`	`455`	`}`
`450`	`456`
`451`	`457`	`@Override`