Read a SpacesBefore annotation at the start of a sentence / document and keep it on the token. The before text of other tokens is generally set to match the after of the previous token. Include a short check of this in the unit test.

AngledLuffa · AngledLuffa · commit b3f8f97397f5 · 2025-05-29T15:48:15.000-07:00
diff --git a/data/edu/stanford/nlp/pipeline/en-example.conllu b/data/edu/stanford/nlp/pipeline/en-example.conllu
@@ -1,6 +1,6 @@
 # sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0022
 # text = Over 300 Iraqis are reported dead and 500 wounded in Fallujah alone.
-1	Over	over	ADV	RB	_	2	advmod	2:advmod	_
+1	Over	over	ADV	RB	_	2	advmod	2:advmod	SpacesBefore=\s\s
 2	300	300	NUM	CD	NumForm=Digit|NumType=Card	3	nummod	3:nummod	_
 3	Iraqis	Iraqi	PROPN	NNPS	Number=Plur	5	nsubj:pass	5:nsubj:pass|6:nsubj:xsubj|8:nsubj:pass	_
 4	are	be	AUX	VBP	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	5	aux:pass	5:aux:pass	_
diff --git a/itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java b/itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java
@@ -419,6 +419,8 @@ public void testReadingInEmpties() throws ClassNotFoundException, IOException {
 
     SemanticGraph enhanced = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
     assertEquals(EXPECTED_ENHANCED, enhanced);
+
+    assertEquals("  ", tokens.get(0).before());
   }
 
 }
diff --git a/src/edu/stanford/nlp/pipeline/CoNLLUReader.java b/src/edu/stanford/nlp/pipeline/CoNLLUReader.java
@@ -25,8 +25,8 @@ public class CoNLLUReader {
    **/
   // TODO: read sent_id?
   // TODO: read comments in general
-  // TODO: SpacesBefore on the first token should be checked
   // TODO: reconsider the newline as the after on the last word
+  // TODO: keep around the rest of the misc annotations
   public static final int CoNLLU_IndexField = 0;
   public static final int CoNLLU_WordField = 1;
   public static final int CoNLLU_LemmaField = 2;
@@ -408,6 +408,19 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
       cl.set(extraColumns.get(extraColumnIdx), fields.get(extraColumnIdx));
     }
 
+    Map<String, String> miscKeyValues = new HashMap<>();
+    if (!fields.get(CoNLLU_MiscField).equals("_")) {
+      Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
+        kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
+    }
+
+    // SpacesBefore on a word that isn't the first in a document will
+    // be replaced with the SpacesAfter from the previous token later
+    String spacesBefore = miscKeyValues.get("SpacesBefore");
+    if (spacesBefore != null) {
+      cl.setBefore(unescapeSpacesAfter(spacesBefore));
+    }
+
     // handle the MWT info and after text
     if (isEmpty) {
       // don't set an after for empty tokens
@@ -437,10 +450,10 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
       } else {
         String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
         if (miscInfo != null && !miscInfo.equals("_")) {
-          Map<String, String> miscKeyValues = new HashMap<>();
+          Map<String, String> mwtKeyValues = new HashMap<>();
           Arrays.stream(miscInfo.split("\\|")).forEach(
-            kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
-          String spaceAfter = miscToSpaceAfter(miscKeyValues);
+            kv -> mwtKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
+          String spaceAfter = miscToSpaceAfter(mwtKeyValues);
           cl.setAfter(spaceAfter);
         } else {
           cl.setAfter(" ");
@@ -450,15 +463,8 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
       cl.setIsMWT(false);
       cl.setIsMWTFirst(false);
 
-      if (!fields.get(CoNLLU_MiscField).equals("_")) {
-        Map<String, String> miscKeyValues = new HashMap<>();
-        Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
-          kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
-        String spaceAfter = miscToSpaceAfter(miscKeyValues);
-        cl.setAfter(spaceAfter);
-      } else {
-        cl.setAfter(" ");
-      }
+      String spaceAfter = miscToSpaceAfter(miscKeyValues);
+      cl.setAfter(spaceAfter);
     }
     return cl;
   }
@@ -477,7 +483,9 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
     // the last token should have a newline after
     coreLabels.get(coreLabels.size() - 1).setAfter(System.lineSeparator());
     // set before
-    coreLabels.get(0).setBefore("");
+    if (!coreLabels.get(0).containsKey(CoreAnnotations.BeforeAnnotation.class)) {
+      coreLabels.get(0).setBefore("");
+    }
     for (int i = 1 ; i < coreLabels.size() ; i++) {
       // all words should match the after of the previous token
       coreLabels.get(i).setBefore(coreLabels.get(i - 1).after());

Original file line number	Diff line number	Diff line change
`@@ -419,6 +419,8 @@ public void testReadingInEmpties() throws ClassNotFoundException, IOException {`
`419`	`419`
`420`	`420`	`SemanticGraph enhanced = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);`
`421`	`421`	`assertEquals(EXPECTED_ENHANCED, enhanced);`
	`422`	`+`
	`423`	`+ assertEquals("  ", tokens.get(0).before());`
`422`	`424`	`}`
`423`	`425`
`424`	`426`	`}`