Don't use System.lineSeparator() for the 'after' between sentences. Just use a regular space unless the CoNLL-U document specifically has a SpacesAfter.

AngledLuffa · AngledLuffa · commit a9bef7d3c835 · 2025-05-30T17:00:52.000-07:00
diff --git a/itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java b/itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java
@@ -31,7 +31,7 @@ public class CoNLLUReaderITest {
     "Pero la  existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
     "De allí las rebajas."
   };
-  static final String EXPECTED_TEXT = String.join(System.lineSeparator(), EXPECTED_SENTENCE_TEXT) + System.lineSeparator();
+  static final String EXPECTED_TEXT = String.join(" ", EXPECTED_SENTENCE_TEXT) + " ";
 
   static final String[][] EXPECTED_WORD_TEXT = {
     {"Pero", "la", "existencia", "de", "dos", "recién", "nacidos", "en", "la", "misma", "caja", "sólo", "podía", "deber", "se", "a", "un", "descuido", "de", "fábrica", "."},
@@ -194,8 +194,6 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
         CoreLabel token = tokens.get(j);
         if (i == 0 && j == 1) {
           assertEquals("  ", token.after());
-        } else if (j == tokens.size() - 1) {
-          assertEquals(System.lineSeparator(), token.after());
         } else if (j == tokens.size() - 2) {
           assertEquals("", token.after());
         } else if (i == 0 && j == 13) {
@@ -207,10 +205,7 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
         if (i == 0 && j == 2) {
           assertEquals("  ", token.before());
         } else if (i == 0 && j == 0) {
-          // TODO: is it properly reading the SpacesBefore on the first token?
           assertEquals("", token.before());
-        } else if (j == 0) {
-          assertEquals(System.lineSeparator(), token.before());
         } else if (j == tokens.size() - 1) {
           assertEquals("", token.before());
         } else if (i == 0 && j == 14) {
@@ -265,7 +260,11 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
         if (i == 0 && (j == 13 || j == 14)) {
           expectedKeys += 1;
         }
-        assertEquals(expectedKeys, token.keySet().size());
+        if (i == 0 && j == 0) {
+          // The very first token won't have a Before key unless the document specifically has one
+          expectedKeys -= 1;
+        }
+        assertEquals("Error at sentence " + i + " word " + j, expectedKeys, token.keySet().size());
 
         // The known fields should be the ones checked above:
         //    CoreAnnotations.TextAnnotation
diff --git a/src/edu/stanford/nlp/pipeline/CoNLLUReader.java b/src/edu/stanford/nlp/pipeline/CoNLLUReader.java
@@ -332,16 +332,22 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
     // set sentences
     finalAnnotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
     // build document wide CoreLabels list
+    // TODO: do we need to put new SentenceIndexAnnotations on each of the IndexedWords?
+    // TODO: what about document annotation?
+    //   We should confirm that setting the SentenceIndexAnnotation like this isn't
+    //   distorting any of the SemanticGraphs
     List<CoreLabel> tokens = new ArrayList<>();
     finalAnnotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
     int documentIdx = 0;
     int sentenceIdx = 0;
     for (CoreMap sentence : finalAnnotation.get(CoreAnnotations.SentencesAnnotation.class)) {
       sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIdx);
       if (sentenceIdx > 0) {
-        // for now we're treating a CoNLL-U document as sentences separated by newline
-        // so every sentence after the first should have a newline as the previous character
-        sentence.get(CoreAnnotations.TokensAnnotation.class).get(0).setBefore(System.lineSeparator());
+        CoreMap previousSentence = finalAnnotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceIdx-1);
+        List<CoreLabel> previousTokens = previousSentence.get(CoreAnnotations.TokensAnnotation.class);
+        CoreLabel previousToken = previousTokens.get(previousTokens.size() - 1);
+        String previousAfter = previousToken.get(CoreAnnotations.AfterAnnotation.class);
+        sentence.get(CoreAnnotations.TokensAnnotation.class).get(0).set(CoreAnnotations.BeforeAnnotation.class, previousAfter);
       }
       for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
         token.set(CoreAnnotations.TokenBeginAnnotation.class, documentIdx);
@@ -482,15 +488,10 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
       CoreLabel cl = convertLineToCoreLabel(sentence, line);
       coreLabels.add(cl);
     }
-    // the last token should have a newline after
-    coreLabels.get(coreLabels.size() - 1).setAfter(System.lineSeparator());
-    // set before
-    if (!coreLabels.get(0).containsKey(CoreAnnotations.BeforeAnnotation.class)) {
-      coreLabels.get(0).setBefore("");
-    }
     for (int i = 1 ; i < coreLabels.size() ; i++) {
       // all words should match the after of the previous token
-      coreLabels.get(i).setBefore(coreLabels.get(i - 1).after());
+      coreLabels.get(i).set(CoreAnnotations.BeforeAnnotation.class,
+                            coreLabels.get(i - 1).get(CoreAnnotations.AfterAnnotation.class));
     }
     // handle MWT tokens and build the final sentence text
     int sentenceCharBegin = doc.docText.length();