Skip to content

Commit 75bec7c

Browse files
committed
CoNLLUReader processes empty tokens and adds them to the Sentence CoreMap with the EmptyTokensAnnotation list. Add a test to CoNLLUReaderITest verifying that a sentence with an empty token is properly read.
1 parent 5c470ce commit 75bec7c

File tree

4 files changed

+110
-10
lines changed

4 files changed

+110
-10
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0022
2+
# text = Over 300 Iraqis are reported dead and 500 wounded in Fallujah alone.
3+
1 Over over ADV RB _ 2 advmod 2:advmod _
4+
2 300 300 NUM CD NumForm=Digit|NumType=Card 3 nummod 3:nummod _
5+
3 Iraqis Iraqi PROPN NNPS Number=Plur 5 nsubj:pass 5:nsubj:pass|6:nsubj:xsubj|8:nsubj:pass _
6+
4 are be AUX VBP Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 5 aux:pass 5:aux:pass _
7+
5 reported report VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _
8+
6 dead dead ADJ JJ Degree=Pos 5 xcomp 5:xcomp _
9+
7 and and CCONJ CC _ 8 cc 8:cc|8.1:cc _
10+
8 500 500 NUM CD NumForm=Digit|NumType=Card 5 conj 5:conj:and|8.1:nsubj:pass|9:nsubj:xsubj _
11+
8.1 reported report VERB VBN Tense=Past|VerbForm=Part|Voice=Pass _ _ 5:conj:and CopyOf=5
12+
9 wounded wounded ADJ JJ Degree=Pos 8 orphan 8.1:xcomp _
13+
10 in in ADP IN _ 11 case 11:case _
14+
11 Fallujah Fallujah PROPN NNP Number=Sing 5 obl 5:obl:in _
15+
12 alone alone ADV RB _ 11 advmod 11:advmod SpaceAfter=No
16+
13 . . PUNCT . _ 5 punct 5:punct _
17+

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@
2323
public class CoNLLUReaderITest {
2424

2525
public String examplePath = String.format("edu/stanford/nlp/pipeline/es-example.conllu");
26-
public StanfordCoreNLP pipeline;
27-
public Annotation goldDocument;
28-
public Annotation readInDocument;
2926

3027
static final String[] EXPECTED_SENTENCE_TEXT = {
3128
"Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
@@ -98,7 +95,7 @@ public class CoNLLUReaderITest {
9895

9996
@Test
10097
public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException {
101-
readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(examplePath).get(0);
98+
Annotation readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(examplePath).get(0);
10299

103100
assertTrue(readInDocument.containsKey(CoreAnnotations.TextAnnotation.class));
104101
assertTrue(readInDocument.containsKey(CoreAnnotations.TokensAnnotation.class));
@@ -321,4 +318,42 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
321318
}
322319
}
323320
}
321+
322+
public String emptiesPath = String.format("edu/stanford/nlp/pipeline/en-example.conllu");
323+
324+
String[] EXPECTED_ENGLISH_WORDS = {
325+
"Over", "300", "Iraqis", "are", "reported", "dead", "and", "500", "wounded", "in", "Fallujah", "alone", "."
326+
};
327+
328+
@Test
329+
/**
330+
* Here we run fewer tests. Just make sure the EmptyToken is properly handled,
331+
* and make sure there isn't some weird line skipping going on with the rest of the tokens
332+
*/
333+
public void testReadingInEmpties() throws ClassNotFoundException, IOException {
334+
Annotation readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(emptiesPath).get(0);
335+
336+
// this document only has one sentence
337+
List<CoreMap> sentences = readInDocument.get(CoreAnnotations.SentencesAnnotation.class);
338+
assertEquals(1, sentences.size());
339+
340+
CoreMap sentence = sentences.get(0);
341+
342+
// cursory check of the tokens
343+
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
344+
assertEquals(13, tokens.size());
345+
assertEquals(13, EXPECTED_ENGLISH_WORDS.length);
346+
for (int i = 0; i < tokens.size(); ++i) {
347+
assertEquals(i+1, tokens.get(i).index());
348+
assertEquals(EXPECTED_ENGLISH_WORDS[i], tokens.get(i).value());
349+
}
350+
351+
List<CoreLabel> emptyTokens = sentence.get(CoreAnnotations.EmptyTokensAnnotation.class);
352+
assertEquals(1, emptyTokens.size());
353+
CoreLabel empty = emptyTokens.get(0);
354+
assertEquals(8, empty.index());
355+
assertEquals(Integer.valueOf(1), empty.get(CoreAnnotations.EmptyIndexAnnotation.class));
356+
assertEquals("reported", empty.value());
357+
}
358+
324359
}

src/edu/stanford/nlp/ling/CoreAnnotations.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,20 @@ public Class<Integer> getType() {
372372
}
373373
}
374374

375+
/**
376+
* This represents a list of the empty words. We can attach to the
377+
* Sentence CoreMap when reading CoNLLU files with such empty words
378+
* <br>
379+
* See the description of EmptyIndexAnnotation for more explanation
380+
* of when this is relevant
381+
*/
382+
public static class EmptyTokensAnnotation implements CoreAnnotation<List<CoreLabel>> {
383+
@Override
384+
public Class<List<CoreLabel>> getType() {
385+
return ErasureUtils.uncheckedCast(List.class);
386+
}
387+
}
388+
375389
/**
376390
* This indexes the beginning of a span of words, e.g., a constituent in a
377391
* tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}.

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ public class CoNLLUReader {
2424
* field constants
2525
**/
2626
// TODO: we should handle field 8, DEPS, for an enhanced dependencies
27-
// doing that requires processing the empty nodes somehow
2827
// TODO: read sent_id?
2928
// TODO: read comments in general
3029
// TODO: SpacesBefore on the first token should be checked
@@ -48,6 +47,7 @@ public class CoNLLUReader {
4847
public static Pattern DOCUMENT_LINE = Pattern.compile("^# newdoc");
4948
public static Pattern MWT_LINE = Pattern.compile("^[0-9]+-[0-9]+.*");
5049
public static Pattern TOKEN_LINE = Pattern.compile("^[0-9]+\t.*");
50+
public static Pattern EMPTY_LINE = Pattern.compile("^[0-9]+[.][0-9]+\t.*");
5151

5252
/**
5353
* shorthands for CoreAnnotations
@@ -219,6 +219,8 @@ public class CoNLLUSentence {
219219

220220
// the token lines
221221
public List<String> tokenLines = new ArrayList<>();
222+
// in case the enhanced dependencies have empty words
223+
public List<String> emptyLines = new ArrayList<>();
222224
// data for the sentence contained in # key values
223225
public HashMap<String, String> sentenceData = new HashMap<>();
224226
// map indices in token list to mwt data if there is any
@@ -240,8 +242,9 @@ else if (MWT_LINE.matcher(line).matches())
240242
addMWTData(line);
241243
else if (TOKEN_LINE.matcher(line).matches())
242244
tokenLines.add(line);
245+
else if (EMPTY_LINE.matcher(line).matches())
246+
emptyLines.add(line);
243247
else
244-
// TODO: this is ignoring "empty" tokens
245248
return true;
246249
return false;
247250
}
@@ -359,7 +362,23 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
359362
public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
360363
List<String> fields = Arrays.asList(line.split("\t"));
361364
CoreLabel cl = new CoreLabel();
362-
int sentenceTokenIndex = Integer.valueOf(fields.get(CoNLLU_IndexField));
365+
366+
String indexField = fields.get(CoNLLU_IndexField);
367+
int sentenceTokenIndex;
368+
boolean isEmpty;
369+
if (indexField.indexOf('.') >= 0) {
370+
isEmpty = true;
371+
String[] indexPieces = indexField.split("[.]", 2);
372+
sentenceTokenIndex = Integer.valueOf(indexPieces[0]);
373+
cl.setIndex(sentenceTokenIndex);
374+
int emptyIndex = Integer.valueOf(indexPieces[1]);
375+
cl.set(CoreAnnotations.EmptyIndexAnnotation.class, emptyIndex);
376+
} else {
377+
isEmpty = false;
378+
sentenceTokenIndex = Integer.valueOf(indexField);
379+
cl.setIndex(sentenceTokenIndex);
380+
}
381+
363382
cl.setWord(fields.get(CoNLLU_WordField));
364383
cl.setValue(fields.get(CoNLLU_WordField));
365384
cl.setOriginalText(fields.get(CoNLLU_WordField));
@@ -383,10 +402,14 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
383402
extraColumnIdx++) {
384403
cl.set(extraColumns.get(extraColumnIdx), fields.get(extraColumnIdx));
385404
}
386-
cl.setIndex(sentenceTokenIndex);
387405

388-
// handle the MWT info
389-
if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
406+
// handle the MWT info and after text
407+
if (isEmpty) {
408+
// don't set an after for empty tokens
409+
// empty tokens are not considered part of MWT
410+
cl.setIsMWT(false);
411+
cl.setIsMWTFirst(false);
412+
} else if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
390413
// set MWT text
391414
cl.set(CoreAnnotations.MWTTokenTextAnnotation.class,
392415
sentence.mwtTokens.get(sentence.mwtData.get(sentenceTokenIndex - 1)));
@@ -487,6 +510,12 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
487510
}
488511
}
489512

513+
List<CoreLabel> emptyLabels = new ArrayList<CoreLabel>();
514+
for (String line : sentence.emptyLines) {
515+
CoreLabel cl = convertLineToCoreLabel(sentence, line);
516+
emptyLabels.add(cl);
517+
}
518+
490519
// build SemanticGraphEdges
491520
List<SemanticGraphEdge> graphEdges = new ArrayList<>();
492521
for (int i = 0; i < lines.size(); i++) {
@@ -505,6 +534,11 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
505534
Annotation sentenceCoreMap = new Annotation(doc.docText.substring(sentenceCharBegin).trim());
506535
// add tokens
507536
sentenceCoreMap.set(CoreAnnotations.TokensAnnotation.class, coreLabels);
537+
// add empty tokens, if any exist
538+
if (emptyLabels.size() > 0) {
539+
sentenceCoreMap.set(CoreAnnotations.EmptyTokensAnnotation.class, emptyLabels);
540+
}
541+
508542
// add dependency graph
509543
sentenceCoreMap.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, depParse);
510544
return sentenceCoreMap;

0 commit comments

Comments
 (0)