Read the enhanced graph in the CoNLLUReader

AngledLuffa · AngledLuffa · commit c59716b670d2 · 2025-05-29T10:07:46.000-07:00
diff --git a/itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java b/itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java
@@ -2,6 +2,7 @@
 
 import edu.stanford.nlp.ling.*;
 import edu.stanford.nlp.semgraph.*;
+import edu.stanford.nlp.trees.GrammaticalRelation;
 import edu.stanford.nlp.util.*;
 
 import static org.junit.Assert.assertEquals;
@@ -11,7 +12,9 @@
 
 import java.io.*;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Properties;
 
 import org.junit.Before;
@@ -111,10 +114,14 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
     }
 
     // Compare sentence ids
+    // Check that the enhanced dependencies exist
+    //   (these sentences should all have them, but we will check it
+    //    more thoroughly for the Empty test sentence)
     // Check number of keys on each sentence
     for (int i = 0; i < sentences.size(); ++i) {
       assertEquals(Integer.valueOf(i), sentences.get(i).get(CoreAnnotations.SentenceIndexAnnotation.class));
-      assertEquals(4, sentences.get(i).keySet().size());
+      assertTrue(sentences.get(i).containsKey(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class));
+      assertEquals(5, sentences.get(i).keySet().size());
     }
 
     // Check the document tokens and the sentence tokens lists are the same
@@ -319,12 +326,66 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
     }
   }
 
-  public String emptiesPath = String.format("edu/stanford/nlp/pipeline/en-example.conllu");
+  public static final String emptiesPath = String.format("edu/stanford/nlp/pipeline/en-example.conllu");
 
-  String[] EXPECTED_ENGLISH_WORDS = {
+  static final String[] EXPECTED_ENGLISH_WORDS = {
     "Over", "300", "Iraqis", "are", "reported", "dead", "and", "500", "wounded", "in", "Fallujah", "alone", "."
   };
 
+  static final String[][] EXPECTED_ENHANCED_EDGES = {  // each row is {dependent, governor, relation}, keyed by CoNLL-U index
+    {"1", "2", "advmod"},
+    {"2", "3", "nummod"},
+    {"3", "5", "nsubj:pass"},
+    {"3", "6", "nsubj:xsubj"},
+    {"3", "8", "nsubj:pass"},
+    {"4", "5", "aux:pass"},
+    {"6", "5", "xcomp"},
+    {"7", "8", "cc"},
+    {"7", "8.1", "cc"},
+    {"8", "5", "conj:and"},
+    {"8", "8.1", "nsubj:pass"},
+    {"8", "9", "nsubj:xsubj"},
+    {"8.1", "5", "conj:and"},
+    {"9", "8.1", "xcomp"},
+    {"10", "11", "case"},
+    {"11", "5", "obl:in"},
+    {"12", "11", "advmod"},
+    {"13", "5", "punct"},
+  };
+  static final SemanticGraph EXPECTED_ENHANCED = buildEnhancedTest();
+  /** Builds the expected enhanced graph: one node per word, the 8.1 empty node, and EXPECTED_ENHANCED_EDGES. */
+  static SemanticGraph buildEnhancedTest() {
+    Map<String, IndexedWord> nodesById = new HashMap<>();
+    int position = 0;
+    for (String word : EXPECTED_ENGLISH_WORDS) {
+      position++;
+      CoreLabel token = new CoreLabel();
+      token.setValue(word);
+      token.setIndex(position);
+      token.setSentIndex(0);
+      nodesById.put(Integer.toString(position), new IndexedWord(token));
+    }
+    // the empty node, keyed as 8.1 (index 8, empty index 1)
+    CoreLabel emptyToken = new CoreLabel();
+    emptyToken.setValue("reported");
+    emptyToken.setIndex(8);
+    emptyToken.set(CoreAnnotations.EmptyIndexAnnotation.class, 1);
+    emptyToken.setSentIndex(0);
+    nodesById.put("8.1", new IndexedWord(emptyToken));
+    // each row of EXPECTED_ENHANCED_EDGES is {dependent, governor, relation}
+    List<SemanticGraphEdge> expectedEdges = new ArrayList<>();
+    for (String[] row : EXPECTED_ENHANCED_EDGES) {
+      GrammaticalRelation relation = GrammaticalRelation.valueOf(row[2]);
+      expectedEdges.add(new SemanticGraphEdge(nodesById.get(row[1]), nodesById.get(row[0]), relation, 1.0, false));
+    }
+    SemanticGraph graph = SemanticGraphFactory.makeFromEdges(expectedEdges);
+    // word 5 ("reported") is the only root
+    List<IndexedWord> rootNodes = new ArrayList<>();
+    rootNodes.add(nodesById.get("5"));
+    graph.setRoots(rootNodes);
+    return graph;
+  }
+
   @Test
   /**
    * Here we run fewer tests.  Just make sure the EmptyToken is properly handled,
@@ -355,6 +416,9 @@ public void testReadingInEmpties() throws ClassNotFoundException, IOException {
     assertEquals(Integer.valueOf(1), empty.get(CoreAnnotations.EmptyIndexAnnotation.class));
     assertEquals(0, empty.sentIndex());
     assertEquals("reported", empty.value());
+
+    SemanticGraph enhanced = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
+    assertEquals(EXPECTED_ENHANCED, enhanced);
   }
 
 }
diff --git a/src/edu/stanford/nlp/pipeline/CoNLLUReader.java b/src/edu/stanford/nlp/pipeline/CoNLLUReader.java
@@ -23,7 +23,6 @@ public class CoNLLUReader {
   /**
    * field constants
    **/
-  // TODO: we should handle field 8, DEPS, for an enhanced dependencies
   // TODO: read sent_id?
   // TODO: read comments in general
   // TODO: SpacesBefore on the first token should be checked
@@ -36,6 +35,7 @@ public class CoNLLUReader {
   public static final int CoNLLU_FeaturesField = 5;
   public static final int CoNLLU_GovField = 6;
   public static final int CoNLLU_RelnField = 7;
+  public static final int CoNLLU_EnhancedField = 8;
   public static final int CoNLLU_MiscField = 9;
 
   public int columnCount = 10;
@@ -521,7 +521,17 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
       emptyLabels.add(cl);
     }
 
-    // first, prebuild the IndexedWords that will make up the basic graph
+    // build sentence CoreMap with full text
+    Annotation sentenceCoreMap = new Annotation(doc.docText.substring(sentenceCharBegin).trim());
+    // add tokens
+    sentenceCoreMap.set(CoreAnnotations.TokensAnnotation.class, coreLabels);
+    // add empty tokens, if any exist
+    if (emptyLabels.size() > 0) {
+      sentenceCoreMap.set(CoreAnnotations.EmptyTokensAnnotation.class, emptyLabels);
+    }
+
+    // to build the basic SemanticGraph, first, prebuild the
+    // IndexedWords that will make up the basic graph
     // (and possibly the enhanced graph)
     Map<String, IndexedWord> graphNodes = new HashMap<>();
     for (CoreLabel label : coreLabels) {
@@ -533,10 +543,13 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
       graphNodes.put(index, new IndexedWord(empty));
     }
 
+    boolean hasEnhanced = false;
     // build SemanticGraphEdges for a basic graph
     List<SemanticGraphEdge> graphEdges = new ArrayList<>();
     for (int i = 0; i < lines.size(); i++) {
       List<String> fields = Arrays.asList(lines.get(i).split("\t"));
+      // a DEPS column other than "_" on any line means there is an enhanced graph
+      hasEnhanced = hasEnhanced || !fields.get(CoNLLU_EnhancedField).equals("_");
       // skip the ROOT node
       if (fields.get(CoNLLU_GovField).equals("0"))
         continue;
@@ -547,20 +560,36 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
     }
     // build SemanticGraph
     SemanticGraph depParse = SemanticGraphFactory.makeFromEdges(graphEdges);
+    // add dependency graph
+    sentenceCoreMap.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, depParse);
 
-    // TODO: here we build the enhanced graph, if it exists
-
-    // build sentence CoreMap with full text
-    Annotation sentenceCoreMap = new Annotation(doc.docText.substring(sentenceCharBegin).trim());
-    // add tokens
-    sentenceCoreMap.set(CoreAnnotations.TokensAnnotation.class, coreLabels);
-    // add empty tokens, if any exist
-    if (emptyLabels.size() > 0) {
-      sentenceCoreMap.set(CoreAnnotations.EmptyTokensAnnotation.class, emptyLabels);
+    if (hasEnhanced) {
+      // the DEPS column holds |-separated head:relation arcs; the lines
+      // for empty nodes are read too, since they have arcs of their own
+      List<SemanticGraphEdge> enhancedEdges = new ArrayList<>();
+      List<IndexedWord> roots = new ArrayList<>();
+      List<String> allLines = new ArrayList<>(lines);
+      allLines.addAll(sentence.emptyLines);
+      for (String line : allLines) {
+        List<String> fields = Arrays.asList(line.split("\t"));
+        IndexedWord dependent = graphNodes.get(fields.get(CoNLLU_IndexField));
+        for (String arc : fields.get(CoNLLU_EnhancedField).split("[|]")) {
+          String[] arcPieces = arc.split(":", 2);
+          if (arcPieces[0].equals("0")) {
+            roots.add(dependent);
+          } else if (arcPieces.length == 2) {
+            // "_" (no arcs on this line) has no relation part, so it is skipped
+            IndexedWord gov = graphNodes.get(arcPieces[0]);
+            GrammaticalRelation reln = GrammaticalRelation.valueOf(arcPieces[1]);
+            enhancedEdges.add(new SemanticGraphEdge(gov, dependent, reln, 1.0, false));
+          }
+        }
+      }
+      SemanticGraph enhancedParse = SemanticGraphFactory.makeFromEdges(enhancedEdges);
+      enhancedParse.setRoots(roots);
+      sentenceCoreMap.set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, enhancedParse);
     }
 
-    // add dependency graph
-    sentenceCoreMap.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, depParse);
     return sentenceCoreMap;
   }