Add an EmptyIndexAnnotation to account for UD sentences with extra words between words (that is how UD handles copy words in enhanced dependencies, for example)

AngledLuffa · AngledLuffa · commit 0f69578bdb83 · 2023-08-30T22:39:02.000-07:00
Add a small test of the empty index equals in IndexedWord
diff --git a/src/edu/stanford/nlp/ling/CoreAnnotations.java b/src/edu/stanford/nlp/ling/CoreAnnotations.java
@@ -340,6 +340,36 @@ public Class<Integer> getType() {
     }
   }
 
+  /**
+   * Index used to identify an "empty" node in a sentence.
+   *<br>
+   * Some datasets - for example, the UD Estonian EWT dataset - use
+   * "empty" nodes to represent words that were unspoken / unwritten
+   * but can be inferred from the structure of the sentence.  For
+   * example, in English, one could say "Gimme" instead of "Give me
+   * it", and "it" could be treated as an empty word.  A more common
+   * use is similar to that of the copy nodes, but with the repeated
+   * word displaced in time.  For example, a sentence which uses an
+   * empty node in the UD English EWT dataset (no relation to the
+   * Estonian one) is:
+   *<br>
+   * "Over 300 Iraqis are reported dead and 500 wounded in Fallujah alone."
+   *<br>
+   * Here, one could build a dependency graph using "reported" as a
+   * copy node, but instead the en_ewt dataset creates an "empty" node
+   * and builds the enhanced dependencies using that node:
+   *<br>
+   * "Over 300 Iraqis are reported dead and 500 *reported* wounded in Fallujah alone."
+   *<br>
+   * Rather than the second "reported" being a copy of word 5, it is
+   * treated as a separate word 8.1.  (Presumably this annotation then
+   * holds the 1 after the decimal point while IndexAnnotation holds
+   * the 8 - TODO confirm against the code that reads these files.)
+   *<br>
+   * As with IndexAnnotation, we count from 1.
+   */
+  public static class EmptyIndexAnnotation implements CoreAnnotation<Integer> {
+    @Override
+    public Class<Integer> getType() {
+      return Integer.class;
+    }
+  }
+
   /**
    * This indexes the beginning of a span of words, e.g., a constituent in a
    * tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}.
diff --git a/src/edu/stanford/nlp/ling/IndexedWord.java b/src/edu/stanford/nlp/ling/IndexedWord.java
@@ -346,6 +346,14 @@ public void setEndPosition(int endPos) {
     label.setEndPosition(endPos);
   }
 
+  /**
+   * Stores the given value on the underlying label as its
+   * EmptyIndexAnnotation.
+   *
+   * @param empty the empty index to record
+   */
+  public void setEmptyIndex(int empty) {
+    Class<CoreAnnotations.EmptyIndexAnnotation> key = CoreAnnotations.EmptyIndexAnnotation.class;
+    label.set(key, empty);
+  }
+
+  public int getEmptyIndex(int empty) {
+    return label.get(CoreAnnotations.EmptyIndexAnnotation.class);
+  }
+
   /** Returns the current value of this word's {@code copyCount} field. */
   public int copyCount() {
     return copyCount;
   }
@@ -432,6 +440,14 @@ public boolean equals(Object o) {
     if (copyCount() != otherWord.copyCount()) {
       return false;
     }
+
+    // compare empty word index
+    Integer myEmptyIndex = get(CoreAnnotations.EmptyIndexAnnotation.class);
+    Integer otherEmptyIndex = otherWord.get(CoreAnnotations.EmptyIndexAnnotation.class);
+    if ( ! Objects.equals(myEmptyIndex, otherEmptyIndex)) {
+      return false;
+    }
+
     // Compare pseudo-positions
     if ( (!Double.isNaN(this.pseudoPosition) || !Double.isNaN(otherWord.pseudoPosition)) &&
          this.pseudoPosition != otherWord.pseudoPosition) {
@@ -465,6 +481,10 @@ public int hashCode() {
       result = 29 * result + get(CoreAnnotations.IndexAnnotation.class).hashCode();
       sensible = true;
     }
+    Integer emptyIndex = get(CoreAnnotations.EmptyIndexAnnotation.class);
+    if (emptyIndex != null) {
+      result = 29 * result + emptyIndex.hashCode();
+    }
     if ( ! sensible) {
       log.info("WARNING!!!  You have hashed an IndexedWord with no docID, sentIndex or wordIndex. You will almost certainly lose");
     }
diff --git a/test/src/edu/stanford/nlp/ling/IndexedWordTest.java b/test/src/edu/stanford/nlp/ling/IndexedWordTest.java
@@ -35,4 +35,26 @@ public void testIndexedWordComparisons() {
     Assert.assertTrue(iw3.compareTo(iw5) > 0);
   }
 
+  /**
+   * Checks that IndexedWord equality is sensitive to the empty index:
+   * unset on both words, set on only one, set to the same value on
+   * both, and set to different values.  Every comparison after the
+   * first is made in both directions.
+   */
+  @Test
+  public void testEmptyIndex() {
+    IndexedWord left = new IndexedWord("foo", 1, 1);
+    IndexedWord right = new IndexedWord("foo", 1, 1);
+    left.setWord("bar");
+    right.setWord("bar");
+
+    // neither word carries an empty index
+    Assert.assertEquals(left, right);
+
+    // only one of the two words carries an empty index
+    right.setEmptyIndex(5);
+    Assert.assertNotEquals(left, right);
+    Assert.assertNotEquals(right, left);
+
+    // both words carry the same empty index
+    left.setEmptyIndex(5);
+    Assert.assertEquals(left, right);
+    Assert.assertEquals(right, left);
+
+    // the words carry different empty indices
+    right.setEmptyIndex(3);
+    Assert.assertNotEquals(left, right);
+    Assert.assertNotEquals(right, left);
+  }
 }