Skip to content

Commit 0f69578

Browse files
committed
Add an EmptyIndexAnnotation to account for UD sentences with extra words between words (that is how UD handles copy words in enhanced dependencies, for example)
Add a small test of the empty index equals in IndexedWord
1 parent de026a3 commit 0f69578

File tree

3 files changed

+72
-0
lines changed

3 files changed

+72
-0
lines changed

src/edu/stanford/nlp/ling/CoreAnnotations.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,36 @@ public Class<Integer> getType() {
340340
}
341341
}
342342

343+
/**
344+
* Some datasets - for example, the UD Estonian EWT dataset - use
345+
* "empty" nodes to represent words that were unspoken / unwritten
346+
* but can be inferred from the structure of the sentence. For
347+
* example, in English, one could say "Gimme" instead of "Give me
348+
* it", and "it" could be treated as an empty word. A more common
349+
* example is when it is used in a similar manner to the copy nodes,
350+
* but displaced in time. So, for example, a sentence which uses
351+
* them in the UD English EWT dataset (no relation) is:
352+
*<br>
353+
* "Over 300 Iraqis are reported dead and 500 wounded in Fallujah alone."
354+
*<br>
355+
* Here, one could build a dependency graph using "reported" as a
356+
* copy node, but instead the en_ewt dataset creates an "empty" node
357+
* and builds the enhanced dependencies using that node.
358+
*<br>
359+
* "Over 300 Iraqis are reported dead and 500 *reported* wounded in Fallujah alone."
360+
*<br>
361+
* Rather than the second "reported" being a copy of word 5, it is treated
362+
* as a separate word 8.1
363+
*<br>
364+
* As with IndexAnnotation, we count from 1.
365+
*/
366+
public static class EmptyIndexAnnotation implements CoreAnnotation<Integer> {
367+
@Override
368+
public Class<Integer> getType() {
369+
return Integer.class;
370+
}
371+
}
372+
343373
/**
344374
* This indexes the beginning of a span of words, e.g., a constituent in a
345375
* tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}.

src/edu/stanford/nlp/ling/IndexedWord.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,14 @@ public void setEndPosition(int endPos) {
346346
label.setEndPosition(endPos);
347347
}
348348

349+
public void setEmptyIndex(int empty) {
350+
label.set(CoreAnnotations.EmptyIndexAnnotation.class, empty);
351+
}
352+
353+
public int getEmptyIndex(int empty) {
354+
return label.get(CoreAnnotations.EmptyIndexAnnotation.class);
355+
}
356+
349357
public int copyCount() {
350358
return copyCount;
351359
}
@@ -432,6 +440,14 @@ public boolean equals(Object o) {
432440
if (copyCount() != otherWord.copyCount()) {
433441
return false;
434442
}
443+
444+
// compare empty word index
445+
Integer myEmptyIndex = get(CoreAnnotations.EmptyIndexAnnotation.class);
446+
Integer otherEmptyIndex = otherWord.get(CoreAnnotations.EmptyIndexAnnotation.class);
447+
if ( ! Objects.equals(myEmptyIndex, otherEmptyIndex)) {
448+
return false;
449+
}
450+
435451
// Compare pseudo-positions
436452
if ( (!Double.isNaN(this.pseudoPosition) || !Double.isNaN(otherWord.pseudoPosition)) &&
437453
this.pseudoPosition != otherWord.pseudoPosition) {
@@ -465,6 +481,10 @@ public int hashCode() {
465481
result = 29 * result + get(CoreAnnotations.IndexAnnotation.class).hashCode();
466482
sensible = true;
467483
}
484+
Integer emptyIndex = get(CoreAnnotations.EmptyIndexAnnotation.class);
485+
if (emptyIndex != null) {
486+
result = 29 * result + emptyIndex.hashCode();
487+
}
468488
if ( ! sensible) {
469489
log.info("WARNING!!! You have hashed an IndexedWord with no docID, sentIndex or wordIndex. You will almost certainly lose");
470490
}

test/src/edu/stanford/nlp/ling/IndexedWordTest.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,26 @@ public void testIndexedWordComparisons() {
3535
Assert.assertTrue(iw3.compareTo(iw5) > 0);
3636
}
3737

38+
@Test
39+
public void testEmptyIndex() {
40+
IndexedWord iw = new IndexedWord("foo", 1, 1);
41+
iw.setWord("bar");
42+
43+
IndexedWord iw2 = new IndexedWord("foo", 1, 1);
44+
iw2.setWord("bar");
45+
46+
Assert.assertEquals(iw, iw2);
47+
48+
iw2.setEmptyIndex(5);
49+
Assert.assertNotEquals(iw, iw2);
50+
Assert.assertNotEquals(iw2, iw);
51+
52+
iw.setEmptyIndex(5);
53+
Assert.assertEquals(iw, iw2);
54+
Assert.assertEquals(iw2, iw);
55+
56+
iw2.setEmptyIndex(3);
57+
Assert.assertNotEquals(iw, iw2);
58+
Assert.assertNotEquals(iw2, iw);
59+
}
3860
}

0 commit comments

Comments
 (0)