Skip to content

Commit 75bec7c

Browse files
committed
CoNLLUReader processes empty tokens and adds them to the Sentence CoreMap with the EmptyTokensAnnotation list. Add a test to CoNLLUReaderITest verifying that a sentence with an empty token is properly read.
1 parent 5c470ce commit 75bec7c

File tree

4 files changed

+110
-10
lines changed

4 files changed

+110
-10
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0022
2+
# text = Over 300 Iraqis are reported dead and 500 wounded in Fallujah alone.
3+
1 Over over ADV RB _ 2 advmod 2:advmod _
4+
2 300 300 NUM CD NumForm=Digit|NumType=Card 3 nummod 3:nummod _
5+
3 Iraqis Iraqi PROPN NNPS Number=Plur 5 nsubj:pass 5:nsubj:pass|6:nsubj:xsubj|8:nsubj:pass _
6+
4 are be AUX VBP Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 5 aux:pass 5:aux:pass _
7+
5 reported report VERB VBN Tense=Past|VerbForm=Part|Voice=Pass 0 root 0:root _
8+
6 dead dead ADJ JJ Degree=Pos 5 xcomp 5:xcomp _
9+
7 and and CCONJ CC _ 8 cc 8:cc|8.1:cc _
10+
8 500 500 NUM CD NumForm=Digit|NumType=Card 5 conj 5:conj:and|8.1:nsubj:pass|9:nsubj:xsubj _
11+
8.1 reported report VERB VBN Tense=Past|VerbForm=Part|Voice=Pass _ _ 5:conj:and CopyOf=5
12+
9 wounded wounded ADJ JJ Degree=Pos 8 orphan 8.1:xcomp _
13+
10 in in ADP IN _ 11 case 11:case _
14+
11 Fallujah Fallujah PROPN NNP Number=Sing 5 obl 5:obl:in _
15+
12 alone alone ADV RB _ 11 advmod 11:advmod SpaceAfter=No
16+
13 . . PUNCT . _ 5 punct 5:punct _
17+

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@
2323
public class CoNLLUReaderITest {
2424

2525
public String examplePath = String.format("edu/stanford/nlp/pipeline/es-example.conllu");
26-
public StanfordCoreNLP pipeline;
27-
public Annotation goldDocument;
28-
public Annotation readInDocument;
2926

3027
static final String[] EXPECTED_SENTENCE_TEXT = {
3128
"Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
@@ -98,7 +95,7 @@ public class CoNLLUReaderITest {
9895

9996
@Test
10097
public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException {
101-
readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(examplePath).get(0);
98+
Annotation readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(examplePath).get(0);
10299

103100
assertTrue(readInDocument.containsKey(CoreAnnotations.TextAnnotation.class));
104101
assertTrue(readInDocument.containsKey(CoreAnnotations.TokensAnnotation.class));
@@ -321,4 +318,42 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
321318
}
322319
}
323320
}
321+
322+
public String emptiesPath = String.format("edu/stanford/nlp/pipeline/en-example.conllu");
323+
324+
String[] EXPECTED_ENGLISH_WORDS = {
325+
"Over", "300", "Iraqis", "are", "reported", "dead", "and", "500", "wounded", "in", "Fallujah", "alone", "."
326+
};
327+
328+
@Test
329+
/**
330+
* Here we run fewer tests. Just make sure the EmptyToken is properly handled,
331+
* and make sure there isn't some weird line skipping going on with the rest of the tokens
332+
*/
333+
public void testReadingInEmpties() throws ClassNotFoundException, IOException {
334+
Annotation readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(emptiesPath).get(0);
335+
336+
// this document only has one sentence
337+
List<CoreMap> sentences = readInDocument.get(CoreAnnotations.SentencesAnnotation.class);
338+
assertEquals(1, sentences.size());
339+
340+
CoreMap sentence = sentences.get(0);
341+
342+
// cursory check of the tokens
343+
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
344+
assertEquals(13, tokens.size());
345+
assertEquals(13, EXPECTED_ENGLISH_WORDS.length);
346+
for (int i = 0; i < tokens.size(); ++i) {
347+
assertEquals(i+1, tokens.get(i).index());
348+
assertEquals(EXPECTED_ENGLISH_WORDS[i], tokens.get(i).value());
349+
}
350+
351+
List<CoreLabel> emptyTokens = sentence.get(CoreAnnotations.EmptyTokensAnnotation.class);
352+
assertEquals(1, emptyTokens.size());
353+
CoreLabel empty = emptyTokens.get(0);
354+
assertEquals(8, empty.index());
355+
assertEquals(Integer.valueOf(1), empty.get(CoreAnnotations.EmptyIndexAnnotation.class));
356+
assertEquals("reported", empty.value());
357+
}
358+
324359
}

src/edu/stanford/nlp/ling/CoreAnnotations.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,20 @@ public Class<Integer> getType() {
372372
}
373373
}
374374

375+
/**
376+
* This represents a list of the empty words. We can attach to the
377+
* Sentence CoreMap when reading CoNLLU files with such empty words
378+
* <br>
379+
* See the description of EmptyIndexAnnotation for more explanation
380+
* of when this is relevant
381+
*/
382+
public static class EmptyTokensAnnotation implements CoreAnnotation<List<CoreLabel>> {
383+
@Override
384+
public Class<List<CoreLabel>> getType() {
385+
return ErasureUtils.uncheckedCast(List.class);
386+
}
387+
}
388+
375389
/**
376390
* This indexes the beginning of a span of words, e.g., a constituent in a
377391
* tree. See {@link edu.stanford.nlp.trees.Tree#indexSpans(int)}.

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ public class CoNLLUReader {
2424
* field constants
2525
**/
2626
// TODO: we should handle field 8, DEPS, for an enhanced dependencies
27-
// doing that requires processing the empty nodes somehow
2827
// TODO: read sent_id?
2928
// TODO: read comments in general
3029
// TODO: SpacesBefore on the first token should be checked
@@ -48,6 +47,7 @@ public class CoNLLUReader {
4847
public static Pattern DOCUMENT_LINE = Pattern.compile("^# newdoc");
4948
public static Pattern MWT_LINE = Pattern.compile("^[0-9]+-[0-9]+.*");
5049
public static Pattern TOKEN_LINE = Pattern.compile("^[0-9]+\t.*");
50+
public static Pattern EMPTY_LINE = Pattern.compile("^[0-9]+[.][0-9]+\t.*");
5151

5252
/**
5353
* shorthands for CoreAnnotations
@@ -219,6 +219,8 @@ public class CoNLLUSentence {
219219

220220
// the token lines
221221
public List<String> tokenLines = new ArrayList<>();
222+
// in case the enhanced dependencies have empty words
223+
public List<String> emptyLines = new ArrayList<>();
222224
// data for the sentence contained in # key values
223225
public HashMap<String, String> sentenceData = new HashMap<>();
224226
// map indices in token list to mwt data if there is any
@@ -240,8 +242,9 @@ else if (MWT_LINE.matcher(line).matches())
240242
addMWTData(line);
241243
else if (TOKEN_LINE.matcher(line).matches())
242244
tokenLines.add(line);
245+
else if (EMPTY_LINE.matcher(line).matches())
246+
emptyLines.add(line);
243247
else
244-
// TODO: this is ignoring "empty" tokens
245248
return true;
246249
return false;
247250
}
@@ -359,7 +362,23 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
359362
public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
360363
List<String> fields = Arrays.asList(line.split("\t"));
361364
CoreLabel cl = new CoreLabel();
362-
int sentenceTokenIndex = Integer.valueOf(fields.get(CoNLLU_IndexField));
365+
366+
String indexField = fields.get(CoNLLU_IndexField);
367+
int sentenceTokenIndex;
368+
boolean isEmpty;
369+
if (indexField.indexOf('.') >= 0) {
370+
isEmpty = true;
371+
String[] indexPieces = indexField.split("[.]", 2);
372+
sentenceTokenIndex = Integer.valueOf(indexPieces[0]);
373+
cl.setIndex(sentenceTokenIndex);
374+
int emptyIndex = Integer.valueOf(indexPieces[1]);
375+
cl.set(CoreAnnotations.EmptyIndexAnnotation.class, emptyIndex);
376+
} else {
377+
isEmpty = false;
378+
sentenceTokenIndex = Integer.valueOf(indexField);
379+
cl.setIndex(sentenceTokenIndex);
380+
}
381+
363382
cl.setWord(fields.get(CoNLLU_WordField));
364383
cl.setValue(fields.get(CoNLLU_WordField));
365384
cl.setOriginalText(fields.get(CoNLLU_WordField));
@@ -383,10 +402,14 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
383402
extraColumnIdx++) {
384403
cl.set(extraColumns.get(extraColumnIdx), fields.get(extraColumnIdx));
385404
}
386-
cl.setIndex(sentenceTokenIndex);
387405

388-
// handle the MWT info
389-
if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
406+
// handle the MWT info and after text
407+
if (isEmpty) {
408+
// don't set an after for empty tokens
409+
// empty tokens are not considered part of MWT
410+
cl.setIsMWT(false);
411+
cl.setIsMWTFirst(false);
412+
} else if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
390413
// set MWT text
391414
cl.set(CoreAnnotations.MWTTokenTextAnnotation.class,
392415
sentence.mwtTokens.get(sentence.mwtData.get(sentenceTokenIndex - 1)));
@@ -487,6 +510,12 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
487510
}
488511
}
489512

513+
List<CoreLabel> emptyLabels = new ArrayList<CoreLabel>();
514+
for (String line : sentence.emptyLines) {
515+
CoreLabel cl = convertLineToCoreLabel(sentence, line);
516+
emptyLabels.add(cl);
517+
}
518+
490519
// build SemanticGraphEdges
491520
List<SemanticGraphEdge> graphEdges = new ArrayList<>();
492521
for (int i = 0; i < lines.size(); i++) {
@@ -505,6 +534,11 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
505534
Annotation sentenceCoreMap = new Annotation(doc.docText.substring(sentenceCharBegin).trim());
506535
// add tokens
507536
sentenceCoreMap.set(CoreAnnotations.TokensAnnotation.class, coreLabels);
537+
// add empty tokens, if any exist
538+
if (emptyLabels.size() > 0) {
539+
sentenceCoreMap.set(CoreAnnotations.EmptyTokensAnnotation.class, emptyLabels);
540+
}
541+
508542
// add dependency graph
509543
sentenceCoreMap.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, depParse);
510544
return sentenceCoreMap;

0 commit comments

Comments
 (0)