@@ -24,7 +24,6 @@ public class CoNLLUReader {
24
24
* field constants
25
25
**/
26
26
// TODO: we should handle field 8, DEPS, for enhanced dependencies
27
- // doing that requires processing the empty nodes somehow
28
27
// TODO: read sent_id?
29
28
// TODO: read comments in general
30
29
// TODO: SpacesBefore on the first token should be checked
@@ -48,6 +47,7 @@ public class CoNLLUReader {
48
47
public static Pattern DOCUMENT_LINE = Pattern .compile ("^# newdoc" );
49
48
public static Pattern MWT_LINE = Pattern .compile ("^[0-9]+-[0-9]+.*" );
50
49
public static Pattern TOKEN_LINE = Pattern .compile ("^[0-9]+\t .*" );
50
+ public static Pattern EMPTY_LINE = Pattern .compile ("^[0-9]+[.][0-9]+\t .*" );
51
51
52
52
/**
53
53
* shorthands for CoreAnnotations
@@ -219,6 +219,8 @@ public class CoNLLUSentence {
219
219
220
220
// the token lines
221
221
public List <String > tokenLines = new ArrayList <>();
222
+ // in case the enhanced dependencies have empty words
223
+ public List <String > emptyLines = new ArrayList <>();
222
224
// data for the sentence contained in # key values
223
225
public HashMap <String , String > sentenceData = new HashMap <>();
224
226
// map indices in token list to mwt data if there is any
@@ -240,8 +242,9 @@ else if (MWT_LINE.matcher(line).matches())
240
242
addMWTData (line );
241
243
else if (TOKEN_LINE .matcher (line ).matches ())
242
244
tokenLines .add (line );
245
+ else if (EMPTY_LINE .matcher (line ).matches ())
246
+ emptyLines .add (line );
243
247
else
244
- // TODO: this is ignoring "empty" tokens
245
248
return true ;
246
249
return false ;
247
250
}
@@ -359,7 +362,23 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
359
362
public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line ) {
360
363
List <String > fields = Arrays .asList (line .split ("\t " ));
361
364
CoreLabel cl = new CoreLabel ();
362
- int sentenceTokenIndex = Integer .valueOf (fields .get (CoNLLU_IndexField ));
365
+
366
+ String indexField = fields .get (CoNLLU_IndexField );
367
+ int sentenceTokenIndex ;
368
+ boolean isEmpty ;
369
+ if (indexField .indexOf ('.' ) >= 0 ) {
370
+ isEmpty = true ;
371
+ String [] indexPieces = indexField .split ("[.]" , 2 );
372
+ sentenceTokenIndex = Integer .valueOf (indexPieces [0 ]);
373
+ cl .setIndex (sentenceTokenIndex );
374
+ int emptyIndex = Integer .valueOf (indexPieces [1 ]);
375
+ cl .set (CoreAnnotations .EmptyIndexAnnotation .class , emptyIndex );
376
+ } else {
377
+ isEmpty = false ;
378
+ sentenceTokenIndex = Integer .valueOf (indexField );
379
+ cl .setIndex (sentenceTokenIndex );
380
+ }
381
+
363
382
cl .setWord (fields .get (CoNLLU_WordField ));
364
383
cl .setValue (fields .get (CoNLLU_WordField ));
365
384
cl .setOriginalText (fields .get (CoNLLU_WordField ));
@@ -383,10 +402,14 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
383
402
extraColumnIdx ++) {
384
403
cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
385
404
}
386
- cl .setIndex (sentenceTokenIndex );
387
405
388
- // handle the MWT info
389
- if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
406
+ // handle the MWT info and after text
407
+ if (isEmpty ) {
408
+ // don't set an after for empty tokens
409
+ // empty tokens are not considered part of MWT
410
+ cl .setIsMWT (false );
411
+ cl .setIsMWTFirst (false );
412
+ } else if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
390
413
// set MWT text
391
414
cl .set (CoreAnnotations .MWTTokenTextAnnotation .class ,
392
415
sentence .mwtTokens .get (sentence .mwtData .get (sentenceTokenIndex - 1 )));
@@ -487,6 +510,12 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
487
510
}
488
511
}
489
512
513
+ List <CoreLabel > emptyLabels = new ArrayList <CoreLabel >();
514
+ for (String line : sentence .emptyLines ) {
515
+ CoreLabel cl = convertLineToCoreLabel (sentence , line );
516
+ emptyLabels .add (cl );
517
+ }
518
+
490
519
// build SemanticGraphEdges
491
520
List <SemanticGraphEdge > graphEdges = new ArrayList <>();
492
521
for (int i = 0 ; i < lines .size (); i ++) {
@@ -505,6 +534,11 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
505
534
Annotation sentenceCoreMap = new Annotation (doc .docText .substring (sentenceCharBegin ).trim ());
506
535
// add tokens
507
536
sentenceCoreMap .set (CoreAnnotations .TokensAnnotation .class , coreLabels );
537
+ // add empty tokens, if any exist
538
+ if (emptyLabels .size () > 0 ) {
539
+ sentenceCoreMap .set (CoreAnnotations .EmptyTokensAnnotation .class , emptyLabels );
540
+ }
541
+
508
542
// add dependency graph
509
543
sentenceCoreMap .set (SemanticGraphCoreAnnotations .BasicDependenciesAnnotation .class , depParse );
510
544
return sentenceCoreMap ;
0 commit comments