@@ -25,8 +25,8 @@ public class CoNLLUReader {
25
25
**/
26
26
// TODO: read sent_id?
27
27
// TODO: read comments in general
28
- // TODO: SpacesBefore on the first token should be checked
29
28
// TODO: reconsider using a newline as the after text of the last word
29
+ // TODO: keep around the rest of the misc annotations
30
30
public static final int CoNLLU_IndexField = 0 ;
31
31
public static final int CoNLLU_WordField = 1 ;
32
32
public static final int CoNLLU_LemmaField = 2 ;
@@ -408,6 +408,19 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
408
408
cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
409
409
}
410
410
411
+ Map <String , String > miscKeyValues = new HashMap <>();
412
+ if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
413
+ Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
414
+ kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
415
+ }
416
+
417
+ // SpacesBefore on a word that isn't the first in a document will
418
+ // be replaced with the SpacesAfter from the previous token later
419
+ String spacesBefore = miscKeyValues .get ("SpacesBefore" );
420
+ if (spacesBefore != null ) {
421
+ cl .setBefore (unescapeSpacesAfter (spacesBefore ));
422
+ }
423
+
411
424
// handle the MWT info and after text
412
425
if (isEmpty ) {
413
426
// don't set an after for empty tokens
@@ -437,10 +450,10 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
437
450
} else {
438
451
String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
439
452
if (miscInfo != null && !miscInfo .equals ("_" )) {
440
- Map <String , String > miscKeyValues = new HashMap <>();
453
+ Map <String , String > mwtKeyValues = new HashMap <>();
441
454
Arrays .stream (miscInfo .split ("\\ |" )).forEach (
442
- kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
443
- String spaceAfter = miscToSpaceAfter (miscKeyValues );
455
+ kv -> mwtKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
456
+ String spaceAfter = miscToSpaceAfter (mwtKeyValues );
444
457
cl .setAfter (spaceAfter );
445
458
} else {
446
459
cl .setAfter (" " );
@@ -450,15 +463,8 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
450
463
cl .setIsMWT (false );
451
464
cl .setIsMWTFirst (false );
452
465
453
- if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
454
- Map <String , String > miscKeyValues = new HashMap <>();
455
- Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
456
- kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
457
- String spaceAfter = miscToSpaceAfter (miscKeyValues );
458
- cl .setAfter (spaceAfter );
459
- } else {
460
- cl .setAfter (" " );
461
- }
466
+ String spaceAfter = miscToSpaceAfter (miscKeyValues );
467
+ cl .setAfter (spaceAfter );
462
468
}
463
469
return cl ;
464
470
}
@@ -477,7 +483,9 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
477
483
// the last token should have a newline after
478
484
coreLabels .get (coreLabels .size () - 1 ).setAfter (System .lineSeparator ());
479
485
// set before
480
- coreLabels .get (0 ).setBefore ("" );
486
+ if (!coreLabels .get (0 ).containsKey (CoreAnnotations .BeforeAnnotation .class )) {
487
+ coreLabels .get (0 ).setBefore ("" );
488
+ }
481
489
for (int i = 1 ; i < coreLabels .size () ; i ++) {
482
490
// every word after the first takes its before text from the after text of the previous token
483
491
coreLabels .get (i ).setBefore (coreLabels .get (i - 1 ).after ());
0 commit comments