@@ -354,89 +354,96 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
354
354
}
355
355
356
356
/**
357
- * Convert a list of CoNLL-U token lines into a sentence CoreMap
358
- **/
359
- public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence ) {
360
- List <String > lines = sentence .tokenLines ;
361
- // create CoreLabels
362
- List <CoreLabel > coreLabels = new ArrayList <CoreLabel >();
363
- int sentenceTokenIndex = 1 ;
364
- for (String line : lines ) {
365
- List <String > fields = Arrays .asList (line .split ("\t " ));
366
- CoreLabel cl = new CoreLabel ();
367
- cl .setWord (fields .get (CoNLLU_WordField ));
368
- cl .setValue (fields .get (CoNLLU_WordField ));
369
- cl .setOriginalText (fields .get (CoNLLU_WordField ));
370
- cl .setIsNewline (false );
371
-
372
- if (!fields .get (CoNLLU_LemmaField ).equals ("_" ))
373
- cl .setLemma (fields .get (CoNLLU_LemmaField ));
374
-
375
- if (!fields .get (CoNLLU_UPOSField ).equals ("_" ))
376
- cl .set (CoreAnnotations .CoarseTagAnnotation .class , fields .get (CoNLLU_UPOSField ));
377
-
378
- final String xpos = fields .get (CoNLLU_XPOSField );
379
- if (!xpos .equals ("_" ))
380
- cl .setTag (xpos );
381
-
382
- if (!fields .get (CoNLLU_FeaturesField ).equals ("_" )) {
383
- CoNLLUFeatures features = new CoNLLUFeatures (fields .get (CoNLLU_FeaturesField ));
384
- cl .set (CoreAnnotations .CoNLLUFeats .class , features );
385
- }
386
- for (int extraColumnIdx = 10 ; extraColumnIdx < columnCount && extraColumnIdx < fields .size ();
387
- extraColumnIdx ++) {
388
- cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
357
+ * Convert a single ten column CoNLLU line into a CoreLabel
358
+ */
359
+ public CoreLabel convertLineToCoreLabel (CoNLLUSentence sentence , String line ) {
360
+ List <String > fields = Arrays .asList (line .split ("\t " ));
361
+ CoreLabel cl = new CoreLabel ();
362
+ int sentenceTokenIndex = Integer .valueOf (fields .get (CoNLLU_IndexField ));
363
+ cl .setWord (fields .get (CoNLLU_WordField ));
364
+ cl .setValue (fields .get (CoNLLU_WordField ));
365
+ cl .setOriginalText (fields .get (CoNLLU_WordField ));
366
+ cl .setIsNewline (false );
367
+
368
+ if (!fields .get (CoNLLU_LemmaField ).equals ("_" ))
369
+ cl .setLemma (fields .get (CoNLLU_LemmaField ));
370
+
371
+ if (!fields .get (CoNLLU_UPOSField ).equals ("_" ))
372
+ cl .set (CoreAnnotations .CoarseTagAnnotation .class , fields .get (CoNLLU_UPOSField ));
373
+
374
+ final String xpos = fields .get (CoNLLU_XPOSField );
375
+ if (!xpos .equals ("_" ))
376
+ cl .setTag (xpos );
377
+
378
+ if (!fields .get (CoNLLU_FeaturesField ).equals ("_" )) {
379
+ CoNLLUFeatures features = new CoNLLUFeatures (fields .get (CoNLLU_FeaturesField ));
380
+ cl .set (CoreAnnotations .CoNLLUFeats .class , features );
381
+ }
382
+ for (int extraColumnIdx = 10 ; extraColumnIdx < columnCount && extraColumnIdx < fields .size ();
383
+ extraColumnIdx ++) {
384
+ cl .set (extraColumns .get (extraColumnIdx ), fields .get (extraColumnIdx ));
385
+ }
386
+ cl .setIndex (sentenceTokenIndex );
387
+
388
+ // handle the MWT info
389
+ if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
390
+ // set MWT text
391
+ cl .set (CoreAnnotations .MWTTokenTextAnnotation .class ,
392
+ sentence .mwtTokens .get (sentence .mwtData .get (sentenceTokenIndex - 1 )));
393
+ cl .setIsMWT (true );
394
+ // check if first
395
+ if (sentence .mwtData .containsKey (sentenceTokenIndex - 2 ) &&
396
+ sentence .mwtData .get (sentenceTokenIndex -2 ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
397
+ cl .setIsMWTFirst (false );
398
+ } else {
399
+ cl .setIsMWTFirst (true );
389
400
}
390
- cl .setIndex (sentenceTokenIndex );
391
-
392
- // handle the MWT info
393
- if (sentence .mwtData .containsKey (sentenceTokenIndex - 1 )) {
394
- // set MWT text
395
- cl .set (CoreAnnotations .MWTTokenTextAnnotation .class ,
396
- sentence .mwtTokens .get (sentence .mwtData .get (sentenceTokenIndex - 1 )));
397
- cl .setIsMWT (true );
398
- // check if first
399
- if (sentence .mwtData .containsKey (sentenceTokenIndex - 2 ) &&
400
- sentence .mwtData .get (sentenceTokenIndex -2 ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
401
- cl .setIsMWTFirst (false );
402
- } else {
403
- cl .setIsMWTFirst (true );
404
- }
405
- // SpaceAfter / SpacesAfter should only apply to the last word in an MWT
406
- // all other words are treated as implicitly having SpaceAfter=No
407
- if (sentence .mwtData .containsKey (sentenceTokenIndex ) &&
408
- sentence .mwtData .get (sentenceTokenIndex ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
409
- // is there a next word MWT?
410
- // and it's the same MWT as this word?
411
- // then we aren't last, and SpaceAfter="" is implicitly true
412
- cl .setAfter ("" );
413
- } else {
414
- String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
415
- if (miscInfo != null && !miscInfo .equals ("_" )) {
416
- Map <String , String > miscKeyValues = new HashMap <>();
417
- Arrays .stream (miscInfo .split ("\\ |" )).forEach (
418
- kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
419
- String spaceAfter = miscToSpaceAfter (miscKeyValues );
420
- cl .setAfter (spaceAfter );
421
- } else {
422
- cl .setAfter (" " );
423
- }
424
- }
401
+ // SpaceAfter / SpacesAfter should only apply to the last word in an MWT
402
+ // all other words are treated as implicitly having SpaceAfter=No
403
+ if (sentence .mwtData .containsKey (sentenceTokenIndex ) &&
404
+ sentence .mwtData .get (sentenceTokenIndex ).equals (sentence .mwtData .get (sentenceTokenIndex -1 ))) {
405
+ // is there a next word MWT?
406
+ // and it's the same MWT as this word?
407
+ // then we aren't last, and SpaceAfter="" is implicitly true
408
+ cl .setAfter ("" );
425
409
} else {
426
- cl .setIsMWT (false );
427
- cl .setIsMWTFirst (false );
428
-
429
- if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
410
+ String miscInfo = sentence .mwtMiscs .get (sentence .mwtData .get (sentenceTokenIndex - 1 ));
411
+ if (miscInfo != null && !miscInfo .equals ("_" )) {
430
412
Map <String , String > miscKeyValues = new HashMap <>();
431
- Arrays .stream (fields . get ( CoNLLU_MiscField ) .split ("\\ |" )).forEach (
413
+ Arrays .stream (miscInfo .split ("\\ |" )).forEach (
432
414
kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
433
415
String spaceAfter = miscToSpaceAfter (miscKeyValues );
434
416
cl .setAfter (spaceAfter );
435
417
} else {
436
418
cl .setAfter (" " );
437
419
}
438
420
}
439
- sentenceTokenIndex ++;
421
+ } else {
422
+ cl .setIsMWT (false );
423
+ cl .setIsMWTFirst (false );
424
+
425
+ if (!fields .get (CoNLLU_MiscField ).equals ("_" )) {
426
+ Map <String , String > miscKeyValues = new HashMap <>();
427
+ Arrays .stream (fields .get (CoNLLU_MiscField ).split ("\\ |" )).forEach (
428
+ kv -> miscKeyValues .put (kv .split ("=" , 2 )[0 ], kv .split ("=" )[1 ]));
429
+ String spaceAfter = miscToSpaceAfter (miscKeyValues );
430
+ cl .setAfter (spaceAfter );
431
+ } else {
432
+ cl .setAfter (" " );
433
+ }
434
+ }
435
+ return cl ;
436
+ }
437
+
438
+ /**
439
+ * Convert a list of CoNLL-U token lines into a sentence CoreMap
440
+ **/
441
+ public CoreMap convertCoNLLUSentenceToCoreMap (CoNLLUDocument doc , CoNLLUSentence sentence ) {
442
+ List <String > lines = sentence .tokenLines ;
443
+ // create CoreLabels
444
+ List <CoreLabel > coreLabels = new ArrayList <CoreLabel >();
445
+ for (String line : lines ) {
446
+ CoreLabel cl = convertLineToCoreLabel (sentence , line );
440
447
coreLabels .add (cl );
441
448
}
442
449
// the last token should have a newline after
0 commit comments