Skip to content

Commit 5c470ce

Browse files
committed
Refactor the code that processes a single CoNLL-U line into its own method, and read the token index directly from that line's index field instead of incrementing a counter for each line of the CoNLL-U sentence
1 parent 8630893 commit 5c470ce

File tree

1 file changed

+80
-73
lines changed

1 file changed

+80
-73
lines changed

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 80 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -354,89 +354,96 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
354354
}
355355

356356
/**
357-
* Convert a list of CoNLL-U token lines into a sentence CoreMap
358-
**/
359-
public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence sentence) {
360-
List<String> lines = sentence.tokenLines;
361-
// create CoreLabels
362-
List<CoreLabel> coreLabels = new ArrayList<CoreLabel>();
363-
int sentenceTokenIndex = 1;
364-
for (String line : lines) {
365-
List<String> fields = Arrays.asList(line.split("\t"));
366-
CoreLabel cl = new CoreLabel();
367-
cl.setWord(fields.get(CoNLLU_WordField));
368-
cl.setValue(fields.get(CoNLLU_WordField));
369-
cl.setOriginalText(fields.get(CoNLLU_WordField));
370-
cl.setIsNewline(false);
371-
372-
if (!fields.get(CoNLLU_LemmaField).equals("_"))
373-
cl.setLemma(fields.get(CoNLLU_LemmaField));
374-
375-
if (!fields.get(CoNLLU_UPOSField).equals("_"))
376-
cl.set(CoreAnnotations.CoarseTagAnnotation.class, fields.get(CoNLLU_UPOSField));
377-
378-
final String xpos = fields.get(CoNLLU_XPOSField);
379-
if (!xpos.equals("_"))
380-
cl.setTag(xpos);
381-
382-
if (!fields.get(CoNLLU_FeaturesField).equals("_")) {
383-
CoNLLUFeatures features = new CoNLLUFeatures(fields.get(CoNLLU_FeaturesField));
384-
cl.set(CoreAnnotations.CoNLLUFeats.class, features);
385-
}
386-
for (int extraColumnIdx = 10; extraColumnIdx < columnCount && extraColumnIdx < fields.size();
387-
extraColumnIdx++) {
388-
cl.set(extraColumns.get(extraColumnIdx), fields.get(extraColumnIdx));
357+
* Convert a single ten column CoNLLU line into a CoreLabel
358+
*/
359+
public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
360+
List<String> fields = Arrays.asList(line.split("\t"));
361+
CoreLabel cl = new CoreLabel();
362+
int sentenceTokenIndex = Integer.valueOf(fields.get(CoNLLU_IndexField));
363+
cl.setWord(fields.get(CoNLLU_WordField));
364+
cl.setValue(fields.get(CoNLLU_WordField));
365+
cl.setOriginalText(fields.get(CoNLLU_WordField));
366+
cl.setIsNewline(false);
367+
368+
if (!fields.get(CoNLLU_LemmaField).equals("_"))
369+
cl.setLemma(fields.get(CoNLLU_LemmaField));
370+
371+
if (!fields.get(CoNLLU_UPOSField).equals("_"))
372+
cl.set(CoreAnnotations.CoarseTagAnnotation.class, fields.get(CoNLLU_UPOSField));
373+
374+
final String xpos = fields.get(CoNLLU_XPOSField);
375+
if (!xpos.equals("_"))
376+
cl.setTag(xpos);
377+
378+
if (!fields.get(CoNLLU_FeaturesField).equals("_")) {
379+
CoNLLUFeatures features = new CoNLLUFeatures(fields.get(CoNLLU_FeaturesField));
380+
cl.set(CoreAnnotations.CoNLLUFeats.class, features);
381+
}
382+
for (int extraColumnIdx = 10; extraColumnIdx < columnCount && extraColumnIdx < fields.size();
383+
extraColumnIdx++) {
384+
cl.set(extraColumns.get(extraColumnIdx), fields.get(extraColumnIdx));
385+
}
386+
cl.setIndex(sentenceTokenIndex);
387+
388+
// handle the MWT info
389+
if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
390+
// set MWT text
391+
cl.set(CoreAnnotations.MWTTokenTextAnnotation.class,
392+
sentence.mwtTokens.get(sentence.mwtData.get(sentenceTokenIndex - 1)));
393+
cl.setIsMWT(true);
394+
// check if first
395+
if (sentence.mwtData.containsKey(sentenceTokenIndex - 2) &&
396+
sentence.mwtData.get(sentenceTokenIndex-2).equals(sentence.mwtData.get(sentenceTokenIndex-1))) {
397+
cl.setIsMWTFirst(false);
398+
} else {
399+
cl.setIsMWTFirst(true);
389400
}
390-
cl.setIndex(sentenceTokenIndex);
391-
392-
// handle the MWT info
393-
if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
394-
// set MWT text
395-
cl.set(CoreAnnotations.MWTTokenTextAnnotation.class,
396-
sentence.mwtTokens.get(sentence.mwtData.get(sentenceTokenIndex - 1)));
397-
cl.setIsMWT(true);
398-
// check if first
399-
if (sentence.mwtData.containsKey(sentenceTokenIndex - 2) &&
400-
sentence.mwtData.get(sentenceTokenIndex-2).equals(sentence.mwtData.get(sentenceTokenIndex-1))) {
401-
cl.setIsMWTFirst(false);
402-
} else {
403-
cl.setIsMWTFirst(true);
404-
}
405-
// SpaceAfter / SpacesAfter should only apply to the last word in an MWT
406-
// all other words are treated as implicitly having SpaceAfter=No
407-
if (sentence.mwtData.containsKey(sentenceTokenIndex) &&
408-
sentence.mwtData.get(sentenceTokenIndex).equals(sentence.mwtData.get(sentenceTokenIndex-1))) {
409-
// is there a next word MWT?
410-
// and it's the same MWT as this word?
411-
// then we aren't last, and SpaceAfter="" is implicitly true
412-
cl.setAfter("");
413-
} else {
414-
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
415-
if (miscInfo != null && !miscInfo.equals("_")) {
416-
Map<String, String> miscKeyValues = new HashMap<>();
417-
Arrays.stream(miscInfo.split("\\|")).forEach(
418-
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
419-
String spaceAfter = miscToSpaceAfter(miscKeyValues);
420-
cl.setAfter(spaceAfter);
421-
} else {
422-
cl.setAfter(" ");
423-
}
424-
}
401+
// SpaceAfter / SpacesAfter should only apply to the last word in an MWT
402+
// all other words are treated as implicitly having SpaceAfter=No
403+
if (sentence.mwtData.containsKey(sentenceTokenIndex) &&
404+
sentence.mwtData.get(sentenceTokenIndex).equals(sentence.mwtData.get(sentenceTokenIndex-1))) {
405+
// is there a next word MWT?
406+
// and it's the same MWT as this word?
407+
// then we aren't last, and SpaceAfter="" is implicitly true
408+
cl.setAfter("");
425409
} else {
426-
cl.setIsMWT(false);
427-
cl.setIsMWTFirst(false);
428-
429-
if (!fields.get(CoNLLU_MiscField).equals("_")) {
410+
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
411+
if (miscInfo != null && !miscInfo.equals("_")) {
430412
Map<String, String> miscKeyValues = new HashMap<>();
431-
Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
413+
Arrays.stream(miscInfo.split("\\|")).forEach(
432414
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
433415
String spaceAfter = miscToSpaceAfter(miscKeyValues);
434416
cl.setAfter(spaceAfter);
435417
} else {
436418
cl.setAfter(" ");
437419
}
438420
}
439-
sentenceTokenIndex++;
421+
} else {
422+
cl.setIsMWT(false);
423+
cl.setIsMWTFirst(false);
424+
425+
if (!fields.get(CoNLLU_MiscField).equals("_")) {
426+
Map<String, String> miscKeyValues = new HashMap<>();
427+
Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
428+
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
429+
String spaceAfter = miscToSpaceAfter(miscKeyValues);
430+
cl.setAfter(spaceAfter);
431+
} else {
432+
cl.setAfter(" ");
433+
}
434+
}
435+
return cl;
436+
}
437+
438+
/**
439+
* Convert a list of CoNLL-U token lines into a sentence CoreMap
440+
**/
441+
public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence sentence) {
442+
List<String> lines = sentence.tokenLines;
443+
// create CoreLabels
444+
List<CoreLabel> coreLabels = new ArrayList<CoreLabel>();
445+
for (String line : lines) {
446+
CoreLabel cl = convertLineToCoreLabel(sentence, line);
440447
coreLabels.add(cl);
441448
}
442449
// the last token should have a newline after

0 commit comments

Comments
 (0)