Skip to content

Commit 6152f3a

Browse files
committed
Refactor the MWT misc key values - we will want to keep the non-space ones separately. Check that SpacesAfter is correctly processed.
1 parent e5d494e commit 6152f3a

File tree

2 files changed

+39
-11
lines changed

2 files changed

+39
-11
lines changed

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,4 +498,33 @@ public void testReadingOneWord() throws ClassNotFoundException, IOException {
498498
assertEquals(1, graph.getRoots().size());
499499
}
500500

501+
502+
public static final String mwtSpaceAfterPath = String.format("edu/stanford/nlp/pipeline/en-example-misc-spaceafter.conllu");
503+
504+
@Test
505+
/**
506+
* This test checks that a SpaceAfter value in the MISC field of a multi-word token (MWT) is respected.
507+
*/
508+
public void testReadingMiscSpaceAfter() throws ClassNotFoundException, IOException {
509+
Annotation readInDocument = new CoNLLUReader(new Properties()).readCoNLLUFile(mwtSpaceAfterPath).get(0);
510+
511+
// this document only has one sentence
512+
List<CoreMap> sentences = readInDocument.get(CoreAnnotations.SentencesAnnotation.class);
513+
assertEquals(1, sentences.size());
514+
515+
CoreMap sentence = sentences.get(0);
516+
517+
// cursory check of the tokens
518+
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
519+
assertEquals(8, tokens.size());
520+
521+
// check that the non-last words of the MWT have an empty after(), i.e. no whitespace follows them
522+
assertEquals("", tokens.get(0).after());
523+
// check the SpaceAfter of the second word, which is where the MWT SpaceAfter should go
524+
assertEquals(" ", tokens.get(1).after());
525+
526+
assertTrue(sentence.containsKey(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class));
527+
assertTrue(sentence.containsKey(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class));
528+
}
529+
501530
}

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -437,9 +437,16 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
437437
cl.setIsMWT(false);
438438
cl.setIsMWTFirst(false);
439439
} else if (sentence.mwtData.containsKey(sentenceTokenIndex - 1)) {
440+
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
441+
Map<String, String> mwtKeyValues = new HashMap<>();
442+
if (miscInfo != null && !miscInfo.equals("_")) {
443+
Arrays.stream(miscInfo.split("\\|")).forEach(
444+
kv -> mwtKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
445+
}
446+
440447
// set MWT text
441448
cl.set(CoreAnnotations.MWTTokenTextAnnotation.class,
442-
sentence.mwtTokens.get(sentence.mwtData.get(sentenceTokenIndex - 1)));
449+
sentence.mwtTokens.get(sentence.mwtData.get(sentenceTokenIndex - 1)));
443450
cl.setIsMWT(true);
444451
// check if first
445452
if (sentence.mwtData.containsKey(sentenceTokenIndex - 2) &&
@@ -457,16 +464,8 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
457464
// then we aren't last, and SpaceAfter="" is implicitly true
458465
cl.setAfter("");
459466
} else {
460-
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
461-
if (miscInfo != null && !miscInfo.equals("_")) {
462-
Map<String, String> mwtKeyValues = new HashMap<>();
463-
Arrays.stream(miscInfo.split("\\|")).forEach(
464-
kv -> mwtKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
465-
String spaceAfter = miscToSpaceAfter(mwtKeyValues);
466-
cl.setAfter(spaceAfter);
467-
} else {
468-
cl.setAfter(" ");
469-
}
467+
String spaceAfter = miscToSpaceAfter(mwtKeyValues);
468+
cl.setAfter(spaceAfter);
470469
}
471470
} else {
472471
cl.setIsMWT(false);

0 commit comments

Comments
 (0)