Skip to content

Commit b3f8f97

Browse files
committed
Read a SpacesBefore annotation from the MISC column at the start of a sentence / document and keep it on the token. The before text of the other tokens is generally set to match the after of the previous token. Include a short check of that in the unit test.
1 parent c59716b commit b3f8f97

File tree

3 files changed

+25
-15
lines changed

3 files changed

+25
-15
lines changed

data/edu/stanford/nlp/pipeline/en-example.conllu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# sent_id = weblog-blogspot.com_healingiraq_20040409053012_ENG_20040409_053012-0022
22
# text = Over 300 Iraqis are reported dead and 500 wounded in Fallujah alone.
3-
1 Over over ADV RB _ 2 advmod 2:advmod _
3+
1 Over over ADV RB _ 2 advmod 2:advmod SpacesBefore=\s\s
44
2 300 300 NUM CD NumForm=Digit|NumType=Card 3 nummod 3:nummod _
55
3 Iraqis Iraqi PROPN NNPS Number=Plur 5 nsubj:pass 5:nsubj:pass|6:nsubj:xsubj|8:nsubj:pass _
66
4 are be AUX VBP Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin 5 aux:pass 5:aux:pass _

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,8 @@ public void testReadingInEmpties() throws ClassNotFoundException, IOException {
419419

420420
SemanticGraph enhanced = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
421421
assertEquals(EXPECTED_ENHANCED, enhanced);
422+
423+
assertEquals(" ", tokens.get(0).before());
422424
}
423425

424426
}

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ public class CoNLLUReader {
2525
**/
2626
// TODO: read sent_id?
2727
// TODO: read comments in general
28-
// TODO: SpacesBefore on the first token should be checked
2928
// TODO: reconsider the newline as the after on the last word
29+
// TODO: keep around the rest of the misc annotations
3030
public static final int CoNLLU_IndexField = 0;
3131
public static final int CoNLLU_WordField = 1;
3232
public static final int CoNLLU_LemmaField = 2;
@@ -408,6 +408,19 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
408408
cl.set(extraColumns.get(extraColumnIdx), fields.get(extraColumnIdx));
409409
}
410410

411+
Map<String, String> miscKeyValues = new HashMap<>();
412+
if (!fields.get(CoNLLU_MiscField).equals("_")) {
413+
Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
414+
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
415+
}
416+
417+
// SpacesBefore on a word that isn't the first in a document will
418+
// be replaced with the SpacesAfter from the previous token later
419+
String spacesBefore = miscKeyValues.get("SpacesBefore");
420+
if (spacesBefore != null) {
421+
cl.setBefore(unescapeSpacesAfter(spacesBefore));
422+
}
423+
411424
// handle the MWT info and after text
412425
if (isEmpty) {
413426
// don't set an after for empty tokens
@@ -437,10 +450,10 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
437450
} else {
438451
String miscInfo = sentence.mwtMiscs.get(sentence.mwtData.get(sentenceTokenIndex - 1));
439452
if (miscInfo != null && !miscInfo.equals("_")) {
440-
Map<String, String> miscKeyValues = new HashMap<>();
453+
Map<String, String> mwtKeyValues = new HashMap<>();
441454
Arrays.stream(miscInfo.split("\\|")).forEach(
442-
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
443-
String spaceAfter = miscToSpaceAfter(miscKeyValues);
455+
kv -> mwtKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
456+
String spaceAfter = miscToSpaceAfter(mwtKeyValues);
444457
cl.setAfter(spaceAfter);
445458
} else {
446459
cl.setAfter(" ");
@@ -450,15 +463,8 @@ public CoreLabel convertLineToCoreLabel(CoNLLUSentence sentence, String line) {
450463
cl.setIsMWT(false);
451464
cl.setIsMWTFirst(false);
452465

453-
if (!fields.get(CoNLLU_MiscField).equals("_")) {
454-
Map<String, String> miscKeyValues = new HashMap<>();
455-
Arrays.stream(fields.get(CoNLLU_MiscField).split("\\|")).forEach(
456-
kv -> miscKeyValues.put(kv.split("=", 2)[0], kv.split("=")[1]));
457-
String spaceAfter = miscToSpaceAfter(miscKeyValues);
458-
cl.setAfter(spaceAfter);
459-
} else {
460-
cl.setAfter(" ");
461-
}
466+
String spaceAfter = miscToSpaceAfter(miscKeyValues);
467+
cl.setAfter(spaceAfter);
462468
}
463469
return cl;
464470
}
@@ -477,7 +483,9 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
477483
// the last token should have a newline after
478484
coreLabels.get(coreLabels.size() - 1).setAfter(System.lineSeparator());
479485
// set before
480-
coreLabels.get(0).setBefore("");
486+
if (!coreLabels.get(0).containsKey(CoreAnnotations.BeforeAnnotation.class)) {
487+
coreLabels.get(0).setBefore("");
488+
}
481489
for (int i = 1 ; i < coreLabels.size() ; i++) {
482490
// all words should match the after of the previous token
483491
coreLabels.get(i).setBefore(coreLabels.get(i - 1).after());

0 commit comments

Comments
 (0)