Skip to content

Commit a9bef7d

Browse files
committed
Don't use lineSeparator for the 'after' between sentences. Just use a single space unless the CoNLL-U document explicitly specifies a SpacesAfter.
1 parent 8d6ea40 commit a9bef7d

File tree

2 files changed

+17
-17
lines changed

2 files changed

+17
-17
lines changed

itest/src/edu/stanford/nlp/pipeline/CoNLLUReaderITest.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ public class CoNLLUReaderITest {
3131
"Pero la existencia de dos recién nacidos en la misma caja sólo podía deberse a un descuido de fábrica.",
3232
"De allí las rebajas."
3333
};
34-
static final String EXPECTED_TEXT = String.join(System.lineSeparator(), EXPECTED_SENTENCE_TEXT) + System.lineSeparator();
34+
static final String EXPECTED_TEXT = String.join(" ", EXPECTED_SENTENCE_TEXT) + " ";
3535

3636
static final String[][] EXPECTED_WORD_TEXT = {
3737
{"Pero", "la", "existencia", "de", "dos", "recién", "nacidos", "en", "la", "misma", "caja", "sólo", "podía", "deber", "se", "a", "un", "descuido", "de", "fábrica", "."},
@@ -194,8 +194,6 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
194194
CoreLabel token = tokens.get(j);
195195
if (i == 0 && j == 1) {
196196
assertEquals(" ", token.after());
197-
} else if (j == tokens.size() - 1) {
198-
assertEquals(System.lineSeparator(), token.after());
199197
} else if (j == tokens.size() - 2) {
200198
assertEquals("", token.after());
201199
} else if (i == 0 && j == 13) {
@@ -207,10 +205,7 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
207205
if (i == 0 && j == 2) {
208206
assertEquals(" ", token.before());
209207
} else if (i == 0 && j == 0) {
210-
// TODO: is it properly reading the SpacesBefore on the first token?
211208
assertEquals("", token.before());
212-
} else if (j == 0) {
213-
assertEquals(System.lineSeparator(), token.before());
214209
} else if (j == tokens.size() - 1) {
215210
assertEquals("", token.before());
216211
} else if (i == 0 && j == 14) {
@@ -265,7 +260,11 @@ public void testReadingInCoNLLUFile() throws ClassNotFoundException, IOException
265260
if (i == 0 && (j == 13 || j == 14)) {
266261
expectedKeys += 1;
267262
}
268-
assertEquals(expectedKeys, token.keySet().size());
263+
if (i == 0 && j == 0) {
264+
// The very first token of the document won't have a Before key unless the document specifically has one
265+
expectedKeys -= 1;
266+
}
267+
assertEquals("Error at sentence " + i + " word " + j, expectedKeys, token.keySet().size());
269268

270269
// The known fields should be the ones checked above:
271270
// CoreAnnotations.TextAnnotation

src/edu/stanford/nlp/pipeline/CoNLLUReader.java

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -332,16 +332,22 @@ public Annotation convertCoNLLUDocumentToAnnotation(CoNLLUDocument doc) {
332332
// set sentences
333333
finalAnnotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
334334
// build document wide CoreLabels list
335+
// TODO: do we need to put new SentenceIndexAnnotations on each of the IndexedWords?
336+
// TODO: what about document annotation?
337+
// We should confirm that setting the SentenceIndexAnnotation like this isn't
338+
// distorting any of the SemanticGraphs
335339
List<CoreLabel> tokens = new ArrayList<>();
336340
finalAnnotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
337341
int documentIdx = 0;
338342
int sentenceIdx = 0;
339343
for (CoreMap sentence : finalAnnotation.get(CoreAnnotations.SentencesAnnotation.class)) {
340344
sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentenceIdx);
341345
if (sentenceIdx > 0) {
342-
// for now we're treating a CoNLL-U document as sentences separated by newline
343-
// so every sentence after the first should have a newline as the previous character
344-
sentence.get(CoreAnnotations.TokensAnnotation.class).get(0).setBefore(System.lineSeparator());
346+
CoreMap previousSentence = finalAnnotation.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceIdx-1);
347+
List<CoreLabel> previousTokens = previousSentence.get(CoreAnnotations.TokensAnnotation.class);
348+
CoreLabel previousToken = previousTokens.get(previousTokens.size() - 1);
349+
String previousAfter = previousToken.get(CoreAnnotations.AfterAnnotation.class);
350+
sentence.get(CoreAnnotations.TokensAnnotation.class).get(0).set(CoreAnnotations.BeforeAnnotation.class, previousAfter);
345351
}
346352
for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
347353
token.set(CoreAnnotations.TokenBeginAnnotation.class, documentIdx);
@@ -482,15 +488,10 @@ public CoreMap convertCoNLLUSentenceToCoreMap(CoNLLUDocument doc, CoNLLUSentence
482488
CoreLabel cl = convertLineToCoreLabel(sentence, line);
483489
coreLabels.add(cl);
484490
}
485-
// the last token should have a newline after
486-
coreLabels.get(coreLabels.size() - 1).setAfter(System.lineSeparator());
487-
// set before
488-
if (!coreLabels.get(0).containsKey(CoreAnnotations.BeforeAnnotation.class)) {
489-
coreLabels.get(0).setBefore("");
490-
}
491491
for (int i = 1 ; i < coreLabels.size() ; i++) {
492492
// all words should match the after of the previous token
493-
coreLabels.get(i).setBefore(coreLabels.get(i - 1).after());
493+
coreLabels.get(i).set(CoreAnnotations.BeforeAnnotation.class,
494+
coreLabels.get(i - 1).get(CoreAnnotations.AfterAnnotation.class));
494495
}
495496
// handle MWT tokens and build the final sentence text
496497
int sentenceCharBegin = doc.docText.length();

0 commit comments

Comments
 (0)