Skip to content

Commit 873b136

Browse files
committed
When outputting an MWT, print the SpaceAfter/SpacesAfter annotation taken from the MWT's last word on the MWT token line itself, rather than on the last word's own line. This better agrees with the UD standard for where these annotations should be printed.
1 parent 8f120b9 commit 873b136

File tree

1 file changed

+31
-16
lines changed

1 file changed

+31
-16
lines changed

src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -124,28 +124,19 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
124124

125125
// don't use after() directly; it returns a default of ""
126126
// TODO: also print SpacesBefore on the first token
127-
if (token.get(CoreAnnotations.AfterAnnotation.class) != null) {
127+
Boolean isMWT = token.get(CoreAnnotations.IsMultiWordTokenAnnotation.class);
128+
if ((isMWT == null || !isMWT) && token.get(CoreAnnotations.AfterAnnotation.class) != null) {
128129
String after = token.after();
129130
if (!after.equals(" ")) {
130131
if (after.equals("")) {
131132
after = "SpaceAfter=No";
132133
} else {
133134
after = "SpacesAfter=" + escapeSpaces(after);
134135
}
135-
IndexedWord nextVertex = tokenSg.getNodeByIndexSafe(token.index() + 1);
136-
// the next word needs to exist and be part of the same MWT
137-
// and either this word is the start of the MWT
138-
// or this word is the middle of the same MWT as the next word
139-
// if that is true, we will skip the SpaceAfter annotation
140-
boolean inMWT = ((nextVertex != null && isMWTbutNotStart(nextVertex)) &&
141-
((token.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) && token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) ||
142-
(isMWTbutNotStart(token))));
143-
if (!inMWT) {
144-
if (misc.equals("_")) {
145-
misc = after;
146-
} else {
147-
misc = misc + "|" + after;
148-
}
136+
if (misc.equals("_")) {
137+
misc = after;
138+
} else {
139+
misc = misc + "|" + after;
149140
}
150141
}
151142
}
@@ -224,7 +215,31 @@ public static void printMWT(StringBuilder sb, SemanticGraph graph, IndexedWord t
224215
return;
225216
}
226217
String range = String.format("%d-%d", startIndex, endIndex);
227-
sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t_%n", range, token.get(CoreAnnotations.MWTTokenTextAnnotation.class)));
218+
219+
IndexedWord endVertex = graph.getNodeByIndexSafe(endIndex);
220+
221+
String misc = "_";
222+
if (token.get(CoreAnnotations.MWTTokenMiscAnnotation.class) != null) {
223+
misc = token.get(CoreAnnotations.MWTTokenMiscAnnotation.class);
224+
}
225+
226+
if (endVertex.get(CoreAnnotations.AfterAnnotation.class) != null) {
227+
String after = endVertex.after();
228+
if (!after.equals(" ")) {
229+
if (after.equals("")) {
230+
after = "SpaceAfter=No";
231+
} else {
232+
after = "SpacesAfter=" + escapeSpaces(after);
233+
}
234+
if (misc.equals("_")) {
235+
misc = after;
236+
} else {
237+
misc = misc + "|" + after;
238+
}
239+
}
240+
}
241+
242+
sb.append(String.format("%s\t%s\t_\t_\t_\t_\t_\t_\t_\t%s%n", range, token.get(CoreAnnotations.MWTTokenTextAnnotation.class), misc));
228243
}
229244

230245
/**

0 commit comments

Comments
 (0)