Skip to content

Commit e2acb52

Browse files
committed
Include SpacesAfter as well as SpaceAfter=No in the CoNLLUDocumentWriter
1 parent d9b61c4 commit e2acb52

File tree

1 file changed

+48
-14
lines changed

1 file changed

+48
-14
lines changed

src/edu/stanford/nlp/trees/ud/CoNLLUDocumentWriter.java

Lines changed: 48 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,32 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
3535
return printSemanticGraph(basicSg, enhancedSg, true, basicSg.getComments());
3636
}
3737

38+
// TODO: put in the same place as CoNLLUReader::unescapeSpacesAfter
39+
public static String escapeSpaces(String after) {
40+
StringBuilder result = new StringBuilder();
41+
for (int i = 0; i < after.length(); ++i) {
42+
char next = after.charAt(i);
43+
if (next == ' ') {
44+
result.append("\\s");
45+
} else if (next == '\t') {
46+
result.append("\\t");
47+
} else if (next == '\r') {
48+
result.append("\\r");
49+
} else if (next == '\n') {
50+
result.append("\\n");
51+
} else if (next == '|') {
52+
result.append("\\p");
53+
} else if (next == '\\') {
54+
result.append("\\\\");
55+
} else if (next == ' ') {
56+
result.append("\\u00A0");
57+
} else {
58+
result.append(next);
59+
}
60+
}
61+
return result.toString();
62+
}
63+
3864
public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg, boolean unescapeParenthesis, Collection<String> comments) {
3965
StringBuilder sb = new StringBuilder();
4066

@@ -97,21 +123,29 @@ public String printSemanticGraph(SemanticGraph basicSg, SemanticGraph enhancedSg
97123
String relnName = reln == null ? "_" : reln.toString();
98124

99125
// don't use after() directly; it returns a default of ""
100-
// TODO: does this handle SpaceAfter on other tokens or SpacesAfter?
101-
if (token.get(CoreAnnotations.AfterAnnotation.class) != null && token.after().equals("")) {
102-
IndexedWord nextVertex = tokenSg.getNodeByIndexSafe(token.index() + 1);
103-
// the next word needs to exist and be part of the same MWT
104-
// and either this word is the start of the MWT
105-
// or this word is the middle of the same MWT as the next word
106-
// if that is true, we will skip the SpaceAfter annotation
107-
boolean inMWT = ((nextVertex != null && isMWTbutNotStart(nextVertex)) &&
108-
((token.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) && token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) ||
109-
(isMWTbutNotStart(token))));
110-
if (!inMWT) {
111-
if (misc.equals("_")) {
112-
misc = "SpaceAfter=No";
126+
// TODO: also print SpacesBefore on the first token
127+
if (token.get(CoreAnnotations.AfterAnnotation.class) != null) {
128+
String after = token.after();
129+
if (!after.equals(" ")) {
130+
if (after.equals("")) {
131+
after = "SpaceAfter=No";
113132
} else {
114-
misc = misc + "|SpaceAfter=No";
133+
after = "SpacesAfter=" + escapeSpaces(after);
134+
}
135+
IndexedWord nextVertex = tokenSg.getNodeByIndexSafe(token.index() + 1);
136+
// the next word needs to exist and be part of the same MWT
137+
// and either this word is the start of the MWT
138+
// or this word is the middle of the same MWT as the next word
139+
// if that is true, we will skip the SpaceAfter annotation
140+
boolean inMWT = ((nextVertex != null && isMWTbutNotStart(nextVertex)) &&
141+
((token.containsKey(CoreAnnotations.IsFirstWordOfMWTAnnotation.class) && token.get(CoreAnnotations.IsFirstWordOfMWTAnnotation.class)) ||
142+
(isMWTbutNotStart(token))));
143+
if (!inMWT) {
144+
if (misc.equals("_")) {
145+
misc = after;
146+
} else {
147+
misc = misc + "|" + after;
148+
}
115149
}
116150
}
117151
}

0 commit comments

Comments
 (0)