18
18
*/
19
19
public class CoNLLUDocumentWriter {
20
20
21
+ private static final String LRB_PATTERN = "(?i)-LRB-" ;
22
+ private static final String RRB_PATTERN = "(?i)-RRB-" ;
21
23
22
- private static final String LRB_PATTERN = "(?i)-LRB-" ;
23
- private static final String RRB_PATTERN = "(?i)-RRB-" ;
24
24
25
+ public String printSemanticGraph (SemanticGraph basicSg ) {
26
+ return printSemanticGraph (basicSg , null , true );
27
+ }
25
28
26
- public String printSemanticGraph (SemanticGraph basicSg ) {
27
- return printSemanticGraph (basicSg , null , true );
28
- }
29
+ public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg ) {
30
+ return printSemanticGraph (basicSg , enhancedSg , true );
31
+ }
29
32
30
- public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg ) {
31
- return printSemanticGraph (basicSg , enhancedSg , true );
32
- }
33
+ public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg , boolean unescapeParenthesis ) {
34
+ StringBuilder sb = new StringBuilder ();
33
35
34
- public String printSemanticGraph (SemanticGraph basicSg , SemanticGraph enhancedSg , boolean unescapeParenthesis ) {
36
+ /* Print comments. */
37
+ for (String comment : basicSg .getComments ()) {
38
+ sb .append (comment ).append (System .lineSeparator ());
39
+ }
35
40
41
+ SemanticGraph tokenSg = enhancedSg != null ? enhancedSg : basicSg ;
36
42
43
+ for (IndexedWord token : tokenSg .vertexListSorted ()) {
44
+ /* Check for multiword tokens. */
45
+ if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
46
+ printSpan (sb , token );
47
+ } else if (token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) {
48
+ printMWT (sb , tokenSg , token );
49
+ }
37
50
38
- StringBuilder sb = new StringBuilder ();
51
+ /* Try to find main governor and additional dependencies. */
52
+ IndexedWord gov = basicSg .containsVertex (token ) ? basicSg .getParent (token ) : null ;
53
+ String govIdx = gov != null ? gov .toCopyIndex () : null ;
54
+ GrammaticalRelation reln = gov != null ? basicSg .getEdge (gov , token ).getRelation () : null ;
55
+
56
+ HashMap <String , String > enhancedDependencies = new HashMap <>();
57
+ if (enhancedSg != null ) {
58
+ for (IndexedWord parent : enhancedSg .getParents (token )) {
59
+ SemanticGraphEdge edge = enhancedSg .getEdge (parent , token );
60
+ String relationString = edge .getRelation ().toString ();
61
+ // for Joakim
62
+ //if (edge.getWeight() == 1.0) {
63
+ // relationString = relationString + ":ENH_CONTROL";
64
+ //} else if (edge.getWeight() == 3.0) {
65
+ // relationString = relationString + ":ENH_RELCL";
66
+ //} else if (edge.getWeight() == 4.0) {
67
+ // relationString = relationString + ":ENH_GAPPING";
68
+ //} else if (edge.getWeight() == 5.0) {
69
+ // relationString = relationString + ":ENH_CONJ_PROP";
70
+ //}
71
+ enhancedDependencies .put (parent .toCopyIndex (), relationString );
72
+ }
73
+ } else {
74
+ // add enhanced ones stored with token
75
+ HashMap <String , String > secondaryDeps = token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class );
76
+ if (secondaryDeps != null ) {
77
+ enhancedDependencies .putAll (token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class ));
78
+ //add basic dependency
79
+ if (gov != null ) {
80
+ enhancedDependencies .put (govIdx , reln .toString ());
81
+ }
82
+ }
83
+ }
39
84
40
- /* Print comments. */
41
- for (String comment : basicSg .getComments ()) {
42
- sb .append (comment ).append (System .lineSeparator ());
85
+ String additionalDepsString = CoNLLUUtils .toExtraDepsString (enhancedDependencies );
86
+ String word = token .word ();
87
+ String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
88
+ String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
89
+ String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
90
+ String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
91
+ String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
92
+ String relnName = reln == null ? "_" : reln .toString ();
93
+
94
+ // don't use after() directly; it returns a default of ""
95
+ // TODO: does this handle SpaceAfter on other tokens or SpacesAfter?
96
+ if (token .get (CoreAnnotations .AfterAnnotation .class ) != null && token .after ().equals ("" )) {
97
+ IndexedWord nextVertex = tokenSg .getNodeByIndexSafe (token .index () + 1 );
98
+ // the next word needs to exist and be part of the same MWT
99
+ // and either this word is the start of the MWT
100
+ // or this word is the middle of the same MWT as the next word
101
+ // if that is true, we will skip the SpaceAfter annotation
102
+ boolean inMWT = ((nextVertex != null && isMWTbutNotStart (nextVertex )) &&
103
+ ((token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) ||
104
+ (isMWTbutNotStart (token ))));
105
+ if (!inMWT ) {
106
+ if (misc .equals ("_" )) {
107
+ misc = "SpaceAfter=No" ;
108
+ } else {
109
+ misc = misc + "|SpaceAfter=No" ;
110
+ }
43
111
}
112
+ }
44
113
45
- SemanticGraph tokenSg = enhancedSg != null ? enhancedSg : basicSg ;
46
-
47
- for (IndexedWord token : tokenSg .vertexListSorted ()) {
48
- /* Check for multiword tokens. */
49
- if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
50
- printSpan (sb , token );
51
- } else if (token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) {
52
- printMWT (sb , tokenSg , token );
53
- }
54
-
55
- /* Try to find main governor and additional dependencies. */
56
- IndexedWord gov = basicSg .containsVertex (token ) ? basicSg .getParent (token ) : null ;
57
- String govIdx = gov != null ? gov .toCopyIndex () : null ;
58
- GrammaticalRelation reln = gov != null ? basicSg .getEdge (gov , token ).getRelation () : null ;
59
-
60
- HashMap <String , String > enhancedDependencies = new HashMap <>();
61
- if (enhancedSg != null ) {
62
-
63
- for (IndexedWord parent : enhancedSg .getParents (token )) {
64
- SemanticGraphEdge edge = enhancedSg .getEdge (parent , token );
65
- String relationString = edge .getRelation ().toString ();
66
- // for Joakim
67
- //if (edge.getWeight() == 1.0) {
68
- // relationString = relationString + ":ENH_CONTROL";
69
- //} else if (edge.getWeight() == 3.0) {
70
- // relationString = relationString + ":ENH_RELCL";
71
- //} else if (edge.getWeight() == 4.0) {
72
- // relationString = relationString + ":ENH_GAPPING";
73
- //} else if (edge.getWeight() == 5.0) {
74
- // relationString = relationString + ":ENH_CONJ_PROP";
75
- //}
76
- enhancedDependencies .put (parent .toCopyIndex (), relationString );
77
- }
78
-
79
- } else {
80
-
81
- // add enhanced ones stored with token
82
- HashMap <String , String > secondaryDeps = token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class );
83
- if (secondaryDeps != null ) {
84
- enhancedDependencies .putAll (token .get (CoreAnnotations .CoNLLUSecondaryDepsAnnotation .class ));
85
- //add basic dependency
86
- if (gov != null ) {
87
- enhancedDependencies .put (govIdx , reln .toString ());
88
- }
89
- }
90
- }
91
-
92
-
93
- String additionalDepsString = CoNLLUUtils .toExtraDepsString (enhancedDependencies );
94
- String word = token .word ();
95
- String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
96
- String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
97
- String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
98
- String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
99
- String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
100
- String relnName = reln == null ? "_" : reln .toString ();
101
-
102
- // don't use after() directly; it returns a default of ""
103
- // TODO: does this handle SpaceAfter on other tokens or SpacesAfter?
104
- if (token .get (CoreAnnotations .AfterAnnotation .class ) != null && token .after ().equals ("" )) {
105
- IndexedWord nextVertex = tokenSg .getNodeByIndexSafe (token .index () + 1 );
106
- // the next word needs to exist and be part of the same MWT
107
- // and either this word is the start of the MWT
108
- // or this word is the middle of the same MWT as the next word
109
- // if that is true, we will skip the SpaceAfter annotation
110
- boolean inMWT = ((nextVertex != null && isMWTbutNotStart (nextVertex )) &&
111
- ((token .containsKey (CoreAnnotations .IsFirstWordOfMWTAnnotation .class ) && token .get (CoreAnnotations .IsFirstWordOfMWTAnnotation .class )) ||
112
- (isMWTbutNotStart (token ))));
113
- if (!inMWT ) {
114
- if (misc .equals ("_" )) {
115
- misc = "SpaceAfter=No" ;
116
- } else {
117
- misc = misc + "|SpaceAfter=No" ;
118
- }
119
- }
120
- }
121
-
122
- /* Root. */
123
- if (govIdx == null && basicSg .getRoots ().contains (token )) {
124
- govIdx = "0" ;
125
- relnName = GrammaticalRelation .ROOT .toString ();
126
- } else if (govIdx == null ) {
127
- govIdx = "_" ;
128
- relnName = "_" ;
129
- }
130
-
131
- if (enhancedSg != null && enhancedSg .getRoots ().contains (token )) {
132
- if (enhancedDependencies .isEmpty ()) {
133
- additionalDepsString = "0:root" ;
134
- } else {
135
- additionalDepsString = "0:root|" + additionalDepsString ;
136
- }
137
- }
138
-
139
- if (unescapeParenthesis ) {
140
- word = word .replaceAll (LRB_PATTERN , "(" );
141
- word = word .replaceAll (RRB_PATTERN , ")" );
142
- lemma = lemma .replaceAll (LRB_PATTERN , "(" );
143
- lemma = lemma .replaceAll (RRB_PATTERN , ")" );
144
- }
145
-
146
- sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .toCopyIndex (), word ,
147
- lemma , upos , pos , featuresString , govIdx , relnName , additionalDepsString , misc ));
114
+ /* Root. */
115
+ if (govIdx == null && basicSg .getRoots ().contains (token )) {
116
+ govIdx = "0" ;
117
+ relnName = GrammaticalRelation .ROOT .toString ();
118
+ } else if (govIdx == null ) {
119
+ govIdx = "_" ;
120
+ relnName = "_" ;
121
+ }
122
+
123
+ if (enhancedSg != null && enhancedSg .getRoots ().contains (token )) {
124
+ if (enhancedDependencies .isEmpty ()) {
125
+ additionalDepsString = "0:root" ;
126
+ } else {
127
+ additionalDepsString = "0:root|" + additionalDepsString ;
148
128
}
149
- sb . append ( System . lineSeparator ());
129
+ }
150
130
151
- return sb .toString ();
131
+ if (unescapeParenthesis ) {
132
+ word = word .replaceAll (LRB_PATTERN , "(" );
133
+ word = word .replaceAll (RRB_PATTERN , ")" );
134
+ lemma = lemma .replaceAll (LRB_PATTERN , "(" );
135
+ lemma = lemma .replaceAll (RRB_PATTERN , ")" );
136
+ }
137
+
138
+ sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .toCopyIndex (), word ,
139
+ lemma , upos , pos , featuresString , govIdx , relnName , additionalDepsString , misc ));
152
140
}
141
+ sb .append (System .lineSeparator ());
142
+
143
+ return sb .toString ();
144
+ }
153
145
154
146
/**
155
147
* Outputs just one token span (MWT)
156
148
*/
157
149
public static void printSpan (StringBuilder sb , AbstractCoreLabel token ) {
158
- IntPair tokenSpan = token .get (CoreAnnotations .CoNLLUTokenSpanAnnotation .class );
159
- if (tokenSpan .getSource () == token .index ()) {
160
- String range = String .format ("%d-%d" , tokenSpan .getSource (), tokenSpan .getTarget ());
161
- sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .originalText ()));
162
- }
150
+ IntPair tokenSpan = token .get (CoreAnnotations .CoNLLUTokenSpanAnnotation .class );
151
+ if (tokenSpan .getSource () == token .index ()) {
152
+ String range = String .format ("%d-%d" , tokenSpan .getSource (), tokenSpan .getTarget ());
153
+ sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .originalText ()));
154
+ }
163
155
}
164
156
165
157
/**
@@ -178,22 +170,22 @@ public static boolean isMWTbutNotStart(IndexedWord nextVertex) {
178
170
}
179
171
180
172
public static void printMWT (StringBuilder sb , SemanticGraph graph , IndexedWord token ) {
181
- int startIndex = token .index ();
182
- int endIndex = startIndex ;
183
- // advance endIndex until we reach the end of the sentence, the start of the next MWT,
184
- // or a word which isn't part of any MWT
185
- IndexedWord nextVertex ;
186
- while ((nextVertex = graph .getNodeByIndexSafe (endIndex +1 )) != null ) {
187
- if (!isMWTbutNotStart (nextVertex )) {
188
- break ;
189
- }
190
- ++endIndex ;
173
+ int startIndex = token .index ();
174
+ int endIndex = startIndex ;
175
+ // advance endIndex until we reach the end of the sentence, the start of the next MWT,
176
+ // or a word which isn't part of any MWT
177
+ IndexedWord nextVertex ;
178
+ while ((nextVertex = graph .getNodeByIndexSafe (endIndex +1 )) != null ) {
179
+ if (!isMWTbutNotStart (nextVertex )) {
180
+ break ;
191
181
}
192
- if (startIndex == endIndex ) {
193
- return ;
194
- }
195
- String range = String .format ("%d-%d" , startIndex , endIndex );
196
- sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .get (CoreAnnotations .MWTTokenTextAnnotation .class )));
182
+ ++endIndex ;
183
+ }
184
+ if (startIndex == endIndex ) {
185
+ return ;
186
+ }
187
+ String range = String .format ("%d-%d" , startIndex , endIndex );
188
+ sb .append (String .format ("%s\t %s\t _\t _\t _\t _\t _\t _\t _\t _%n" , range , token .get (CoreAnnotations .MWTTokenTextAnnotation .class )));
197
189
}
198
190
199
191
/**
@@ -205,42 +197,40 @@ public static void printMWT(StringBuilder sb, SemanticGraph graph, IndexedWord t
205
197
*/
206
198
207
199
public String printPOSAnnotations (CoreMap sentence , boolean fakeDeps ) {
208
- StringBuilder sb = new StringBuilder ();
209
-
210
- int index = 0 ;
211
- for (CoreLabel token : sentence .get (CoreAnnotations .TokensAnnotation .class )) {
212
- /* Check for multiword tokens. */
213
- if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
214
- printSpan (sb , token );
215
- }
200
+ StringBuilder sb = new StringBuilder ();
216
201
217
- String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
218
- String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
219
- String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
220
- String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
221
- String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
222
- final String head ;
223
- final String rel ;
224
- final String headrel ;
225
- if (fakeDeps ) {
226
- // deps count from 1, with 0 as the root.
227
- // we will have the first word go to fake root
228
- head = Integer .toString (index );
229
- rel = (index == 0 ) ? "root" : "dep" ;
230
- headrel = head + ":" + rel ;
231
- } else {
232
- head = "_" ;
233
- rel = "_" ;
234
- headrel = "_" ;
235
- }
236
- index ++;
237
- sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .index (), token .word (),
238
- lemma , upos , pos , featuresString , head , rel , headrel , misc ));
202
+ int index = 0 ;
203
+ for (CoreLabel token : sentence .get (CoreAnnotations .TokensAnnotation .class )) {
204
+ /* Check for multiword tokens. */
205
+ if (token .containsKey (CoreAnnotations .CoNLLUTokenSpanAnnotation .class )) {
206
+ printSpan (sb , token );
239
207
}
240
- sb .append (System .lineSeparator ());
241
-
242
- return sb .toString ();
243
208
209
+ String upos = token .getString (CoreAnnotations .CoarseTagAnnotation .class , "_" );
210
+ String lemma = token .getString (CoreAnnotations .LemmaAnnotation .class , "_" );
211
+ String pos = token .getString (CoreAnnotations .PartOfSpeechAnnotation .class , "_" );
212
+ String featuresString = CoNLLUFeatures .toFeatureString (token .get (CoreAnnotations .CoNLLUFeats .class ));
213
+ String misc = token .getString (CoreAnnotations .CoNLLUMisc .class , "_" );
214
+ final String head ;
215
+ final String rel ;
216
+ final String headrel ;
217
+ if (fakeDeps ) {
218
+ // deps count from 1, with 0 as the root.
219
+ // we will have the first word go to fake root
220
+ head = Integer .toString (index );
221
+ rel = (index == 0 ) ? "root" : "dep" ;
222
+ headrel = head + ":" + rel ;
223
+ } else {
224
+ head = "_" ;
225
+ rel = "_" ;
226
+ headrel = "_" ;
227
+ }
228
+ index ++;
229
+ sb .append (String .format ("%s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s\t %s%n" , token .index (), token .word (),
230
+ lemma , upos , pos , featuresString , head , rel , headrel , misc ));
244
231
}
232
+ sb .append (System .lineSeparator ());
245
233
234
+ return sb .toString ();
235
+ }
246
236
}
0 commit comments