|
3 | 3 | import java.io.BufferedReader;
|
4 | 4 | import java.io.IOException;
|
5 | 5 | import java.io.StringReader;
|
| 6 | +import java.util.ArrayList; |
6 | 7 | import java.util.List;
|
7 | 8 |
|
8 | 9 | import edu.stanford.nlp.io.RuntimeIOException;
|
@@ -51,166 +52,95 @@ public class UniversalPOSMapper {
|
51 | 52 | private UniversalPOSMapper() {} // static methods
|
52 | 53 |
|
53 | 54 | public static void load() {
|
54 |
| - String newLine = System.lineSeparator(); |
55 |
| - String rawPattern = String.join(newLine, |
56 |
| - // ------------------------------ |
57 |
| - // Context-sensitive mappings |
| 55 | + operations = new ArrayList<>(); |
| 56 | + // ------------------------------ |
| 57 | + // Context-sensitive mappings |
| 58 | + // ------------------------------ |
58 | 59 |
|
59 |
| - // TO -> PART (in CONJP phrases) |
60 |
| - "@CONJP < TO=target < VB", |
61 |
| - "", |
62 |
| - "relabel target PART", |
63 |
| - "", |
| 60 | + // TO -> PART (in CONJP phrases) |
| 61 | + String [][] contextMappings = new String [][] { |
| 62 | + { "@CONJP < TO=target < VB", "PART", }, |
| 63 | + { "@VP < @VP < (/^TO$/=target <... {/.*/})", "PART", }, |
| 64 | + { "@VP <: (/^TO$/=target <... {/.*/})", "PART", }, |
| 65 | + { "TO=target <... {/.*/}", "ADP", }, // otherwise TO -> ADP |
| 66 | + // Don't do this, we are now treating these as copular constructions |
| 67 | + // VB.* -> AUX (for passives where main verb is part of an ADJP) |
| 68 | + // @VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase|get|got|getting|gets|gotten)$/ ) < (@ADJP [ < VBN|VBD | < (@VP|ADJP < VBN|VBD) < CC ] ) |
| 69 | + //relabel target AUX", |
64 | 70 |
|
65 |
| - // TO -> PART |
66 |
| - "@VP < @VP < (/^TO$/=target <... {/.*/})", |
67 |
| - "", |
68 |
| - "relabel target PART", |
69 |
| - "", |
| 71 | + // VB.* -> AUX (for cases with fronted main VPs) |
| 72 | + { "@SINV < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ ) $-- (@VP < VBD|VBN))", |
| 73 | + "AUX", }, |
| 74 | + // VB.* -> AUX (another, rarer case of fronted VPs) |
| 75 | + { "@SINV < (@VP < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ )) $-- (@VP < VBD|VBN))", |
| 76 | + "AUX", }, |
70 | 77 |
|
71 |
| - // TO -> PART |
72 |
| - "@VP <: (/^TO$/=target <... {/.*/})", |
73 |
| - "", |
74 |
| - "relabel target PART", |
75 |
| - "", |
| 78 | + // VB.* -> AUX (passive, case 2) |
| 79 | + //"%SQ|SINV < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ $++ (VP < VBD|VBN))", |
| 80 | + //"%relabel target AUX", |
| 81 | + // VB.* -> AUX (active, case 1) |
| 82 | + { "VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)", |
| 83 | + "AUX", }, |
76 | 84 |
|
77 |
| - // TO -> ADP (otherwise) |
78 |
| - "TO=target <... {/.*/}", |
79 |
| - "", |
80 |
| - "relabel target ADP", |
81 |
| - "", |
| 85 | + // VB -> AUX (active, case 2) |
| 86 | + { "@SQ|SINV < (/^VB/=target $++ /^(?:VP)/ <... {/.*/})", "AUX" }, |
82 | 87 |
|
83 |
| - // Don't do this, we are now treating these as copular constructions |
84 |
| - // VB.* -> AUX (for passives where main verb is part of an ADJP) |
85 |
| - // @VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase|get|got|getting|gets|gotten)$/ ) < (@ADJP [ < VBN|VBD | < (@VP|ADJP < VBN|VBD) < CC ] ) |
86 |
| - //relabel target AUX", |
| 88 | + // otherwise, VB.* -> VERB |
| 89 | + { "/^VB.*/=target <... {/.*/}", "VERB", }, |
87 | 90 |
|
88 |
| - // VB.* -> AUX (for cases with fronted main VPs) |
89 |
| - "@SINV < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ ) $-- (@VP < VBD|VBN))", |
90 |
| - "", |
91 |
| - "relabel target AUX", |
92 |
| - "", |
| 91 | + // IN -> SCONJ (subordinating conjunctions) |
| 92 | + { "/^SBAR(-[^ ]+)?$/ < (IN=target $++ @S|FRAG|SBAR|SINV <... {/.*/})", "SCONJ", }, |
93 | 93 |
|
94 |
| - // VB.* -> AUX (another, rarer case of fronted VPs) |
95 |
| - "@SINV < (@VP < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ )) $-- (@VP < VBD|VBN))", |
96 |
| - "", |
97 |
| - "relabel target AUX", |
98 |
| - "", |
99 |
| - |
100 |
| - // VB.* -> AUX (passive, case 2) |
101 |
| - "%SQ|SINV < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ $++ (VP < VBD|VBN))", |
102 |
| - "", |
103 |
| - "%relabel target AUX", |
104 |
| - "", |
105 |
| - |
106 |
| - // VB.* -> AUX (active, case 1) |
107 |
| - "VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)", |
108 |
| - "", |
109 |
| - "relabel target AUX", |
110 |
| - "", |
111 |
| - |
112 |
| - // VB -> AUX (active, case 2) |
113 |
| - "@SQ|SINV < (/^VB/=target $++ /^(?:VP)/ <... {/.*/})", |
114 |
| - "", |
115 |
| - "relabel target AUX", |
116 |
| - "", |
117 |
| - |
118 |
| - // VB.* -> VERB |
119 |
| - "/^VB.*/=target <... {/.*/}", |
120 |
| - "", |
121 |
| - "relabel target VERB", |
122 |
| - "", |
123 |
| - |
124 |
| - // IN -> SCONJ (subordinating conjunctions) |
125 |
| - "/^SBAR(-[^ ]+)?$/ < (IN=target $++ @S|FRAG|SBAR|SINV <... {/.*/})", |
126 |
| - "", |
127 |
| - "relabel target SCONJ", |
128 |
| - "", |
129 |
| - |
130 |
| - // IN -> SCONJ (subordinating conjunctions II) |
131 |
| - "@PP < (IN=target $+ @SBAR|S)", |
132 |
| - "", |
133 |
| - "relabel target SCONJ", |
134 |
| - "", |
| 94 | + // IN -> SCONJ (subordinating conjunctions II) |
| 95 | + { "@PP < (IN=target $+ @SBAR|S)", "SCONJ" }, |
135 | 96 |
|
136 |
| - // IN -> ADP (otherwise) |
137 |
| - "IN=target < __", |
138 |
| - "", |
139 |
| - "relabel target ADP", |
140 |
| - "", |
| 97 | + // IN -> ADP (otherwise) |
| 98 | + { "IN=target < __", "ADP" }, |
141 | 99 |
|
142 |
| - // NN -> SYM (in case of the percent sign) |
143 |
| - "NN=target <... {/\\\\%/}", |
144 |
| - "", |
145 |
| - "relabel target SYM", |
146 |
| - "", |
| 100 | + // NN -> SYM (in case of the percent sign) |
| 101 | + { "NN=target <... {/[%]/}", "SYM" }, |
147 | 102 |
|
148 |
| - // fused det-noun pronouns -> PRON |
149 |
| - "NN=target < (/^(?i:(somebody|something|someone|anybody|anything|anyone|everybody|everything|everyone|nobody|nothing))$/)", |
150 |
| - "", |
151 |
| - "relabel target PRON", |
152 |
| - "", |
| 103 | + // fused det-noun pronouns -> PRON |
| 104 | + { "NN=target < (/^(?i:(somebody|something|someone|anybody|anything|anyone|everybody|everything|everyone|nobody|nothing))$/)", |
| 105 | + "PRON" }, |
153 | 106 |
|
154 |
| - // NN -> NOUN (otherwise) |
155 |
| - "NN=target <... {/.*/}", |
156 |
| - "", |
157 |
| - "relabel target NOUN", |
158 |
| - "", |
| 107 | + // NN -> NOUN (otherwise) |
| 108 | + { "NN=target <... {/.*/}", "NOUN" }, |
159 | 109 |
|
160 |
| - // NFP -> PUNCT (in case of possibly repeated hyphens, asterisks or tildes) |
161 |
| - "NFP=target <... {/^(~+|\\*+|\\-+)$/}", |
162 |
| - "", |
163 |
| - "relabel target PUNCT", |
164 |
| - "", |
| 110 | + // NFP -> PUNCT (in case of possibly repeated hyphens, asterisks or tildes) |
| 111 | + { "NFP=target <... {/^(~+|\\*+|\\-+)$/}", "PUNCT", }, |
165 | 112 |
|
166 |
| - // NFP -> SYM (otherwise) |
167 |
| - "NFP=target <... {/.*/}", |
168 |
| - "", |
169 |
| - "relabel target SYM", |
170 |
| - "", |
| 113 | + // NFP -> SYM (otherwise) |
| 114 | + { "NFP=target <... {/.*/}", "SYM" }, |
171 | 115 |
|
172 |
| - // RB -> PART when it is verbal negation (not or its reductions) |
173 |
| - "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", |
174 |
| - "", |
175 |
| - "relabel target PART", |
176 |
| - "", |
| 116 | + // RB -> PART when it is verbal negation (not or its reductions) |
| 117 | + { "@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)", "PART" }, |
177 | 118 |
|
178 |
| - // Otherwise RB -> ADV |
179 |
| - "RB=target <... {/.*/}", |
180 |
| - "", |
181 |
| - "relabel target ADV", |
182 |
| - "", |
| 119 | + // Otherwise RB -> ADV |
| 120 | + { "RB=target <... {/.*/}", "ADV" }, |
183 | 121 |
|
184 |
| - // DT -> PRON (pronominal this/that/these/those) |
185 |
| - "@NP <: (DT=target < /^(?i:th(is|at|ose|ese))$/)", |
186 |
| - "", |
187 |
| - "relabel target PRON", |
188 |
| - "", |
| 122 | + // DT -> PRON (pronominal this/that/these/those) |
| 123 | + { "@NP <: (DT=target < /^(?i:th(is|at|ose|ese))$/)", "PRON", }, |
189 | 124 |
|
190 |
| - // DT -> DET |
191 |
| - "DT=target < __", |
192 |
| - "", |
193 |
| - "relabel target DET", |
194 |
| - "", |
| 125 | + // DT -> DET |
| 126 | + { "DT=target < __", "DET" }, |
195 | 127 |
|
196 |
| - // WDT -> PRON (pronominal that/which) |
197 |
| - "@WHNP|NP <: (WDT=target < /^(?i:(that|which))$/)", |
198 |
| - "", |
199 |
| - "relabel target PRON", |
200 |
| - "", |
| 128 | + // WDT -> PRON (pronominal that/which) |
| 129 | + { "@WHNP|NP <: (WDT=target < /^(?i:(that|which))$/)", "PRON" }, |
201 | 130 |
|
202 |
| - // WDT->SCONJ (incorrectly tagged subordinating conjunctions) |
203 |
| - "@SBAR < (WDT=target < /^(?i:(that|which))$/)", |
204 |
| - "", |
205 |
| - "relabel target SCONJ", |
206 |
| - "", |
| 131 | + // WDT->SCONJ (incorrectly tagged subordinating conjunctions) |
| 132 | + { "@SBAR < (WDT=target < /^(?i:(that|which))$/)", "SCONJ" }, |
207 | 133 |
|
208 |
| - // WDT -> DET |
209 |
| - "WDT=target <... {/.*/}", |
210 |
| - "", |
211 |
| - "relabel target DET", |
212 |
| - "", |
| 134 | + // WDT -> DET |
| 135 | + { "WDT=target <... {/.*/}", "DET" }, |
| 136 | + }; |
| 137 | + for (String[] newOp : contextMappings) { |
| 138 | + operations.add(new Pair<>(TregexPattern.compile(newOp[0]), |
| 139 | + Tsurgeon.parseOperation("relabel target " + newOp[1]))); |
213 | 140 |
|
| 141 | + } |
| 142 | + String newLine = System.lineSeparator(); |
| 143 | + String rawPattern = String.join(newLine, |
214 | 144 | // ------------------------------
|
215 | 145 | // 1 to 1 mappings
|
216 | 146 | // ------------------------------
|
@@ -436,7 +366,8 @@ public static void load() {
|
436 | 366 | "relabel target X");
|
437 | 367 | StringReader reader = new StringReader(rawPattern);
|
438 | 368 | try (BufferedReader buffered = new BufferedReader(reader)) {
|
439 |
| - operations = Tsurgeon.getOperationsFromReader(buffered, new TregexPatternCompiler()); |
| 369 | + List<Pair<TregexPattern, TsurgeonPattern>> newOperations = Tsurgeon.getOperationsFromReader(buffered, new TregexPatternCompiler()); |
| 370 | + operations.addAll(newOperations); |
440 | 371 | } catch (IOException e) {
|
441 | 372 | throw new RuntimeIOException(e);
|
442 | 373 | }
|
|
0 commit comments