Skip to content

Commit 77f7291

Browse files
committed
When using Ssurgeon to split a word, and we know exactly how we want the word split, we can provide those splits rather than trying to regex our way to the right solution
1 parent 54fec11 commit 77f7291

File tree

3 files changed

+86
-28
lines changed

3 files changed

+86
-28
lines changed

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java

Lines changed: 46 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -40,25 +40,32 @@ public class SplitWord extends SsurgeonEdit {
4040

4141
final String node;
4242
final List<Pattern> nodeRegex;
43+
final List<String> exactPieces;
4344
final int headIndex;
4445
final GrammaticalRelation relation;
4546
final Map<Integer, String> nodeNames;
4647

47-
public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation, String nodeNames) {
48+
public SplitWord(String node, List<String> nodePieces, Integer headIndex, GrammaticalRelation relation, String nodeNames, boolean exactSplit) {
4849
if (node == null) {
4950
throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
5051
}
5152
this.node = node;
5253

53-
if (nodeRegex == null || nodeRegex.size() == 0) {
54-
throw new SsurgeonParseException("SplitWord expected -regex with regex to determine which pieces to split the word into");
54+
if (nodePieces == null || nodePieces.size() == 0) {
55+
throw new SsurgeonParseException("SplitWord expected -exact or -regex with regex to determine which pieces to split the word into");
5556
}
56-
if (nodeRegex.size() == 1) {
57-
throw new SsurgeonParseException("SplitWord expected at least two -regex");
57+
if (nodePieces.size() == 1) {
58+
throw new SsurgeonParseException("SplitWord expected at least two -exact or -regex");
5859
}
59-
this.nodeRegex = new ArrayList<>();
60-
for (int i = 0; i < nodeRegex.size(); ++i) {
61-
this.nodeRegex.add(Pattern.compile(nodeRegex.get(i)));
60+
if (exactSplit) {
61+
this.exactPieces = new ArrayList<>(nodePieces);
62+
this.nodeRegex = null;
63+
} else {
64+
this.nodeRegex = new ArrayList<>();
65+
for (int i = 0; i < nodePieces.size(); ++i) {
66+
this.nodeRegex.add(Pattern.compile(nodePieces.get(i)));
67+
}
68+
this.exactPieces = null;
6269
}
6370

6471
if (headIndex == null) {
@@ -80,7 +87,7 @@ public SplitWord(String node, List<String> nodeRegex, Integer headIndex, Grammat
8087
throw new SsurgeonParseException("SplitWord got a -name parameter which did not have a number for one of the names. Should look like 0=foo,1=bar");
8188
}
8289
int idx = Integer.valueOf(pieces[0]);
83-
if (idx >= this.nodeRegex.size()) {
90+
if (idx >= nodePieces.size()) {
8491
throw new SsurgeonParseException("SplitWord got an index in -name which was larger than the largest possible split piece, " + idx + " (this is 0-indexed)");
8592
}
8693
this.nodeNames.put(idx, pieces[1]);
@@ -96,8 +103,14 @@ public String toEditString() {
96103
buf.write(LABEL);
97104
buf.write("\t");
98105
buf.write("-node " + node + "\t");
99-
for (Pattern regex : nodeRegex) {
100-
buf.write("-regex " + regex + "\t");
106+
if (nodeRegex != null) {
107+
for (Pattern regex : nodeRegex) {
108+
buf.write("-regex " + regex + "\t");
109+
}
110+
} else {
111+
for (String piece : exactPieces) {
112+
buf.write("-exact " + piece + "\t");
113+
}
101114
}
102115
buf.write("-reln " + relation.toString() + "\t");
103116
buf.write("-headIndex " + headIndex);
@@ -113,22 +126,27 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
113126
//
114127
// each new word created will be the concatenation of all of the
115128
// matching groups from this pattern (regex mode only; when exact pieces are given, they are used verbatim as the new words)
116-
List<String> words = new ArrayList<>();
117-
for (int i = 0; i < nodeRegex.size(); ++i) {
118-
Matcher regexMatcher = nodeRegex.get(i).matcher(origWord);
119-
if (!regexMatcher.matches()) {
120-
return false;
121-
}
129+
List<String> words;
130+
if (exactPieces != null) {
131+
words = new ArrayList<>(exactPieces);
132+
} else {
133+
words = new ArrayList<>();
134+
for (int i = 0; i < nodeRegex.size(); ++i) {
135+
Matcher regexMatcher = nodeRegex.get(i).matcher(origWord);
136+
if (!regexMatcher.matches()) {
137+
return false;
138+
}
122139

123-
StringBuilder newWordBuilder = new StringBuilder();
124-
for (int j = 0; j < regexMatcher.groupCount(); ++j) {
125-
newWordBuilder.append(regexMatcher.group(j+1));
126-
}
127-
String newWord = newWordBuilder.toString();
128-
if (newWord.length() == 0) {
129-
return false;
140+
StringBuilder newWordBuilder = new StringBuilder();
141+
for (int j = 0; j < regexMatcher.groupCount(); ++j) {
142+
newWordBuilder.append(regexMatcher.group(j+1));
143+
}
144+
String newWord = newWordBuilder.toString();
145+
if (newWord.length() == 0) {
146+
return false;
147+
}
148+
words.add(newWord);
130149
}
131-
words.add(newWord);
132150
}
133151

134152
int matchedIndex = matchedNode.index();
@@ -137,7 +155,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
137155

138156
// move all words down by words.size() - 1
139157
// then move the original word down by headIndex
140-
SsurgeonUtils.moveNodes(sg, sm, x -> (x > matchedIndex), x -> x+nodeRegex.size() - 1, true);
158+
SsurgeonUtils.moveNodes(sg, sm, x -> (x > matchedIndex), x -> x+words.size() - 1, true);
141159
// the head node has its word replaced, and its index & links need
142160
// to be rearranged, but none of the links are added or removed
143161
if (headIndex > 0) {
@@ -147,7 +165,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
147165
matchedNode.setWord(words.get(headIndex));
148166
matchedNode.setValue(words.get(headIndex));
149167

150-
for (int i = 0; i < nodeRegex.size(); ++i) {
168+
// TODO: update SpaceAfter in a reasonable manner
169+
for (int i = 0; i < words.size(); ++i) {
151170
if (i == headIndex) {
152171
if (nodeNames.containsKey(i)) {
153172
sm.putNode(nodeNames.get(i), matchedNode);

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ public Collection<SsurgeonWordlist> getResources() {
421421
public static final String EDGE_NAME_ARG = "-edge";
422422
public static final String NODENAME_ARG = "-node";
423423
public static final String REGEX_ARG = "-regex";
424+
public static final String EXACT_ARG = "-exact";
424425
public static final String RELN_ARG = "-reln";
425426
public static final String NODE_PROTO_ARG = "-nodearg";
426427
public static final String WEIGHT_ARG = "-weight";
@@ -450,6 +451,8 @@ protected static class SsurgeonArgs {
450451

451452
public List<String> regex = new ArrayList<>();
452453

454+
public List<String> exact = new ArrayList<>();
455+
453456
// below are string representations of the intended values
454457
public String nodeString = null;
455458

@@ -528,6 +531,9 @@ private static SsurgeonArgs parseArgsBox(String args, Map<String, String> additi
528531
case REGEX_ARG:
529532
argsBox.regex.add(argsValue);
530533
break;
534+
case EXACT_ARG:
535+
argsBox.exact.add(argsValue);
536+
break;
531537
case NODE_PROTO_ARG:
532538
argsBox.nodeString = argsValue;
533539
break;
@@ -653,7 +659,14 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
653659
return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
654660
} else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
655661
GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
656-
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
662+
if (argsBox.regex.size() > 0 && argsBox.exact.size() > 0) {
663+
throw new SsurgeonParseException("Found both regex and exact in the splits for splitWord");
664+
}
665+
if (argsBox.regex.size() > 0) {
666+
return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name, false);
667+
} else {
668+
return new SplitWord(argsBox.nodes.get(0), argsBox.exact, argsBox.headIndex, reln, argsBox.name, true);
669+
}
657670
} else if (command.equalsIgnoreCase(ReindexGraph.LABEL)) {
658671
return new ReindexGraph();
659672
}

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2111,6 +2111,32 @@ public void readXMLSplitTwoWords() {
21112111
assertEquals(newSg, expected);
21122112
}
21132113

2114+
/**
2115+
* Test a splitWord which will split words based on exact pieces
2116+
*/
2117+
@Test
2118+
public void readXMLSplitTwoWordsExact() {
2119+
String doc = String.join(newline,
2120+
"<ssurgeon-pattern-list>",
2121+
" <ssurgeon-pattern>",
2122+
" <uid>38</uid>",
2123+
" <notes>Test splitting a word into two pieces with the head at the start</notes>",
2124+
" <language>UniversalEnglish</language>",
2125+
" <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
2126+
" <edit-list>splitWord -node split -exact foo -exact bar -reln dep -headIndex 0</edit-list>",
2127+
" </ssurgeon-pattern>",
2128+
"</ssurgeon-pattern-list>");
2129+
Ssurgeon inst = Ssurgeon.inst();
2130+
List<SsurgeonPattern> patterns = inst.readFromString(doc);
2131+
assertEquals(patterns.size(), 1);
2132+
SsurgeonPattern pattern = patterns.get(0);
2133+
2134+
SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
2135+
SemanticGraph newSg = pattern.iterate(sg).first;
2136+
SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [foo-2 dep> bar-3]]");
2137+
assertEquals(newSg, expected);
2138+
}
2139+
21142140
/**
21152141
* Test splitWord, which should split a word into pieces based on regex matches, with the head at position 1
21162142
*/

0 commit comments

Comments
 (0)