Skip to content

Commit ebdac38

Browse files
committed
MergeNodes can now handle multiple nodes at the same time
Add a test of MergeNodes for multiple nodes
1 parent d9083ce commit ebdac38

File tree

3 files changed

+110
-62
lines changed

3 files changed

+110
-62
lines changed

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java

Lines changed: 74 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,11 @@
22

33
import java.io.StringWriter;
44
import java.util.ArrayList;
5+
import java.util.Collections;
6+
import java.util.HashSet;
57
import java.util.List;
68
import java.util.Map;
9+
import java.util.Set;
710
import java.util.TreeMap;
811

912
import edu.stanford.nlp.ling.CoreAnnotations;
@@ -26,14 +29,11 @@
2629
*/
2730
public class MergeNodes extends SsurgeonEdit {
2831
public static final String LABEL = "mergeNodes";
29-
final List<String> nodes;
32+
final List<String> names;
3033
final Map<String, String> attributes;
3134

32-
public MergeNodes(List<String> nodes, Map<String, String> attributes) {
33-
if (nodes.size() > 2) {
34-
throw new SsurgeonParseException("Cannot support MergeNodes of size " + nodes.size() + " yet... please file an issue on github if you need this feature");
35-
}
36-
this.nodes = new ArrayList<>(nodes);
35+
public MergeNodes(List<String> names, Map<String, String> attributes) {
36+
this.names = new ArrayList<>(names);
3737
this.attributes = new TreeMap<>(attributes);
3838
}
3939

@@ -44,7 +44,7 @@ public MergeNodes(List<String> nodes, Map<String, String> attributes) {
4444
public String toEditString() {
4545
StringWriter buf = new StringWriter();
4646
buf.write(LABEL);
47-
for (String name : nodes) {
47+
for (String name : names) {
4848
buf.write("\t");
4949
buf.write(Ssurgeon.NODENAME_ARG + " " + name);
5050
}
@@ -61,90 +61,97 @@ public String toEditString() {
6161
}
6262

6363
/**
64-
* If the two named nodes are next to each other, and the edges of
65-
* the graph allow for it, squish the two words into one word
64+
* If the named nodes are next to each other, and the edges of
65+
* the graph allow for it, squish those words into one word
6666
*/
6767
@Override
6868
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
69-
String name1 = nodes.get(0);
70-
String name2 = nodes.get(1);
71-
72-
IndexedWord node1 = sm.getNode(name1);
73-
IndexedWord node2 = sm.getNode(name2);
74-
75-
if (node1 == null || node2 == null) {
76-
return false;
77-
}
78-
79-
List<SemanticGraphEdge> n1_to_n2 = sg.getAllEdges(node1, node2);
80-
List<SemanticGraphEdge> n2_to_n1 = sg.getAllEdges(node2, node1);
81-
if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) {
82-
return false;
83-
}
84-
85-
// TODO: what about the case where the dep is or has copies?
86-
final IndexedWord head;
87-
final IndexedWord dep;
88-
89-
if (n1_to_n2.size() > 0) {
90-
head = node1;
91-
dep = node2;
92-
} else {
93-
head = node2;
94-
dep = node1;
95-
}
96-
97-
// If the dep has any edges that aren't between dep & head, abort
98-
// TODO: we could probably make it adjust edges with "dep" as source, instead
99-
for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) {
100-
if (e.getTarget() != head) {
69+
List<IndexedWord> nodes = new ArrayList<>();
70+
for (String name : names) {
71+
IndexedWord node = sm.getNode(name);
72+
if (node == null) {
10173
return false;
10274
}
103-
}
104-
for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) {
105-
if (e.getSource() != head) {
106-
return false;
75+
nodes.add(node);
76+
}
77+
Collections.sort(nodes);
78+
79+
IndexedWord head = null;
80+
for (IndexedWord candidate : nodes) {
81+
if (sg.hasChildren(candidate)) {
82+
// if multiple nodes have children inside the graph,
83+
// perhaps we could merge them all,
84+
// but the easiest thing to do is just abort
85+
// TODO: an alternate approach would be to look for nodes with a head
86+
// outside the nodes in the phrase to merge
87+
if (head != null) {
88+
return false;
89+
}
90+
head = candidate;
10791
}
10892
}
10993

110-
IndexedWord left;
111-
IndexedWord right;
112-
if (node1.index() < node2.index()) {
113-
left = node1;
114-
right = node2;
115-
} else {
116-
left = node2;
117-
right = node1;
94+
Set<Integer> depIndices = new HashSet<Integer>();
95+
for (IndexedWord other : nodes) {
96+
if (other == head) {
97+
continue;
98+
}
99+
Set<IndexedWord> parents = sg.getParents(other);
100+
// this shouldn't happen
101+
if (parents.size() == 0) {
102+
return false;
103+
}
104+
// iterate instead of just do the first in case
105+
// this one day is doing an graph with extra dependencies
106+
for (IndexedWord parent : parents) {
107+
if (parent != head) {
108+
return false;
109+
}
110+
}
111+
depIndices.add(other.index());
118112
}
119113

120114
CoreLabel newLabel = AddDep.fromCheapStrings(attributes);
121115
// CoreLabel.setWord wipes out the lemma for some reason
122116
// we may eventually change that, but for now, we compensate for that here
123117
String lemma = newLabel.lemma();
118+
124119
if (newLabel.word() == null) {
125-
String newWord = left.word() + right.word();
126-
newLabel.setWord(newWord);
120+
StringBuilder newWord = new StringBuilder();
121+
for (IndexedWord node : nodes) {
122+
newWord.append(node.word());
123+
}
124+
newLabel.setWord(newWord.toString());
127125
}
128126
if (newLabel.value() == null) {
129127
newLabel.setValue(newLabel.word());
130128
}
131129

132130
newLabel.setLemma(lemma);
133131
if (newLabel.lemma() == null) {
134-
String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null;
135-
newLabel.setLemma(newLemma);
132+
StringBuilder newLemma = new StringBuilder();
133+
for (IndexedWord node : nodes) {
134+
if (node.lemma() != null) {
135+
newLemma.append(node.lemma());
136+
}
137+
}
138+
lemma = newLemma.length() > 0 ? newLemma.toString() : null;
139+
newLabel.setLemma(lemma);
136140
}
141+
137142
// after() and before() return "" if null, so we need to use the CoreAnnotations directly
138143
if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) {
139-
newLabel.setAfter(right.after());
144+
newLabel.setAfter(nodes.get(nodes.size() - 1).after());
140145
}
141146
if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) {
142-
newLabel.setBefore(right.before());
147+
newLabel.setBefore(nodes.get(0).before());
143148
}
144149

145150
// find the head, and replace all the existing annotations on the head
146151
// with the new annotations (including word and lemma)
147152
// from the newly built CoreLabel
153+
// TODO: should avoid messing with empty nodes
154+
// doing extra nodes would be good
148155
for (IndexedWord vertex : sg.vertexSet()) {
149156
if (vertex.index() == head.index()) {
150157
for (Class key : newLabel.keySet()) {
@@ -159,13 +166,19 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
159166
// TODO: super fancy would be implementing iterator.remove()
160167
// on the Set returned by the SemanticGraph
161168
for (IndexedWord vertex : sg.vertexListSorted()) {
162-
if (vertex.index() == dep.index()) {
169+
// TODO: again, don't delete empty nodes
170+
if (depIndices.contains(vertex.index())) {
163171
sg.removeVertex(vertex);
164172
}
165173
}
166174

167175
// reindex everyone
168-
SsurgeonUtils.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false);
176+
List<Integer> sortedIndices = new ArrayList<>(depIndices);
177+
Collections.sort(sortedIndices);
178+
Collections.reverse(sortedIndices);
179+
for (Integer depIndex : sortedIndices) {
180+
SsurgeonUtils.moveNodes(sg, sm, x -> (x >= depIndex), x -> x-1, false);
181+
}
169182

170183
return true;
171184
}

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@
8787
* <li> {@code combineMWT -node node -word word}
8888
* <li> {@code splitWord -node node -headIndex idx -reln depType -regex w1 -regex w2 ...}
8989
* <li> {@code setRoots n1 (n2 n3 ...)}
90-
* <li> {@code mergeNodes n1 n2}
90+
* <li> {@code mergeNodes -node n1 -node n2 ...}
9191
* <li> {@code killAllIncomingEdges -node node}
9292
* <li> {@code delete -node node}
9393
* <li> {@code deleteLeaf -node node}

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,6 +1133,41 @@ public void readXMLMergeNodes() {
11331133
assertEquals(".prof", prof.lemma());
11341134
}
11351135

1136+
/**
1137+
* Test merging three nodes at once
1138+
*<br>
1139+
* The indices should be changed as well
1140+
*/
1141+
@Test
1142+
public void readXMLMergeNodesMultiple() {
1143+
Ssurgeon inst = Ssurgeon.inst();
1144+
1145+
// Test the head word being the first word
1146+
String merge = String.join(newline,
1147+
"<ssurgeon-pattern-list>",
1148+
" <ssurgeon-pattern>",
1149+
" <uid>38</uid>",
1150+
" <notes>Merge three nodes that should not have been split</notes>",
1151+
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {}) >nmod ({}=nmod !> {})") + "</semgrex>",
1152+
" <edit-list>mergeNodes -node source -node punct -node nmod</edit-list>",
1153+
" </ssurgeon-pattern>",
1154+
"</ssurgeon-pattern-list>");
1155+
List<SsurgeonPattern> patterns = inst.readFromString(merge);
1156+
assertEquals(patterns.size(), 1);
1157+
SsurgeonPattern mergeSsurgeon = patterns.get(0);
1158+
1159+
// nodes 3, 4, 5 are in order in the same unit, so we should be able to merge them
1160+
SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1161+
SemanticGraph expected = SemanticGraph.valueOf("[fare-5 aux> potrebbe-4 nsubj> [prof.Fotticchia-3 det> Il-2] obj> [gag-7 det> una-6] obl> [situazione-10 case> su-8 det> la-9]]", Language.UniversalEnglish);
1162+
sg.getNodeByIndexSafe(3).setLemma("prof");
1163+
sg.getNodeByIndexSafe(4).setLemma(".");
1164+
sg.getNodeByIndexSafe(5).setLemma("Fotticchia");
1165+
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
1166+
assertEquals(expected, newSG);
1167+
IndexedWord prof = sg.getNodeByIndexSafe(3);
1168+
assertEquals("prof.Fotticchia", prof.lemma());
1169+
}
1170+
11361171
/**
11371172
* A simple test sent to us from a user (unbelievably, ssurgeon apparently has users)
11381173
*/

0 commit comments

Comments
 (0)