MergeNodes can now handle multiple nodes at the same time

AngledLuffa · AngledLuffa · commit ebdac385c203 · 2025-05-31T12:22:38.000-07:00
Add a test of MergeNodes for multiple nodes
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java
@@ -2,8 +2,11 @@
 
 import java.io.StringWriter;
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;
 
 import edu.stanford.nlp.ling.CoreAnnotations;
@@ -26,14 +29,11 @@
  */
 public class MergeNodes extends SsurgeonEdit {
   public static final String LABEL = "mergeNodes";
-  final List<String> nodes;
+  final List<String> names;
   final Map<String, String> attributes;
 
-  public MergeNodes(List<String> nodes, Map<String, String> attributes) {
-    if (nodes.size() > 2) {
-      throw new SsurgeonParseException("Cannot support MergeNodes of size " + nodes.size() + " yet... please file an issue on github if you need this feature");
-    }
-    this.nodes = new ArrayList<>(nodes);
+  public MergeNodes(List<String> names, Map<String, String> attributes) {
+    this.names = new ArrayList<>(names);
     this.attributes = new TreeMap<>(attributes);
   }
 
@@ -44,7 +44,7 @@ public MergeNodes(List<String> nodes, Map<String, String> attributes) {
   public String toEditString() {
     StringWriter buf = new StringWriter();
     buf.write(LABEL);
-    for (String name : nodes) {
+    for (String name : names) {
       buf.write("\t");
       buf.write(Ssurgeon.NODENAME_ARG + " " + name);
     }
@@ -61,90 +61,97 @@ public String toEditString() {
   }
 
   /**
-   * If the two named nodes are next to each other, and the edges of
-   * the graph allow for it, squish the two words into one word
+   * If the named nodes are next to each other, and the edges of
+   * the graph allow for it, squish those words into one word
    */
   @Override
   public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
-    String name1 = nodes.get(0);
-    String name2 = nodes.get(1);
-
-    IndexedWord node1 = sm.getNode(name1);
-    IndexedWord node2 = sm.getNode(name2);
-
-    if (node1 == null || node2 == null) {
-      return false;
-    }
-
-    List<SemanticGraphEdge> n1_to_n2 = sg.getAllEdges(node1, node2);
-    List<SemanticGraphEdge> n2_to_n1 = sg.getAllEdges(node2, node1);
-    if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) {
-      return false;
-    }
-
-    // TODO: what about the case where the dep is or has copies?
-    final IndexedWord head;
-    final IndexedWord dep;
-
-    if (n1_to_n2.size() > 0) {
-      head = node1;
-      dep = node2;
-    } else {
-      head = node2;
-      dep = node1;
-    }
-
-    // If the dep has any edges that aren't between dep & head, abort
-    // TODO: we could probably make it adjust edges with "dep" as source, instead
-    for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) {
-      if (e.getTarget() != head) {
+    List<IndexedWord> nodes = new ArrayList<>();
+    for (String name : names) {
+      IndexedWord node = sm.getNode(name);
+      if (node == null) {
         return false;
       }
-    }
-    for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) {
-      if (e.getSource() != head) {
-        return false;
+      nodes.add(node);
+    }
+    Collections.sort(nodes);
+
+    IndexedWord head = null;
+    for (IndexedWord candidate : nodes) {
+      if (sg.hasChildren(candidate)) {
+        // if multiple nodes have children inside the graph,
+        // perhaps we could merge them all,
+        // but the easiest thing to do is just abort
+        // TODO: an alternate approach would be to look for nodes with a head
+        // outside the nodes in the phrase to merge
+        if (head != null) {
+          return false;
+        }
+        head = candidate;
       }
     }
 
-    IndexedWord left;
-    IndexedWord right;
-    if (node1.index() < node2.index()) {
-      left = node1;
-      right = node2;
-    } else {
-      left = node2;
-      right = node1;
+    Set<Integer> depIndices = new HashSet<Integer>();
+    for (IndexedWord other : nodes) {
+      if (other == head) {
+        continue;
+      }
+      Set<IndexedWord> parents = sg.getParents(other);
+      // this shouldn't happen
+      if (parents.size() == 0) {
+        return false;
+      }
+      // iterate instead of just do the first in case
+      // this one day is doing an graph with extra dependencies
+      for (IndexedWord parent : parents) {
+        if (parent != head) {
+          return false;
+        }
+      }
+      depIndices.add(other.index());
     }
 
     CoreLabel newLabel = AddDep.fromCheapStrings(attributes);
     // CoreLabel.setWord wipes out the lemma for some reason
     // we may eventually change that, but for now, we compensate for that here
     String lemma = newLabel.lemma();
+
     if (newLabel.word() == null) {
-      String newWord = left.word() + right.word();
-      newLabel.setWord(newWord);
+      StringBuilder newWord = new StringBuilder();
+      for (IndexedWord node : nodes) {
+        newWord.append(node.word());
+      }
+      newLabel.setWord(newWord.toString());
     }
     if (newLabel.value() == null) {
       newLabel.setValue(newLabel.word());
     }
 
     newLabel.setLemma(lemma);
     if (newLabel.lemma() == null) {
-      String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null;
-      newLabel.setLemma(newLemma);
+      StringBuilder newLemma = new StringBuilder();
+      for (IndexedWord node : nodes) {
+        if (node.lemma() != null) {
+          newLemma.append(node.lemma());
+        }
+      }
+      lemma = newLemma.length() > 0 ? newLemma.toString() : null;
+      newLabel.setLemma(lemma);
     }
+
     // after() and before() return "" if null, so we need to use the CoreAnnotations directly
     if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) {
-      newLabel.setAfter(right.after());
+      newLabel.setAfter(nodes.get(nodes.size() - 1).after());
     }
     if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) {
-      newLabel.setBefore(right.before());
+      newLabel.setBefore(nodes.get(0).before());
     }
 
     // find the head, and replace all the existing annotations on the head
     // with the new annotations (including word and lemma)
     // from the newly built CoreLabel
+    // TODO: should avoid messing with empty nodes
+    // doing extra nodes would be good
     for (IndexedWord vertex : sg.vertexSet()) {
       if (vertex.index() == head.index()) {
         for (Class key : newLabel.keySet()) {
@@ -159,13 +166,19 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
     // TODO: super fancy would be implementing iterator.remove()
     // on the Set returned by the SemanticGraph
     for (IndexedWord vertex : sg.vertexListSorted()) {
-      if (vertex.index() == dep.index()) {
+      // TODO: again, don't delete empty nodes
+      if (depIndices.contains(vertex.index())) {
         sg.removeVertex(vertex);
       }
     }
 
     // reindex everyone
-    SsurgeonUtils.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false);
+    List<Integer> sortedIndices = new ArrayList<>(depIndices);
+    Collections.sort(sortedIndices);
+    Collections.reverse(sortedIndices);
+    for (Integer depIndex : sortedIndices) {
+      SsurgeonUtils.moveNodes(sg, sm, x -> (x >= depIndex), x -> x-1, false);
+    }
 
     return true;
   }
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -87,7 +87,7 @@
  * <li> {@code combineMWT -node node -word word}
  * <li> {@code splitWord -node node -headIndex idx -reln depType -regex w1 -regex w2 ...}
  * <li> {@code setRoots n1 (n2 n3 ...)}
- * <li> {@code mergeNodes n1 n2}
+ * <li> {@code mergeNodes -node n1 -node n2 ...}
  * <li> {@code killAllIncomingEdges -node node}
  * <li> {@code delete -node node}
  * <li> {@code deleteLeaf -node node}
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -1133,6 +1133,41 @@ public void readXMLMergeNodes() {
     assertEquals(".prof", prof.lemma());
   }
 
+  /**
+   * Test merging three nodes at once
+   *<br>
+   * The indices should be changed as well
+   */
+  @Test
+  public void readXMLMergeNodesMultiple() {
+    Ssurgeon inst = Ssurgeon.inst();
+
+    // Test the head word being the first word
+    String merge = String.join(newline,
+                               "<ssurgeon-pattern-list>",
+                               "  <ssurgeon-pattern>",
+                               "    <uid>38</uid>",
+                               "    <notes>Merge three nodes that should not have been split</notes>",
+                               "    <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {}) >nmod ({}=nmod !> {})") + "</semgrex>",
+                               "    <edit-list>mergeNodes -node source -node punct -node nmod</edit-list>",
+                               "  </ssurgeon-pattern>",
+                               "</ssurgeon-pattern-list>");
+    List<SsurgeonPattern> patterns = inst.readFromString(merge);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern mergeSsurgeon = patterns.get(0);
+
+    // nodes 3, 4, 5 are in order in the same unit, so we should be able to merge them
+    SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    SemanticGraph expected = SemanticGraph.valueOf("[fare-5 aux> potrebbe-4 nsubj> [prof.Fotticchia-3 det> Il-2] obj> [gag-7 det> una-6] obl> [situazione-10 case> su-8 det> la-9]]", Language.UniversalEnglish);
+    sg.getNodeByIndexSafe(3).setLemma("prof");
+    sg.getNodeByIndexSafe(4).setLemma(".");
+    sg.getNodeByIndexSafe(5).setLemma("Fotticchia");
+    SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
+    assertEquals(expected, newSG);
+    IndexedWord prof = sg.getNodeByIndexSafe(3);
+    assertEquals("prof.Fotticchia", prof.lemma());
+  }
+
   /**
    * A simple test sent to us from a user (unbelievably, ssurgeon apparently has users)
    */