When using Ssurgeon to split a word, if we already know exactly how the word should be split, we can now provide those pieces directly with -exact rather than trying to regex our way to the right solution.

AngledLuffa · AngledLuffa · commit 77f72914ac06 · 2025-04-15T13:22:10.000-07:00
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SplitWord.java
@@ -40,25 +40,32 @@ public class SplitWord extends SsurgeonEdit {
 
   final String node;
   final List<Pattern> nodeRegex;
+  final List<String> exactPieces;
   final int headIndex;
   final GrammaticalRelation relation;
   final Map<Integer, String> nodeNames;
 
-  public SplitWord(String node, List<String> nodeRegex, Integer headIndex, GrammaticalRelation relation, String nodeNames) {
+  public SplitWord(String node, List<String> nodePieces, Integer headIndex, GrammaticalRelation relation, String nodeNames, boolean exactSplit) {
     if (node == null) {
       throw new SsurgeonParseException("SplitWord expected -node with the name of the matched node to split");
     }
     this.node = node;
 
-    if (nodeRegex == null || nodeRegex.size() == 0) {
-      throw new SsurgeonParseException("SplitWord expected -regex with regex to determine which pieces to split the word into");
+    if (nodePieces == null || nodePieces.size() == 0) {
+      throw new SsurgeonParseException("SplitWord expected -exact or -regex with regex to determine which pieces to split the word into");
     }
-    if (nodeRegex.size() == 1) {
-      throw new SsurgeonParseException("SplitWord expected at least two -regex");
+    if (nodePieces.size() == 1) {
+      throw new SsurgeonParseException("SplitWord expected at least two -exact or -regex");
     }
-    this.nodeRegex = new ArrayList<>();
-    for (int i = 0; i < nodeRegex.size(); ++i) {
-      this.nodeRegex.add(Pattern.compile(nodeRegex.get(i)));
+    if (exactSplit) {
+      this.exactPieces = new ArrayList<>(nodePieces);
+      this.nodeRegex = null;
+    } else {
+      this.nodeRegex = new ArrayList<>();
+      for (int i = 0; i < nodePieces.size(); ++i) {
+        this.nodeRegex.add(Pattern.compile(nodePieces.get(i)));
+      }
+      this.exactPieces = null;
     }
 
     if (headIndex == null) {
@@ -80,7 +87,7 @@ public SplitWord(String node, List<String> nodeRegex, Integer headIndex, Grammat
           throw new SsurgeonParseException("SplitWord got a -name parameter which did not have a number for one of the names.  Should look like 0=foo,1=bar");
         }
         int idx = Integer.valueOf(pieces[0]);
-        if (idx >= this.nodeRegex.size()) {
+        if (idx >= nodePieces.size()) {
           throw new SsurgeonParseException("SplitWord got an index in -name which was larger than the largest possible split piece, " + idx + " (this is 0-indexed)");
         }
         this.nodeNames.put(idx, pieces[1]);
@@ -96,8 +103,14 @@ public String toEditString() {
     buf.write(LABEL);
     buf.write("\t");
     buf.write("-node " + node + "\t");
-    for (Pattern regex : nodeRegex) {
-      buf.write("-regex " + regex + "\t");
+    if (nodeRegex != null) {
+      for (Pattern regex : nodeRegex) {
+        buf.write("-regex " + regex + "\t");
+      }
+    } else {
+      for (String piece : exactPieces) {
+        buf.write("-exact " + piece + "\t");
+      }
     }
     buf.write("-reln " + relation.toString() + "\t");
     buf.write("-headIndex " + headIndex);
@@ -113,22 +126,27 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
     //
     // each new word created will be the concatenation of all of the
     // matching groups from this pattern
-    List<String> words = new ArrayList<>();
-    for (int i = 0; i < nodeRegex.size(); ++i) {
-      Matcher regexMatcher = nodeRegex.get(i).matcher(origWord);
-      if (!regexMatcher.matches()) {
-        return false;
-      }
+    List<String> words;
+    if (exactPieces != null) {
+      words = new ArrayList<>(exactPieces);
+    } else {
+      words = new ArrayList<>();
+      for (int i = 0; i < nodeRegex.size(); ++i) {
+        Matcher regexMatcher = nodeRegex.get(i).matcher(origWord);
+        if (!regexMatcher.matches()) {
+          return false;
+        }
 
-      StringBuilder newWordBuilder = new StringBuilder();
-      for (int j = 0; j < regexMatcher.groupCount(); ++j) {
-        newWordBuilder.append(regexMatcher.group(j+1));
-      }
-      String newWord = newWordBuilder.toString();
-      if (newWord.length() == 0) {
-        return false;
+        StringBuilder newWordBuilder = new StringBuilder();
+        for (int j = 0; j < regexMatcher.groupCount(); ++j) {
+          newWordBuilder.append(regexMatcher.group(j+1));
+        }
+        String newWord = newWordBuilder.toString();
+        if (newWord.length() == 0) {
+          return false;
+        }
+        words.add(newWord);
       }
-      words.add(newWord);
     }
 
     int matchedIndex = matchedNode.index();
@@ -137,7 +155,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
 
     // move all words down by nodeRegex.size() - 1
     // then move the original word down by headIndex
-    SsurgeonUtils.moveNodes(sg, sm, x -> (x > matchedIndex), x -> x+nodeRegex.size() - 1, true);
+    SsurgeonUtils.moveNodes(sg, sm, x -> (x > matchedIndex), x -> x+words.size() - 1, true);
     // the head node has its word replaced, and its index & links need
     // to be rearranged, but none of the links are added or removed
     if (headIndex > 0) {
@@ -147,7 +165,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
     matchedNode.setWord(words.get(headIndex));
     matchedNode.setValue(words.get(headIndex));
 
-    for (int i = 0; i < nodeRegex.size(); ++i) {
+    // TODO: update SpaceAfter in a reasonable manner
+    for (int i = 0; i < words.size(); ++i) {
       if (i == headIndex) {
         if (nodeNames.containsKey(i)) {
           sm.putNode(nodeNames.get(i), matchedNode);
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -421,6 +421,7 @@ public Collection<SsurgeonWordlist> getResources() {
   public static final String EDGE_NAME_ARG = "-edge";
   public static final String NODENAME_ARG = "-node";
   public static final String REGEX_ARG = "-regex";
+  public static final String EXACT_ARG = "-exact";
   public static final String RELN_ARG = "-reln";
   public static final String NODE_PROTO_ARG = "-nodearg";
   public static final String WEIGHT_ARG = "-weight";
@@ -450,6 +451,8 @@ protected static class SsurgeonArgs {
 
     public List<String> regex = new ArrayList<>();
 
+    public List<String> exact = new ArrayList<>();
+
     // below are string representations of the intended values
     public String nodeString = null;
 
@@ -528,6 +531,9 @@ private static SsurgeonArgs parseArgsBox(String args, Map<String, String> additi
         case REGEX_ARG:
           argsBox.regex.add(argsValue);
           break;
+        case EXACT_ARG:
+          argsBox.exact.add(argsValue);
+          break;
         case NODE_PROTO_ARG:
           argsBox.nodeString = argsValue;
           break;
@@ -653,7 +659,14 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
         return new CombineMWT(argsBox.nodes, argsBox.annotations.get("word"));
       } else if (command.equalsIgnoreCase(SplitWord.LABEL)) {
         GrammaticalRelation reln = GrammaticalRelation.valueOf(language, argsBox.reln);
-        return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name);
+        if (argsBox.regex.size() > 0 && argsBox.exact.size() > 0) {
+          throw new SsurgeonParseException("Found both regex and exact in the splits for splitWord");
+        }
+        if (argsBox.regex.size() > 0) {
+          return new SplitWord(argsBox.nodes.get(0), argsBox.regex, argsBox.headIndex, reln, argsBox.name, false);
+        } else {
+          return new SplitWord(argsBox.nodes.get(0), argsBox.exact, argsBox.headIndex, reln, argsBox.name, true);
+        }
       } else if (command.equalsIgnoreCase(ReindexGraph.LABEL)) {
         return new ReindexGraph();
       }
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -2111,6 +2111,32 @@ public void readXMLSplitTwoWords() {
     assertEquals(newSg, expected);
   }
 
+  /**
+   * Test splitWord with -exact: "foobar" is split into the literal pieces "foo" and "bar" (no regex), with "foo" (headIndex 0) kept as the head
+   */
+  @Test
+  public void readXMLSplitTwoWordsExact() {
+    String doc = String.join(newline,
+                             "<ssurgeon-pattern-list>",
+                             "  <ssurgeon-pattern>",
+                             "    <uid>38</uid>",
+                             "    <notes>Test splitting a word into two pieces with the head at the start</notes>",
+                             "    <language>UniversalEnglish</language>",
+                             "    <semgrex>" + XMLUtils.escapeXML("{word:/foobar/}=split") + "</semgrex>",
+                             "    <edit-list>splitWord -node split -exact foo -exact bar -reln dep -headIndex 0</edit-list>",
+                             "  </ssurgeon-pattern>",
+                             "</ssurgeon-pattern-list>");
+    Ssurgeon inst = Ssurgeon.inst();
+    List<SsurgeonPattern> patterns = inst.readFromString(doc);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern pattern = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[example-3 det> the-1 amod> foobar-2]");
+    SemanticGraph newSg = pattern.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[example-4 det> the-1 amod> [foo-2 dep> bar-3]]");
+    assertEquals(newSg, expected);
+  }
+
   /**
    * Test splitWord, which should split a word into pieces based on regex matches, with the head at position 1
    */