From aeba700f88c3b3d8bf26d8a564329277f30365b5 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 19 Feb 2025 01:19:56 -0800 Subject: [PATCH] Add the capacity to negate attributes in a node, rather than requiring negative lookahead regex --- .../nlp/semgraph/semgrex/Attribute.java | 19 +++++ .../nlp/semgraph/semgrex/NodeAttributes.java | 25 +++++-- .../nlp/semgraph/semgrex/NodePattern.java | 62 +++++++--------- .../nlp/semgraph/semgrex/SemgrexParser.java | 73 ++++++++++++------- .../nlp/semgraph/semgrex/SemgrexParser.jj | 12 ++- .../semgrex/SemgrexParserConstants.java | 1 + .../semgrex/SemgrexParserTokenManager.java | 39 ++++++++-- .../nlp/semgraph/semgrex/SemgrexTest.java | 7 ++ 8 files changed, 158 insertions(+), 80 deletions(-) create mode 100644 src/edu/stanford/nlp/semgraph/semgrex/Attribute.java diff --git a/src/edu/stanford/nlp/semgraph/semgrex/Attribute.java b/src/edu/stanford/nlp/semgraph/semgrex/Attribute.java new file mode 100644 index 0000000000..4bdf67c86f --- /dev/null +++ b/src/edu/stanford/nlp/semgraph/semgrex/Attribute.java @@ -0,0 +1,19 @@ +package edu.stanford.nlp.semgraph.semgrex; + +import java.io.Serializable; + +public class Attribute implements Serializable { + final String key; + final Object cased; + final Object caseless; + final boolean negated; + + Attribute(String key, Object cased, Object caseless, boolean negated) { + this.key = key; + this.cased = cased; + this.caseless = caseless; + this.negated = negated; + } + + private static final long serialVersionUID = 973567614155612487L; +} diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java index b6f6c50235..cb0e4251d6 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java @@ -1,7 +1,11 @@ package edu.stanford.nlp.semgraph.semgrex; -import java.util.LinkedHashMap; -import java.util.Map; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import edu.stanford.nlp.util.Triple; /** * Stores attributes for a Semgrex NodePattern. @@ -18,12 +22,14 @@ public class NodeAttributes { private boolean root; private boolean empty; - private Map attributes; + private List> attributes; + private Set positiveAttributes; public NodeAttributes() { root = false; empty = false; - attributes = new LinkedHashMap<>(); + attributes = new ArrayList<>(); + positiveAttributes = new HashSet<>(); } public void setRoot(boolean root) { @@ -42,14 +48,17 @@ public boolean empty() { return empty; } - public void setAttribute(String key, String value) { - if (attributes.containsKey(key)) { + public void setAttribute(String key, String value, boolean negated) { + if (positiveAttributes.contains(key)) { throw new SemgrexParseException("Duplicate attribute " + key + " found in semgrex expression"); } - attributes.put(key, value); + if (!negated) { + positiveAttributes.add(key); + } + attributes.add(new Triple(key, value, negated)); } - public Map attributes() { + public List> attributes() { return attributes; } } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java index 90ef8895ba..3067b1c231 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java @@ -3,7 +3,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; @@ -13,6 +12,7 @@ import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Triple; import edu.stanford.nlp.util.logging.Redwood; public class NodePattern extends SemgrexPattern { @@ -31,7 +31,7 @@ public class NodePattern extends SemgrexPattern { * value. * Otherwise, the type will be a Pattern, and you must use Pattern.matches(). */ - private final Map> attributes; + private final List attributes; private final boolean isRoot; private final boolean isLink; private final boolean isEmpty; @@ -43,7 +43,7 @@ public class NodePattern extends SemgrexPattern { private List> variableGroups; public NodePattern(GraphRelation r, boolean negDesc, - Map attrs, + List> attrs, boolean root, boolean empty, boolean isLink, String name) { this(r, negDesc, attrs, root, empty, isLink, name, new ArrayList<>(0)); @@ -51,7 +51,7 @@ public NodePattern(GraphRelation r, boolean negDesc, // TODO: there is no capacity for named variable groups in the parser right now public NodePattern(GraphRelation r, boolean negDesc, - Map attrs, + List> attrs, boolean root, boolean empty, boolean isLink, String name, List> variableGroups) { this.reln = r; @@ -59,17 +59,18 @@ public NodePattern(GraphRelation r, boolean negDesc, this.isLink = isLink; // order the attributes so that the pattern stays the same when // printing a compiled pattern - attributes = new LinkedHashMap<>(); + attributes = new ArrayList<>(); descString = "{"; - for (Map.Entry entry : attrs.entrySet()) { + for (Triple entry : attrs) { if (!descString.equals("{")) descString += ";"; - String key = entry.getKey(); - String value = entry.getValue(); + String key = entry.first(); + String value = entry.second(); + boolean negated = entry.third(); // Add the attributes for this key if (value.equals("__")) { - attributes.put(key, Pair.makePair(true, true)); + attributes.add(new Attribute(key, true, true, negated)); } else if (value.matches("/.*/")) { boolean isRegexp = false; for (int i = 1; i < value.length() - 1; ++i) { @@ -81,34 +82,24 @@ public NodePattern(GraphRelation r, boolean negDesc, } String patternContent = value.substring(1, value.length() - 1); if (isRegexp) { - attributes.put(key, Pair.makePair( - Pattern.compile(patternContent), - Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE)) - ); + attributes.add(new Attribute(key, + Pattern.compile(patternContent), + Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE), + negated)); } else { - attributes.put(key, Pair.makePair(patternContent, patternContent)); + attributes.add(new Attribute(key, patternContent, patternContent, negated)); } } else { // raw description - attributes.put(key, Pair.makePair(value, value)); + attributes.add(new Attribute(key, value, value, negated)); } - - -// if (value.equals("__")) { -// attributes.put(key, Pair.makePair(Pattern.compile(".*"), Pattern.compile(".*", Pattern.CASE_INSENSITIVE))); -// } else if (value.matches("/.*/")) { -// attributes.put(key, Pair.makePair( -// Pattern.compile(value.substring(1, value.length() - 1)), -// Pattern.compile(value.substring(1, value.length() - 1), Pattern.CASE_INSENSITIVE)) -// ); -// } else { // raw description -// attributes.put(key, Pair.makePair( -// Pattern.compile("^(" + value + ")$"), -// Pattern.compile("^(" + value + ")$", Pattern.CASE_INSENSITIVE)) -// ); -// } - descString += (key + ':' + value); + if (negated) { + descString += (key + "!:" + value); + } else { + descString += (key + ':' + value); + } } + if (root) { if (!descString.equals("{")) descString += ";"; @@ -145,8 +136,8 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i return (negDesc ? !node.equals(IndexedWord.NO_WORD) : node.equals(IndexedWord.NO_WORD)); // log.info("Attributes are: " + attributes); - for (Map.Entry> attr : attributes.entrySet()) { - String key = attr.getKey(); + for (Attribute attr : attributes) { + String key = attr.key; // System.out.println(key); String nodeValue; // if (key.equals("idx")) @@ -167,7 +158,7 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i return negDesc; // Get the node pattern - Object toMatch = ignoreCase ? attr.getValue().second : attr.getValue().first; + Object toMatch = ignoreCase ? attr.caseless : attr.cased; boolean matches; if (toMatch instanceof Boolean) { matches = ((Boolean) toMatch); @@ -182,6 +173,9 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i } else { throw new IllegalStateException("Unknown matcher type: " + toMatch + " (of class + " + toMatch.getClass() + ")"); } + if (attr.negated) { + matches = !matches; + } if (!matches) { // System.out.println("doesn't match"); diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java index c4339ab097..80e46ef9ac 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java @@ -65,7 +65,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 11: case 15: case 17: - case 22:{ + case 23:{ node = SubNode(GraphRelation.ROOT); children.add(node); label_1: @@ -135,7 +135,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th } case 15: case 17: - case 22:{ + case 23:{ result = ModNode(r); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: @@ -397,7 +397,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 15: case 17: - case 22:{ + case 23:{ node = ModNode(reln); break; } @@ -454,7 +454,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 14: case 15: case 17: - case 22:{ + case 23:{ ; break; } @@ -485,7 +485,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean startUnderNeg; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 17: - case 22:{ + case 23:{ child = Child(r); break; } @@ -512,7 +512,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th child = NodeDisj(r); break; } - case 22:{ + case 23:{ child = Description(r); break; } @@ -527,10 +527,24 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th final public void AddAttribute(NodeAttributes attributes) throws ParseException {Token attr = null; Token value = null; + Token attrType = null; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER:{ attr = jj_consume_token(IDENTIFIER); - jj_consume_token(10); + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 10:{ + attrType = jj_consume_token(10); + break; + } + case 22:{ + attrType = jj_consume_token(22); + break; + } + default: + jj_la1[23] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER:{ value = jj_consume_token(IDENTIFIER); @@ -541,11 +555,14 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[23] = jj_gen; + jj_la1[24] = jj_gen; jj_consume_token(-1); throw new ParseException(); } -if (attr != null && value != null) attributes.setAttribute(attr.image, value.image); +if (attr != null && value != null) { + boolean negated = attrType.image.equals("!:"); + attributes.setAttribute(attr.image, value.image, negated); + } break; } case ROOT:{ @@ -559,7 +576,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[24] = jj_gen; + jj_la1[25] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -569,7 +586,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean link = false; NodeAttributes attributes = new NodeAttributes(); NodePattern pat; - jj_consume_token(22); + jj_consume_token(23); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER: case EMPTY: @@ -578,24 +595,24 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th label_6: while (true) { switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { - case 23:{ + case 24:{ ; break; } default: - jj_la1[25] = jj_gen; + jj_la1[26] = jj_gen; break label_6; } - jj_consume_token(23); + jj_consume_token(24); AddAttribute(attributes); } break; } default: - jj_la1[26] = jj_gen; + jj_la1[27] = jj_gen; ; } - jj_consume_token(24); + jj_consume_token(25); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 21:{ jj_consume_token(21); @@ -612,7 +629,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[27] = jj_gen; + jj_la1[28] = jj_gen; ; } pat = new NodePattern(r, underNodeNegation, attributes.attributes(), attributes.root(), attributes.empty(), link, name != null ? name.image : null); @@ -629,13 +646,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th public Token jj_nt; private int jj_ntk; private int jj_gen; - final private int[] jj_la1 = new int[28]; + final private int[] jj_la1 = new int[29]; static private int[] jj_la1_0; static { jj_la1_init_0(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x400,0x428808,0x3801c,0x3801c,0x428800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x428800,0x2000,0x42c000,0x4000,0x428000,0x420000,0x110,0xd0,0x800000,0xd0,0x200000,}; + jj_la1_0 = new int[] {0x400,0x828808,0x3801c,0x3801c,0x828800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x828800,0x2000,0x82c000,0x4000,0x828000,0x820000,0x400400,0x110,0xd0,0x1000000,0xd0,0x200000,}; } /** Constructor with InputStream. */ @@ -649,7 +666,7 @@ public SemgrexParser(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 28; i++) jj_la1[i] = -1; + for (int i = 0; i < 29; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -663,7 +680,7 @@ public void ReInit(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 28; i++) jj_la1[i] = -1; + for (int i = 0; i < 29; i++) jj_la1[i] = -1; } /** Constructor. */ @@ -673,7 +690,7 @@ public SemgrexParser(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 28; i++) jj_la1[i] = -1; + for (int i = 0; i < 29; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -691,7 +708,7 @@ public void ReInit(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 28; i++) jj_la1[i] = -1; + for (int i = 0; i < 29; i++) jj_la1[i] = -1; } /** Constructor with generated Token Manager. */ @@ -700,7 +717,7 @@ public SemgrexParser(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 28; i++) jj_la1[i] = -1; + for (int i = 0; i < 29; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -709,7 +726,7 @@ public void ReInit(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 28; i++) jj_la1[i] = -1; + for (int i = 0; i < 29; i++) jj_la1[i] = -1; } private Token jj_consume_token(int kind) throws ParseException { @@ -760,12 +777,12 @@ private int jj_ntk_f() { /** Generate ParseException. */ public ParseException generateParseException() { jj_expentries.clear(); - boolean[] la1tokens = new boolean[25]; + boolean[] la1tokens = new boolean[26]; if (jj_kind >= 0) { la1tokens[jj_kind] = true; jj_kind = -1; } - for (int i = 0; i < 28; i++) { + for (int i = 0; i < 29; i++) { if (jj_la1[i] == jj_gen) { for (int j = 0; j < 32; j++) { if ((jj_la1_0[i] & (1< ":" (value = | value = ) ) - { if (attr != null && value != null) attributes.setAttribute(attr.image, value.image); } ) + ((attr = (attrType = ":" | attrType = "!:") (value = | value = ) ) + { + if (attr != null && value != null) { + boolean negated = attrType.image.equals("!:"); + attributes.setAttribute(attr.image, value.image, negated); + } + }) | ( attr = { attributes.setRoot(true); } ) | @@ -288,7 +294,7 @@ NodePattern Description(GraphRelation r) : { NodePattern pat; } { - ( "{" ( AddAttribute(attributes) + ( "{" ( AddAttribute(attributes) (";" AddAttribute(attributes))* )? "}" (( ("=" { link = true; }) name = ) diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java index a4b47b3ea5..7a55891f0c 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java @@ -54,6 +54,7 @@ interface SemgrexParserConstants { "\",\"", "\"~\"", "\"=\"", + "\"!:\"", "\"{\"", "\";\"", "\"}\"", diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java index 7a741792e9..e3fe4d9933 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java @@ -21,6 +21,12 @@ class SemgrexParserTokenManager implements SemgrexParserConstants { private final int jjStopStringLiteralDfa_0(int pos, long active0){ switch (pos) { + case 0: + if ((active0 & 0x200000L) != 0L) + return 2; + if ((active0 & 0x80L) != 0L) + return 25; + return -1; default : return -1; } @@ -40,7 +46,8 @@ private int jjMoveStringLiteralDfa0_0(){ case 10: return jjStopAtPos(0, 9); case 33: - return jjStopAtPos(0, 15); + jjmatchedKind = 15; + return jjMoveStringLiteralDfa1_0(0x400000L); case 35: return jjStopAtPos(0, 6); case 36: @@ -56,7 +63,7 @@ private int jjMoveStringLiteralDfa0_0(){ case 58: return jjStopAtPos(0, 10); case 59: - return jjStopAtPos(0, 23); + return jjStopAtPos(0, 24); case 61: return jjStartNfaWithStates_0(0, 21, 2); case 63: @@ -68,17 +75,34 @@ private int jjMoveStringLiteralDfa0_0(){ case 93: return jjStopAtPos(0, 18); case 123: - return jjStopAtPos(0, 22); + return jjStopAtPos(0, 23); case 124: return jjStopAtPos(0, 13); case 125: - return jjStopAtPos(0, 24); + return jjStopAtPos(0, 25); case 126: return jjStopAtPos(0, 20); default : return jjMoveNfa_0(1, 0); } } +private int jjMoveStringLiteralDfa1_0(long active0){ + try { curChar = input_stream.readChar(); } + catch(java.io.IOException e) { + jjStopStringLiteralDfa_0(0, active0); + return 1; + } + switch(curChar) + { + case 58: + if ((active0 & 0x400000L) != 0L) + return jjStopAtPos(1, 22); + break; + default : + break; + } + return jjStartNfa_0(0, active0); +} private int jjStartNfaWithStates_0(int pos, int kind, int state) { jjmatchedKind = kind; @@ -333,8 +357,8 @@ else if (curChar < 128) /** Token literal values. */ public static final String[] jjstrLiteralImages = { "", null, null, "\100", null, null, "\43", "\44", null, "\12", "\72", "\50", -"\51", "\174", "\46", "\41", "\77", "\133", "\135", "\54", "\176", "\75", "\173", -"\73", "\175", }; +"\51", "\174", "\46", "\41", "\77", "\133", "\135", "\54", "\176", "\75", "\41\72", +"\173", "\73", "\175", }; protected Token jjFillToken() { final Token t; @@ -571,9 +595,10 @@ public void SwitchTo(int lexState) /** Lex State array. */ public static final int[] jjnewLexState = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, }; static final long[] jjtoToken = { - 0x1fffffdL, + 0x3fffffdL, }; static final long[] jjtoSkip = { 0x2L, diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java index fb9c938d85..3c2b785f34 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java @@ -215,6 +215,13 @@ public void testRegex() { "Bill"); } + public void testNegatedRegex() { + runTest("{word!:/Bill/}", "[ate subj>Bill obj>[muffins compound>blueberry]]", + "ate", "blueberry", "muffins"); + runTest("{word!:/.*i.*/}", "[ate subj>Bill obj>[muffins compound>blueberry]]", + "ate", "blueberry"); + } + public void testReferencedRegex() { runTest("{word:/Bill/}", "[ate subj>Bill obj>[bill det>the]]", "Bill");