From dca3845169fdbc0c29fe25a01edfe0fbf4c49fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20K=C3=B6rner?= Date: Tue, 1 Jul 2025 18:43:29 +0200 Subject: [PATCH 1/4] Add start/stop offsets in query string to query nodes --- src/main/java/org/z3950/zing/cql/CQLNode.java | 17 +++++++- .../java/org/z3950/zing/cql/CQLParser.java | 40 +++++++++++++++++-- .../java/org/z3950/zing/cql/Modifier.java | 15 +++++++ .../java/org/z3950/zing/cql/ModifierSet.java | 6 ++- 4 files changed, 72 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/z3950/zing/cql/CQLNode.java b/src/main/java/org/z3950/zing/cql/CQLNode.java index b71ba1b..e59c5f6 100644 --- a/src/main/java/org/z3950/zing/cql/CQLNode.java +++ b/src/main/java/org/z3950/zing/cql/CQLNode.java @@ -10,7 +10,9 @@ * */ public abstract class CQLNode { - + + private int start = -1, stop = -1; + public abstract void traverse(CQLNodeVisitor visitor); /** @@ -25,6 +27,19 @@ public String getResultSetName() { return null; } + public int getStart() { + return start; + } + + public int getStop() { + return stop; + } + + protected void setStartStop(int start, int stop) { + this.start = start; + this.stop = stop; + } + /** * Translates a parse-tree into an XCQL document. 
* @return diff --git a/src/main/java/org/z3950/zing/cql/CQLParser.java b/src/main/java/org/z3950/zing/cql/CQLParser.java index 93c7c10..fc9aebc 100644 --- a/src/main/java/org/z3950/zing/cql/CQLParser.java +++ b/src/main/java/org/z3950/zing/cql/CQLParser.java @@ -134,13 +134,18 @@ private CQLNode parseTopLevelPrefixes(String index, CQLRelation relation) CQLNode node = parseQuery(index, relation); if ((compat == V1POINT2 || compat == V1POINT1SORT) && lexer.what() == CQLTokenizer.TT_SORTBY) { + int start = lexer.pos() - lexer.value().length(), stop = -1; match(lexer.what()); debug("sortspec"); CQLSortNode sortnode = new CQLSortNode(node); while (lexer.what() != CQLTokenizer.TT_EOF) { + stop = lexer.pos(); String sortindex = matchSymbol("sort index"); ModifierSet ms = gatherModifiers(sortindex); + if (ms.getModifiers().size() > 0) { + stop = ms.getModifiers().get(ms.getModifiers().size() -1).getStop(); + } sortnode.addSortIndex(ms); } @@ -148,6 +153,8 @@ private CQLNode parseTopLevelPrefixes(String index, CQLRelation relation) throw new CQLParseException("no sort keys", lexer.pos()); } + sortnode.setStartStop(start, stop); + node = sortnode; } @@ -171,10 +178,13 @@ private CQLNode parseQuery(String index, CQLRelation relation) match(type); ModifierSet ms = gatherModifiers(val); CQLNode term2 = parseTerm(index, relation); + int start = term.getStart(); + int stop = term2.getStop(); term = ((type == CQLTokenizer.TT_AND) ? new CQLAndNode(term, term2, ms) : (type == CQLTokenizer.TT_OR) ? new CQLOrNode (term, term2, ms) : (type == CQLTokenizer.TT_NOT) ? 
new CQLNotNode(term, term2, ms) : new CQLProxNode(term, term2, ms)); + term.setStartStop(start, stop); } else { throw new CQLParseException("expected boolean, got " + lexer.render(), lexer.pos()); @@ -191,7 +201,9 @@ private ModifierSet gatherModifiers(String base) ModifierSet ms = new ModifierSet(base); while (lexer.what() == '/') { + int start = lexer.pos() - 1; match('/'); + int stop = lexer.pos(); if (lexer.what() != CQLTokenizer.TT_WORD) throw new CQLParseException("expected modifier, " + "got " + lexer.render(), @@ -200,13 +212,14 @@ private ModifierSet gatherModifiers(String base) match(lexer.what()); if (!isSymbolicRelation()) { // It's a simple modifier consisting of type only - ms.addModifier(type); + ms.addModifier(type).setStartStop(start, stop); } else { // It's a complex modifier of the form type=value String comparision = lexer.render(lexer.what(), false); match(lexer.what()); + stop = lexer.pos(); String value = matchSymbol("modifier value"); - ms.addModifier(type, comparision, value); + ms.addModifier(type, comparision, value).setStartStop(start, stop); } } @@ -217,20 +230,26 @@ private CQLNode parseTerm(String index, CQLRelation relation) throws CQLParseException, IOException { debug("in parseTerm()"); + int termStart = lexer.pos() - lexer.value().length()- ((lexer.what() == CQLTokenizer.TT_STRING) ? 
2 : 0); + int termStop = lexer.pos(); String first; StringBuilder all; while (true) { if (lexer.what() == '(') { debug("parenthesised term"); + int wrapStart = lexer.pos() - 1; match('('); CQLNode expr = parseQuery(index, relation); + int wrapStop = lexer.pos(); match(')'); + expr.setStartStop(wrapStart, wrapStop); return expr; } else if (lexer.what() == '>') { return parsePrefix(index, relation, false); } debug("non-parenthesised term"); + termStop = lexer.pos() - lexer.value().length(); first = matchSymbol("index or term"); all = new StringBuilder(first); //match relation only on second postion @@ -238,6 +257,7 @@ private CQLNode parseTerm(String index, CQLRelation relation) all.append(" ").append(lexer.value()); match(lexer.what()); } + termStop += all.length(); if (!isRelation()) break; //we're done if no relation @@ -252,13 +272,20 @@ private CQLNode parseTerm(String index, CQLRelation relation) } index = first; relation = new CQLRelation(relstr); + int start = lexer.pos() - ((isSymbolicRelation()) ? ((lexer.what() == CQLTokenizer.TT_LE || lexer.what() == CQLTokenizer.TT_GE || lexer.what() == CQLTokenizer.TT_NE || lexer.what() == CQLTokenizer.TT_EQEQ) ? 
2 : 1) : lexer.value().length()); + int stop = lexer.pos(); match(lexer.what()); ModifierSet ms = gatherModifiers(relstr); relation.ms = ms; + if (ms.getModifiers().size() > 0) { + stop = ms.getModifiers().get(ms.getModifiers().size() - 1).getStop(); + } + relation.setStartStop(start, stop); debug("index='" + index + ", " + "relation='" + relation.toCQL() + "'"); } CQLTermNode node = new CQLTermNode(index, relation, all.toString()); + node.setStartStop(termStart, termStop); debug("made term node " + node.toCQL()); return node; } @@ -268,19 +295,26 @@ private CQLNode parsePrefix(String index, CQLRelation relation, throws CQLParseException, IOException { debug("prefix mapping"); + int start = lexer.pos() - 1; + int stop = -1; match('>'); + String name = null; + stop = lexer.pos(); String identifier = matchSymbol("prefix-name"); if (lexer.what() == '=') { match('='); name = identifier; + stop = lexer.pos(); identifier = matchSymbol("prefix-identifer"); } CQLNode node = topLevel ? parseTopLevelPrefixes(index, relation) : parseQuery(index, relation); - return new CQLPrefixNode(name, identifier, node); + CQLPrefixNode prefixNode = new CQLPrefixNode(name, identifier, node); + prefixNode.setStartStop(start, stop); + return prefixNode; } private boolean isWordOrString() { diff --git a/src/main/java/org/z3950/zing/cql/Modifier.java b/src/main/java/org/z3950/zing/cql/Modifier.java index af68453..330d2d0 100644 --- a/src/main/java/org/z3950/zing/cql/Modifier.java +++ b/src/main/java/org/z3950/zing/cql/Modifier.java @@ -14,6 +14,8 @@ public class Modifier { String comparison; String value; + private int start, stop; + /** * Creates a new Modifier with the specified type, comparison * and value. 
@@ -55,6 +57,19 @@ public String getValue() { return value; } + public int getStart() { + return start; + } + + public int getStop() { + return stop; + } + + protected void setStartStop(int start, int stop) { + this.start = start; + this.stop = stop; + } + void toXCQLInternal(XCQLBuilder b, int level, String relationElement) { b.indent(level).append("\n"); b.indent(level + 1).append(""); diff --git a/src/main/java/org/z3950/zing/cql/ModifierSet.java b/src/main/java/org/z3950/zing/cql/ModifierSet.java index 754af1e..2ef87c8 100644 --- a/src/main/java/org/z3950/zing/cql/ModifierSet.java +++ b/src/main/java/org/z3950/zing/cql/ModifierSet.java @@ -40,18 +40,20 @@ public String getBase() { * Adds a modifier of the specified type, * comparison and value to a ModifierSet. */ - public void addModifier(String type, String comparison, String value) { + public Modifier addModifier(String type, String comparison, String value) { Modifier modifier = new Modifier(type, comparison, value); modifiers.add(modifier); + return modifier; } /** * Adds a modifier of the specified type, but with no * comparison and value, to a ModifierSet. 
*/ - public void addModifier(String type) { + public Modifier addModifier(String type) { Modifier modifier = new Modifier(type); modifiers.add(modifier); + return modifier; } /** From 340ae90974ee692422a7c309a1ee0ae831b8f3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20K=C3=B6rner?= Date: Wed, 2 Jul 2025 17:03:07 +0200 Subject: [PATCH 2/4] Add "test" class to interactively test and visualize query trees with start/stop offsets --- .../z3950/zing/cql/CQLNodePositionsTest.java | 311 ++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java diff --git a/src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java b/src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java new file mode 100644 index 0000000..8b71fde --- /dev/null +++ b/src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java @@ -0,0 +1,311 @@ +package org.z3950.zing.cql; + +import java.io.IOException; + +public class CQLNodePositionsTest { + + // Set to true to use the query to infer positions of various attributes of the + // CQLNode classes. Those start/stop offsets are not stored and need to be + // computed (sometimes requiring the original query string to get exact + // positions). 
+ public static final boolean INFER_OTHER_POSITIONS = false; + + public static void main(String[] args) throws CQLParseException, IOException { + + CQLParser parser = new CQLParser(); + + String[] cqls = new String[] { + // "cat", + // " cat ", + // "\"cat\"", + // " \"cat\" ", + // "\"cat\" or apple", + "field any \"cat\"", + // "field = \"cat\"", + // "field <> cat", + + // "a lot of words", + + // --- + + // test terms + // " field = \"val\" ", + // " field = \"\\\"\" ", + // " field = \"\\\"\\\"\" ", + // " field = \"\\\"\\\"\\\"\" ", + // " field = \"\\\"\\\"\\\"\\\"\" ", + // " field = \"\\\" \\\"\" ", + + // test braces + // "a = b and (field = \"value\")", + // "a = b and ( field = \"value\" ) ", + + // this will fail with the parser + // "(a) or (b)", + // "(field = val) and (field = val)", + // "(a)", + // "(a = aa) or b = bb", + // "(a = aa and d = ef) or b = bb", + + // test spaces + // "dc.title any fish prox / unit=word dc.title any squirrel", + // "dc.title any fish sortBy dc.date / sort.ascending", + + // test CQLTermNode parsing + // "dc.TitlE any fish", + // "dc.TitlE ANY fish", + // "dc.TitlE Any fish", + // "dc.TitlE aNy fish", + // "dc.TitlE other fish", + // "dc.TitlE Other fish", + + // --- + + // "dc.title any fish", + // "dc.title any fish or dc.creator any sanderson", + "dc.title any fish sortBy dc.date/sort.ascending", + // "> dc = \"info:srw/context-sets/1/dc-v1.1\" dc.title any fish", + + // "fish", + // "cql.serverChoice = fish", // this will not really work due to using defaults + + // "\"fish\"", + // "fish", + // "\"squirrels fish\"", + // "\"\"", + + // "title any fish", + // "dc.title any fish", + + // "dc.title any fish", + // "dc.title cql.any fish", + + // "dc.title any/relevant fish", + // "dc.title any/ relevant /cql.string fish", + // "dc.title any/rel.algorithm=cori fish", + + // "dc.title any fish or (dc.creator any sanderson and dc.identifier = + // \"id:1234567\")", + + // "dc.title any fish or/rel.combine=sum 
dc.creator any sanderson", + // "dc.title any fish prox/unit=word/distance>3 dc.title any squirrel", + + // "\"cat\" sortBy dc.title", + // "\"dinosaur\" sortBy dc.date/sort.descending dc.title/sort.ascending", + + // "> dc = \"http://deepcustard.org/\" dc.custardDepth > 10", + // "> \"http://deepcustard.org/\" custardDepth > 10", + + // "dC.tiTlE any fish", + // NOTE: fails to parse + // "dc.TitlE Any/rEl.algOriThm=cori fish soRtbY Dc.TitlE", + + }; + + for (String cql : cqls) { + CQLNode node = null; + try { + node = parser.parse(cql); + } catch (NullPointerException e) { + System.err.println("Error parsing query '" + cql + "': " + e.getMessage()); + System.out.println(); + continue; + } + + // System.out.println("CQL: " + cql); + // System.out.println("Node: " + node.toCQL()); + // System.out.println("XCQL: " + node.toXCQL()); + + // System.out.println(); + dumpTreeSubstring(node, 0, cql); + System.out.println(); + // System.out.println("-".repeat(40)); + // System.out.println(); + } + } + + // --- + + public static void printStartStopSubstring(int start, int stop, String cql) { + System.out.print("|"); + if (start != -1 && stop != -1) { + System.out.print(".".repeat(start)); + System.out.print(cql.substring(start, stop)); + System.out.print(".".repeat(cql.length() - stop)); + } else { + System.out.print("~".repeat(cql.length())); + } + System.out.print("|"); + } + + public static void printStartStopSubstringCustom(String label, int level, int start, int stop, String cql) { + printStartStopSubstring(start, stop, cql); + System.out.print(" "); + System.out.print(" ".repeat(level)); + System.out.print(label); + System.out.println(); + } + + public static void printStartStopSubstringCustomWithSpaces(String label, int level, int start, int stop, + String cql) { + + // try to strip/trim whitespaces + if (start != -1 && stop != -1) { + String content = cql.substring(start, stop); + start = start + (content.length() - content.stripLeading().length()); + stop = stop - 
(content.length() - content.stripTrailing().length()); + } + printStartStopSubstringCustom(label, level, start, stop, cql); + } + + public static void dumpTreeSubstring(Modifier node, int level, String cql) { + printStartStopSubstring(node.getStart(), node.getStop(), cql); + System.out.print(" "); + System.out.print(" ".repeat(level)); + System.out.print(node.getClass().getSimpleName()); + System.out.print(" → "); + System.out.print(node.toCQL()); + System.out.println(); + + if (INFER_OTHER_POSITIONS) { + int typeStart = cql.toLowerCase().indexOf(node.getType(), node.getStart()); + int typeStop = typeStart + node.getType().length(); + printStartStopSubstringCustomWithSpaces("type", level + 1, typeStart, typeStop, cql); + + if (node.getComparison() != null) { + int compStart = cql.indexOf(node.getComparison(), typeStop); + int compStop = compStart + node.getComparison().length(); + printStartStopSubstringCustomWithSpaces("comparison", level + 1, compStart, compStop, cql); + + int valueStop = node.getStop(); + int valueStart = valueStop - node.getValue().length(); + printStartStopSubstringCustomWithSpaces("value", level + 1, valueStart, valueStop, cql); + } + } + } + + public static void dumpTreeSubstring(CQLNode node, int level, String cql) { + if (level == 0) { + printStartStopSubstring(0, cql.length(), cql); + System.out.print(" "); + System.out.print(""); + System.out.println(); + + dumpTreeSubstring(node, level + 1, cql); + return; + } + + printStartStopSubstring(node.getStart(), node.getStop(), cql); + System.out.print(" "); + System.out.print(" ".repeat(level)); + System.out.print(node.getClass().getSimpleName()); + System.out.print(" → "); + System.out.print(node.toCQL()); + System.out.println(); + + if (node instanceof CQLTermNode) { + CQLTermNode node2 = (CQLTermNode) node; + + if (INFER_OTHER_POSITIONS) { + String index = node2.getIndex(); + boolean hasCustomIndex = (index != null && !index.equalsIgnoreCase("srw.serverChoice") + && 
!index.equalsIgnoreCase("cql.serverChoice")); + if (hasCustomIndex) { + int indexStart = cql.indexOf(index, node.getStart()); + int indexStop = indexStart + index.length(); + printStartStopSubstringCustom("index", level + 1, indexStart, indexStop, cql); + + dumpTreeSubstring(node2.getRelation(), level + 1, cql); + + String term = node2.getTerm(); + int termStop = node.getStop(); + int termStart = termStop - term.length(); + // check for quotes + if (term.indexOf('"') != -1) { + term = term.replace("\"", "\\\""); + } + int pos = cql.lastIndexOf(term, termStop); + if (pos != -1 && pos < termStart) { + termStart = pos - 1; + termStop = termStart + term.length() + 2; + } + printStartStopSubstringCustom("term", level + 1, termStart, termStop, cql); + } + } else { + dumpTreeSubstring(node2.getRelation(), level + 1, cql); + } + + } else if (node instanceof CQLRelation) { + CQLRelation node2 = (CQLRelation) node; + + if (node2.getModifiers().size() > 0) { + for (Modifier modifier : node2.getModifiers()) { + dumpTreeSubstring(modifier, level + 1, cql); + } + } + + } else if (node instanceof CQLBooleanNode) { + CQLBooleanNode node2 = (CQLBooleanNode) node; + + dumpTreeSubstring(node2.getLeftOperand(), level + 1, cql); + + if (INFER_OTHER_POSITIONS) { + int opStart = node2.getLeftOperand().getStop(); + int opStop = node2.getRightOperand().getStart(); + + printStartStopSubstringCustomWithSpaces("operator", level + 1, opStart, opStop, cql); + + if (node2.getModifiers().size() > 0) { + opStop = node2.getModifiers().get(0).getStart(); + printStartStopSubstringCustomWithSpaces("operator", level + 2, opStart, opStop, cql); + + for (Modifier modifier : node2.getModifiers()) { + dumpTreeSubstring(modifier, level + 2, cql); + } + } + } + + dumpTreeSubstring(node2.getRightOperand(), level + 1, cql); + } else if (node instanceof CQLSortNode) { + CQLSortNode node2 = (CQLSortNode) node; + + dumpTreeSubstring(node2.getSubtree(), level + 1, cql); + + int start = node2.getStart() + 6; + for 
(ModifierSet ms : node2.getSortIndexes()) { + + if (INFER_OTHER_POSITIONS) { + int baseStart = cql.indexOf(ms.getBase(), start); + int baseStop = baseStart + ms.getBase().length(); + printStartStopSubstringCustom("base", level + 1, baseStart, baseStop, cql); + } + + if (ms.getModifiers().size() > 0) { + for (Modifier modifier : ms.getModifiers()) { + dumpTreeSubstring(modifier, level + 2, cql); + } + } + } + + } else if (node instanceof CQLPrefixNode) { + CQLPrefixNode node2 = (CQLPrefixNode) node; + + if (INFER_OTHER_POSITIONS) { + int skip = node2.getStart(); + if (node2.getPrefix().getName() != null) { + int nameStart = cql.indexOf(node2.getPrefix().getName(), node2.getStart()); + int nameStop = nameStart + node2.getPrefix().getName().length(); + skip = nameStop + 1; + printStartStopSubstringCustom("name", level + 1, nameStart, nameStop, cql); + } + + int identStart = cql.indexOf(node2.getPrefix().getIdentifier(), skip); + int identStop = identStart + node2.getPrefix().getIdentifier().length(); + printStartStopSubstringCustom("identifier", level + 1, identStart, identStop, cql); + } + + dumpTreeSubstring(node2.getSubtree(), level + 1, cql); + } + } + +} From 9938a3004fcd0288fe50ea40b61416f260faef1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20K=C3=B6rner?= Date: Wed, 2 Jul 2025 17:29:18 +0200 Subject: [PATCH 3/4] Fix NPE bugs --- src/main/java/org/z3950/zing/cql/CQLParser.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/z3950/zing/cql/CQLParser.java b/src/main/java/org/z3950/zing/cql/CQLParser.java index fc9aebc..c3200fc 100644 --- a/src/main/java/org/z3950/zing/cql/CQLParser.java +++ b/src/main/java/org/z3950/zing/cql/CQLParser.java @@ -230,7 +230,7 @@ private CQLNode parseTerm(String index, CQLRelation relation) throws CQLParseException, IOException { debug("in parseTerm()"); - int termStart = lexer.pos() - lexer.value().length()- ((lexer.what() == CQLTokenizer.TT_STRING) ? 
2 : 0); + int termStart = lexer.pos() - ((lexer.value() != null) ? lexer.value().length() : 0) - ((lexer.what() == CQLTokenizer.TT_STRING) ? 2 : 0); int termStop = lexer.pos(); String first; StringBuilder all; @@ -249,7 +249,7 @@ private CQLNode parseTerm(String index, CQLRelation relation) } debug("non-parenthesised term"); - termStop = lexer.pos() - lexer.value().length(); + termStop = lexer.pos() - ((lexer.value() != null) ? lexer.value().length() : 0); first = matchSymbol("index or term"); all = new StringBuilder(first); //match relation only on second postion From 13d80eafb7316251523b1c673be60a1d23233953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Erik=20K=C3=B6rner?= Date: Wed, 2 Jul 2025 17:56:44 +0200 Subject: [PATCH 4/4] Fix term quoted/brace at end issue --- .../z3950/zing/cql/CQLNodePositionsTest.java | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java b/src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java index 8b71fde..be3eace 100644 --- a/src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java +++ b/src/test/java/org/z3950/zing/cql/CQLNodePositionsTest.java @@ -8,9 +8,9 @@ public class CQLNodePositionsTest { // CQLNode classes. Those start/stop offsets are not stored and need to be // computed (sometimes requiring the original query string to get exact // positions). 
- public static final boolean INFER_OTHER_POSITIONS = false; + public static final boolean INFER_OTHER_POSITIONS = true; - public static void main(String[] args) throws CQLParseException, IOException { + public static void main(String[] args) throws IOException { CQLParser parser = new CQLParser(); @@ -106,7 +106,7 @@ public static void main(String[] args) throws CQLParseException, IOException { CQLNode node = null; try { node = parser.parse(cql); - } catch (NullPointerException e) { + } catch (CQLParseException | NullPointerException e) { System.err.println("Error parsing query '" + cql + "': " + e.getMessage()); System.out.println(); continue; @@ -218,16 +218,18 @@ public static void dumpTreeSubstring(CQLNode node, int level, String cql) { dumpTreeSubstring(node2.getRelation(), level + 1, cql); String term = node2.getTerm(); + String termQuoted = CQLTermNode.maybeQuote(term); int termStop = node.getStop(); - int termStart = termStop - term.length(); - // check for quotes - if (term.indexOf('"') != -1) { - term = term.replace("\"", "\\\""); - } - int pos = cql.lastIndexOf(term, termStop); + int termStart = termStop - termQuoted.length(); + int pos = cql.lastIndexOf(termQuoted, termStop); if (pos != -1 && pos < termStart) { - termStart = pos - 1; - termStop = termStart + term.length() + 2; + if (pos > 0 && cql.charAt(pos - 1) == '"') { + termStart = pos - 1; + termStop = termStart + termQuoted.length() + 2; + } else { + termStart = pos; + termStop = termStart + termQuoted.length(); + } } printStartStopSubstringCustom("term", level + 1, termStart, termStop, cql); }