Skip to content

Commit d1a710e

Browse files
committed
feat: support wildcard namespaces in xpath queries
This is almost as fast as a standard child-axis search, and much faster than the builtin or using local-name():

//span
    18.923k (± 8.9%) i/s - 93.906k in 5.010792s
//*[local-name()='span']
    1.849k (± 2.8%) i/s - 9.261k in 5.011560s
//*[nokogiri-builtin:local-name-is('span')]
    3.191k (± 2.4%) i/s - 16.150k in 5.064798s
//*:span
    18.016k (± 4.6%) i/s - 89.900k in 5.003444s

Comparison:
//span: 18922.5 i/s
//*:span: 18016.5 i/s - same-ish: difference falls within error
//*[nokogiri-builtin:local-name-is('span')]: 3190.6 i/s - 5.93x (± 0.00) slower
//*[local-name()='span']: 1849.4 i/s - 10.23x (± 0.00) slower
1 parent 0fd4de4 commit d1a710e

File tree

4 files changed

+144
-3
lines changed

4 files changed

+144
-3
lines changed

lib/nokogiri/css/xpath_visitor.rb

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ module CSS
77
# class allows for changing some of the behaviors related to builtin xpath functions and quirks
88
# of HTML5.
99
class XPathVisitor
10+
WILDCARD_NAMESPACES = Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch") # :nodoc:
11+
1012
# Enum to direct XPathVisitor when to use Nokogiri builtin XPath functions.
1113
module BuiltinsConfig
1214
# Never use Nokogiri builtin functions, always generate vanilla XPath 1.0 queries. This is
@@ -254,7 +256,11 @@ def visit_element_name(node)
254256
# HTML5 has namespaces that should be ignored in CSS queries
255257
# https://github.com/sparklemotion/nokogiri/issues/2376
256258
if @builtins == BuiltinsConfig::ALWAYS || (@builtins == BuiltinsConfig::OPTIMAL && Nokogiri.uses_libxml?)
257-
"*[nokogiri-builtin:local-name-is('#{node.value.first}')]"
259+
if WILDCARD_NAMESPACES
260+
"*:#{node.value.first}"
261+
else
262+
"*[nokogiri-builtin:local-name-is('#{node.value.first}')]"
263+
end
258264
else
259265
"*[local-name()='#{node.value.first}']"
260266
end
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
From 74c95ec5932c737d4fcb06b8646b0017364ada14 Mon Sep 17 00:00:00 2001
2+
From: Mike Dalessio <mike.dalessio@gmail.com>
3+
Date: Fri, 24 Dec 2021 19:08:01 -0500
4+
Subject: [PATCH] attempt to hack in wildcard namespaces to xpath
5+
6+
I'm not confident this is a bulletproof patch.
7+
---
8+
xpath.c | 24 ++++++++++++++++++------
9+
1 file changed, 18 insertions(+), 6 deletions(-)
10+
11+
diff --git a/xpath.c b/xpath.c
12+
index 1aa2f1a..c7f0885 100644
13+
--- a/xpath.c
14+
+++ b/xpath.c
15+
@@ -146,6 +146,9 @@
16+
#define XPATH_MAX_RECURSION_DEPTH 5000
17+
#endif
18+
19+
+#define WILDCARD_PREFIX "*"
20+
+#define IS_WILDCARD_PREFIX(p) xmlStrEqual((xmlChar*)WILDCARD_PREFIX, p)
21+
+
22+
/*
23+
* TODO:
24+
* There are a few spots where some tests are done which depend upon ascii
25+
@@ -11073,12 +11076,15 @@ xmlXPathCompNodeTest(xmlXPathParserContextPtr ctxt, xmlXPathTestVal *test,
26+
SKIP_BLANKS;
27+
28+
if ((name == NULL) && (CUR == '*')) {
29+
- /*
30+
- * All elements
31+
- */
32+
NEXT;
33+
- *test = NODE_TEST_ALL;
34+
- return(NULL);
35+
+ if (CUR != ':') {
36+
+ /*
37+
+ * All elements
38+
+ */
39+
+ *test = NODE_TEST_ALL;
40+
+ return(NULL);
41+
+ }
42+
+ name = xmlCharStrdup(WILDCARD_PREFIX);
43+
}
44+
45+
if (name == NULL)
46+
@@ -11327,6 +11333,10 @@ xmlXPathCompStep(xmlXPathParserContextPtr ctxt) {
47+
}
48+
#endif
49+
if (CUR == '*') {
50+
+ if (NXT(1) == ':') {
51+
+ NEXT;
52+
+ name = xmlCharStrdup(WILDCARD_PREFIX);
53+
+ }
54+
axis = AXIS_CHILD;
55+
} else {
56+
if (name == NULL)
57+
@@ -12030,7 +12040,7 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt,
58+
/*
59+
* Setup namespaces.
60+
*/
61+
- if (prefix != NULL) {
62+
+ if (prefix != NULL && !IS_WILDCARD_PREFIX(prefix)) {
63+
URI = xmlXPathNsLookup(xpctxt, prefix);
64+
if (URI == NULL) {
65+
xmlXPathReleaseObject(xpctxt, obj);
66+
@@ -12369,6 +12379,8 @@ xmlXPathNodeCollectAndTest(xmlXPathParserContextPtr ctxt,
67+
{
68+
XP_TEST_HIT
69+
}
70+
+ } else if (IS_WILDCARD_PREFIX(prefix)) {
71+
+ XP_TEST_HIT
72+
} else {
73+
if ((cur->ns != NULL) &&
74+
(xmlStrEqual(URI, cur->ns->href)))
75+
--
76+
2.31.0
77+

test/css/test_xpath_visitor.rb

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,34 @@ def visit_pseudo_class_aaron(node)
565565
end
566566
end
567567

568+
#
569+
# HTML5 documents have namespaces, and gumbo attaches namespaces to the relevant elements; but
570+
# CSS selectors should not require namespaces. See #2376 for the discussion around this design
571+
# decision, along with some of the relevant benchmarks and call stack analyses.
572+
#
573+
# (HTML5 today is only supported by CRuby/gumbo/libxml2 and so we'll ignore JRuby support for
574+
# now.)
575+
#
576+
# The way to implement this CSS search using standard XPath 1.0 queries is to check for a match
577+
# with `local-name()`. However, this is ~10x slower than a standard search along the
578+
# `child` axis.
579+
#
580+
# I've written a builtin function in C named `nokogiri-builtin:local-name-is()` which is a bit
581+
# faster, but still ~7x slower than a standard search.
582+
#
583+
# Finally, I've patched libxml2 to support wildcard namespaces, and this is ~1.1x slower but
584+
# only available with the packaged libxml2.
585+
#
586+
# In any case, the logic for the html5 builtins here goes:
587+
#
588+
# if ALWAYS or (OPTIMAL and libxml2)
589+
# if we've patched libxml2 with wildcard support
590+
use wildcard namespacing
591+
# else
592+
# use `nokogiri-builtin:local-name-is()`
593+
# else
594+
# use `local-name()`
595+
#
568596
describe "doctype:html5" do
569597
let(:visitor) do
570598
Nokogiri::CSS::XPathVisitor.new(
@@ -575,8 +603,13 @@ def visit_pseudo_class_aaron(node)
575603

576604
describe "builtins:always" do
577605
let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::ALWAYS }
606+
578607
it "matches on the element's local-name, ignoring namespaces" do
579-
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
608+
if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch")
609+
assert_xpath("//*:foo", parser.parse("foo"))
610+
else
611+
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
612+
end
580613
end
581614

582615
it "avoids the wildcard when using namespaces" do
@@ -595,7 +628,11 @@ def visit_pseudo_class_aaron(node)
595628
let(:builtins) { Nokogiri::CSS::XPathVisitor::BuiltinsConfig::OPTIMAL }
596629
it "matches on the element's local-name, ignoring namespaces" do
597630
if Nokogiri.uses_libxml?
598-
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
631+
if Nokogiri.libxml2_patches.include?("0009-allow-wildcard-namespaces.patch")
632+
assert_xpath("//*:foo", parser.parse("foo"))
633+
else
634+
assert_xpath("//*[nokogiri-builtin:local-name-is('foo')]", parser.parse("foo"))
635+
end
599636
else
600637
assert_xpath("//*[local-name()='foo']", parser.parse("foo"))
601638
end

test/xml/test_xpath.rb

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -699,6 +699,27 @@ def collision(nodes)
699699
end
700700
end
701701
end
702+
703+
describe "XPath wildcard namespaces" do
704+
let(:xml) { <<~XML }
705+
<root xmlns:ns1="http://nokogiri.org/ns1" xmlns:ns2="http://nokogiri.org/ns2">
706+
<ns1:child>ns1 child</ns1:child>
707+
<ns2:child>ns2 child</ns2:child>
708+
<child>plain child</child>
709+
</root>
710+
XML
711+
712+
let(:doc) { Nokogiri::XML::Document.parse(xml) }
713+
714+
it "allows namespace wildcards" do
715+
skip_unless_libxml2_patch("0009-allow-wildcard-namespaces.patch")
716+
717+
assert_equal(1, doc.xpath("//n:child", { "n" => "http://nokogiri.org/ns1" }).length)
718+
assert_equal(3, doc.xpath("//*:child").length)
719+
assert_equal(1, doc.xpath("//self::n:child", { "n" => "http://nokogiri.org/ns1" }).length)
720+
assert_equal(3, doc.xpath("//self::*:child").length)
721+
end
722+
end
702723
end
703724
end
704725
end

0 commit comments

Comments
 (0)