Skip to content

Commit dcccf72

Browse files
authored
Merge pull request #2403 from sparklemotion/2376-html5-namespaces-in-css-queries
HTML5 documents should not require namespaces in CSS selector queries
2 parents de64268 + d1a710e commit dcccf72

33 files changed

+1005
-389
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ This release ends support for:
3535
### Improved
3636

3737
* `{XML,HTML4}::DocumentFragment` constructors all now take an optional parse options parameter or block (similar to Document constructors). [[#1692](https://github.com/sparklemotion/nokogiri/issues/1692)] (Thanks, [@JackMc](https://github.com/JackMc)!)
38+
* `Nokogiri::CSS.xpath_for` allows an `XPathVisitor` to be injected, for finer-grained control over how CSS queries are translated into XPath.
3839
* [CRuby] `XML::Reader#encoding` will return the encoding detected by the parser when it's not passed to the constructor. [[#980](https://github.com/sparklemotion/nokogiri/issues/980)]
3940
* [CRuby] Handle abruptly-closed HTML comments as recommended by WHATWG. (Thanks to [tehryanx](https://hackerone.com/tehryanx?type=user) for reporting!)
4041
* [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES`, which is turned on by default in `ParseOptions::DEFAULT_{XML,XSLT,HTML,SCHEMA}`. (Note that JRuby already supported large line numbers.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)]
@@ -45,7 +46,9 @@ This release ends support for:
4546

4647
### Fixed
4748

48-
* XML::Builder blocks restore context properly when exceptions are raised. [[#2372](https://github.com/sparklemotion/nokogiri/issues/2372)] (Thanks, [@ric2b](https://github.com/ric2b) and [@rinthedev](https://github.com/rinthedev)!)
49+
* CSS queries on HTML5 documents now correctly match foreign elements (SVG, MathML) when namespaces are not specified in the query. [[#2376](https://github.com/sparklemotion/nokogiri/issues/2376)]
50+
* `XML::Builder` blocks restore context properly when exceptions are raised. [[#2372](https://github.com/sparklemotion/nokogiri/issues/2372)] (Thanks, [@ric2b](https://github.com/ric2b) and [@rinthedev](https://github.com/rinthedev)!)
51+
* The `Nokogiri::CSS::Parser` cache now uses the `XPathVisitor` configuration as part of the cache key, preventing incorrect cache results from being returned when multiple `XPathVisitor` options are being used.
4952
* Error recovery from in-context parsing (e.g., `Node#parse`) now always uses the correct `DocumentFragment` class. Previously `Nokogiri::HTML4::DocumentFragment` was always used, even for XML documents. [[#1158](https://github.com/sparklemotion/nokogiri/issues/1158)]
5053
* `DocumentFragment#>` now works properly, matching a CSS selector against only the fragment roots. [[#1857](https://github.com/sparklemotion/nokogiri/issues/1857)]
5154
* `XML::DocumentFragment#errors` now correctly contains any parsing errors encountered. Previously this was always empty. (Note that `HTML::DocumentFragment#errors` already did this.)
@@ -61,6 +64,8 @@ This release ends support for:
6164
### Deprecated
6265

6366
* Passing a `Nokogiri::XML::Node` as the second parameter to `Node.new` is deprecated and will generate a warning. This will become an error in a future version of Nokogiri. [[#975](https://github.com/sparklemotion/nokogiri/issues/975)]
67+
* `Nokogiri::CSS::Parser`, `Nokogiri::CSS::Tokenizer`, and `Nokogiri::CSS::Node` are now internal-only APIs that are no longer documented, and should not be considered stable. With the introduction of `XPathVisitor` injection into `Nokogiri::CSS.xpath_for` there should be no reason to rely on these internal APIs.
68+
* CSS-to-XPath utility classes `Nokogiri::CSS::XPathVisitorAlwaysUseBuiltins` and `XPathVisitorOptimallyUseBuiltins` are deprecated. Prefer `Nokogiri::CSS::XPathVisitor` with appropriate constructor arguments. These classes will be removed in a future version of Nokogiri.
6469

6570

6671
## 1.12.5 / 2021-09-27

ext/nokogiri/xml_dtd.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ entities(VALUE self)
5757

5858
/*
5959
* call-seq:
60-
* notations
60+
* notations() → Hash<name(String)⇒Notation>
6161
*
62-
* Get a hash of the notations for this DTD.
62+
* [Returns] All the notations for this DTD in a Hash of Notation +name+ to Notation.
6363
*/
6464
static VALUE
6565
notations(VALUE self)

ext/nokogiri/xml_xpath_context.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,26 @@ xpath_builtin_css_class(xmlXPathParserContextPtr ctxt, int nargs)
8686
xmlXPathFreeObject(needle);
8787
}
8888

89+
90+
/* xmlXPathFunction to select nodes whose local name matches, for HTML5 CSS queries that should ignore namespaces */
91+
static void
92+
xpath_builtin_local_name_is(xmlXPathParserContextPtr ctxt, int nargs)
93+
{
94+
xmlXPathObjectPtr element_name;
95+
96+
assert(ctxt->context->node);
97+
98+
CHECK_ARITY(1);
99+
CAST_TO_STRING;
100+
CHECK_TYPE(XPATH_STRING);
101+
element_name = valuePop(ctxt);
102+
103+
valuePush(ctxt, xmlXPathNewBoolean(xmlStrEqual(ctxt->context->node->name, element_name->stringval)));
104+
105+
xmlXPathFreeObject(element_name);
106+
}
107+
108+
89109
/*
90110
* call-seq:
91111
* register_ns(prefix, uri)
@@ -361,6 +381,8 @@ new (VALUE klass, VALUE nodeobj)
361381
xmlXPathRegisterNs(ctx, NOKOGIRI_BUILTIN_PREFIX, NOKOGIRI_BUILTIN_URI);
362382
xmlXPathRegisterFuncNS(ctx, (const xmlChar *)"css-class", NOKOGIRI_BUILTIN_URI,
363383
xpath_builtin_css_class);
384+
xmlXPathRegisterFuncNS(ctx, (const xmlChar *)"local-name-is", NOKOGIRI_BUILTIN_URI,
385+
xpath_builtin_local_name_is);
364386

365387
self = Data_Wrap_Struct(klass, 0, deallocate, ctx);
366388
return self;

lib/nokogiri/css.rb

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,49 @@
1+
# coding: utf-8
12
# frozen_string_literal: true
23

34
module Nokogiri
5+
# Translate a CSS selector into an XPath 1.0 query
46
module CSS
57
class << self
6-
###
7-
# Parse this CSS selector in +selector+. Returns an AST.
8-
def parse(selector)
8+
# TODO: Deprecate this method ahead of 2.0 and delete it in 2.0.
9+
# It is not used by Nokogiri and shouldn't be part of the public API.
10+
def parse(selector) # :nodoc:
911
Parser.new.parse(selector)
1012
end
1113

12-
###
13-
# Get the XPath for +selector+.
14+
# :call-seq:
15+
# xpath_for(selector) → String
16+
# xpath_for(selector [, prefix:] [, visitor:] [, ns:]) → String
17+
#
18+
# Translate a CSS selector to the equivalent XPath query.
19+
#
20+
# [Parameters]
21+
# - +selector+ (String) The CSS selector to be translated into XPath
22+
#
23+
# - +prefix:+ (String)
24+
#
25+
# The XPath prefix for the query, see Nokogiri::XML::XPath for some options. Default is
26+
# +XML::XPath::GLOBAL_SEARCH_PREFIX+.
27+
#
28+
# - +visitor:+ (Nokogiri::CSS::XPathVisitor)
29+
#
30+
# The visitor instance used to transform the AST into XPath. Default is
31+
# +Nokogiri::CSS::XPathVisitor.new+.
32+
#
33+
# - +ns:+ (Hash<String ⇒ String>)
34+
#
35+
# The namespaces that are referenced in the query, if any. This is a hash where the keys are
36+
# the namespace prefix and the values are the namespace URIs. Default is an empty Hash.
37+
#
38+
# [Returns] (String) The equivalent XPath query for +selector+
39+
#
40+
# 💡 Note that translated queries are cached for performance reasons.
41+
#
1442
def xpath_for(selector, options = {})
15-
Parser.new(options[:ns] || {}).xpath_for(selector, options)
43+
prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX)
44+
visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new }
45+
ns = options.fetch(:ns, {})
46+
Parser.new(ns).xpath_for(selector, prefix, visitor)
1647
end
1748
end
1849
end

lib/nokogiri/css/node.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
module Nokogiri
44
module CSS
5-
class Node
5+
class Node # :nodoc:
66
ALLOW_COMBINATOR_ON_SELF = [:DIRECT_ADJACENT_SELECTOR, :FOLLOWING_SELECTOR, :CHILD_SELECTOR]
77

88
# Get the type of this node
@@ -23,7 +23,7 @@ def accept(visitor)
2323

2424
###
2525
# Convert this CSS node to xpath with +prefix+ using +visitor+
26-
def to_xpath(prefix = "//", visitor = XPathVisitor.new)
26+
def to_xpath(prefix, visitor)
2727
prefix = "." if ALLOW_COMBINATOR_ON_SELF.include?(type) && value.first.nil?
2828
prefix + visitor.accept(self)
2929
end

lib/nokogiri/css/parser.rb

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# frozen_string_literal: true
22
#
33
# DO NOT MODIFY!!!!
4-
# This file is automatically generated by Racc 1.5.2
4+
# This file is automatically generated by Racc 1.6.0
55
# from Racc grammar file "".
66
#
77

@@ -10,6 +10,14 @@
1010

1111
require_relative "parser_extras"
1212

13+
module Nokogiri
14+
module CSS
15+
# :nodoc: all
16+
class Parser < Racc::Parser
17+
end
18+
end
19+
end
20+
1321
module Nokogiri
1422
module CSS
1523
class Parser < Racc::Parser
@@ -247,7 +255,7 @@ def unescape_css_string(str)
247255
"." => 27,
248256
"*" => 28,
249257
"|" => 29,
250-
":" => 30, }
258+
":" => 30 }
251259

252260
racc_nt_base = 31
253261

@@ -485,7 +493,7 @@ def _reduce_27(val, _values, result)
485493
end
486494

487495
def _reduce_28(val, _values, result)
488-
result = Node.new(:ELEMENT_NAME,
496+
result = Node.new(:ATTRIB_NAME,
489497
[[val.first, val.last].compact.join(':')]
490498
)
491499

@@ -495,7 +503,7 @@ def _reduce_28(val, _values, result)
495503
def _reduce_29(val, _values, result)
496504
# Default namespace is not applied to attributes.
497505
# So we don't add prefix "xmlns:" as in namespaced_ident.
498-
result = Node.new(:ELEMENT_NAME, [val.first])
506+
result = Node.new(:ATTRIB_NAME, [val.first])
499507

500508
result
501509
end

lib/nokogiri/css/parser.y

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,14 +96,14 @@ rule
9696
;
9797
attrib_name
9898
: namespace '|' IDENT {
99-
result = Node.new(:ELEMENT_NAME,
99+
result = Node.new(:ATTRIB_NAME,
100100
[[val.first, val.last].compact.join(':')]
101101
)
102102
}
103103
| IDENT {
104104
# Default namespace is not applied to attributes.
105105
# So we don't add prefix "xmlns:" as in namespaced_ident.
106-
result = Node.new(:ELEMENT_NAME, [val.first])
106+
result = Node.new(:ATTRIB_NAME, [val.first])
107107
}
108108
;
109109
function
@@ -255,6 +255,14 @@ end
255255

256256
require_relative "parser_extras"
257257

258+
module Nokogiri
259+
module CSS
260+
# :nodoc: all
261+
class Parser < Racc::Parser
262+
end
263+
end
264+
end
265+
258266
---- inner
259267

260268
def unescape_css_identifier(identifier)

lib/nokogiri/css/parser_extras.rb

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
module Nokogiri
66
module CSS
7-
class Parser < Racc::Parser
7+
class Parser < Racc::Parser # :nodoc:
88
CACHE_SWITCH_NAME = :nokogiri_css_parser_cache_is_off
99

1010
@cache = {}
@@ -23,7 +23,7 @@ def set_cache(value) # rubocop:disable Naming/AccessorMethodName
2323

2424
# Get the css selector in +string+ from the cache
2525
def [](string)
26-
return unless cache_on?
26+
return nil unless cache_on?
2727
@mutex.synchronize { @cache[string] }
2828
end
2929

@@ -71,17 +71,10 @@ def next_token
7171
end
7272

7373
# Get the xpath for +string+ using +prefix+ and +visitor+
74-
def xpath_for(string, options = {})
75-
key = "#{string}#{options[:ns]}#{options[:prefix]}"
76-
v = self.class[key]
77-
return v if v
78-
79-
args = [
80-
options[:prefix] || "//",
81-
options[:visitor] || XPathVisitor.new,
82-
]
83-
self.class[key] = parse(string).map do |ast|
84-
ast.to_xpath(*args)
74+
def xpath_for(string, prefix, visitor)
75+
key = cache_key(string, prefix, visitor)
76+
self.class[key] ||= parse(string).map do |ast|
77+
ast.to_xpath(prefix, visitor)
8578
end
8679
end
8780

@@ -90,6 +83,12 @@ def on_error(error_token_id, error_value, value_stack)
9083
after = value_stack.compact.last
9184
raise SyntaxError, "unexpected '#{error_value}' after '#{after}'"
9285
end
86+
87+
def cache_key(query, prefix, visitor)
88+
if self.class.cache_on?
89+
[query, prefix, @namespaces, visitor.config]
90+
end
91+
end
9392
end
9493
end
9594
end

lib/nokogiri/css/tokenizer.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77

88
module Nokogiri
99
module CSS
10-
class Tokenizer # :nodoc:
10+
# :nodoc: all
11+
class Tokenizer
1112
require 'strscan'
1213

1314
class ScanError < StandardError ; end

lib/nokogiri/css/tokenizer.rex

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
module Nokogiri
22
module CSS
3-
class Tokenizer # :nodoc:
3+
# :nodoc: all
4+
class Tokenizer
45

56
macro
67
nl \n|\r\n|\r|\f

0 commit comments

Comments
 (0)