Skip to content

Commit bb9ed5c

Browse files
committed
defer cosmetic filter key calculation to engine-add time
the info is essentially an engine-internal optimization detail and won't be useful for external users of the library
1 parent 6932f41 commit bb9ed5c

File tree

2 files changed

+114
-166
lines changed

2 files changed

+114
-166
lines changed

src/cosmetic_filter_cache.rs

Lines changed: 114 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use crate::utils::Hash;
1515

1616
use std::collections::{HashMap, HashSet};
1717

18+
use memchr::memchr as find_char;
1819
use serde::{Deserialize, Serialize};
1920

2021
/// Contains cosmetic filter information intended to be used on a particular URL.
@@ -138,29 +139,31 @@ impl CosmeticFilterCache {
138139

139140
/// Add a filter, assuming it has already been determined to be a generic rule
140141
fn add_generic_filter(&mut self, rule: CosmeticFilter) {
141-
if rule.mask.contains(CosmeticFilterMask::IS_CLASS_SELECTOR) {
142-
if let Some(key) = &rule.key {
143-
let key = key.clone();
144-
if rule.mask.contains(CosmeticFilterMask::IS_SIMPLE) {
145-
self.simple_class_rules.insert(key);
142+
if rule.selector.starts_with('.') {
143+
if let Some(key) = key_from_selector(&rule.selector) {
144+
assert!(key.starts_with('.'));
145+
let class = key[1..].to_string();
146+
if key == rule.selector {
147+
self.simple_class_rules.insert(class);
146148
} else {
147-
if let Some(bucket) = self.complex_class_rules.get_mut(&key) {
149+
if let Some(bucket) = self.complex_class_rules.get_mut(&class) {
148150
bucket.push(rule.selector);
149151
} else {
150-
self.complex_class_rules.insert(key, vec![rule.selector]);
152+
self.complex_class_rules.insert(class, vec![rule.selector]);
151153
}
152154
}
153155
}
154-
} else if rule.mask.contains(CosmeticFilterMask::IS_ID_SELECTOR) {
155-
if let Some(key) = &rule.key {
156-
let key = key.clone();
157-
if rule.mask.contains(CosmeticFilterMask::IS_SIMPLE) {
158-
self.simple_id_rules.insert(key);
156+
} else if rule.selector.starts_with('#') {
157+
if let Some(key) = key_from_selector(&rule.selector) {
158+
assert!(key.starts_with('#'));
159+
let id = key[1..].to_string();
160+
if key == rule.selector {
161+
self.simple_id_rules.insert(id);
159162
} else {
160-
if let Some(bucket) = self.complex_id_rules.get_mut(&key) {
163+
if let Some(bucket) = self.complex_id_rules.get_mut(&id) {
161164
bucket.push(rule.selector);
162165
} else {
163-
self.complex_id_rules.insert(key, vec![rule.selector]);
166+
self.complex_id_rules.insert(id, vec![rule.selector]);
164167
}
165168
}
166169
}
@@ -553,6 +556,103 @@ fn hostname_domain_hashes(hostname: &str, domain: &str) -> (Vec<Hash>, Vec<Hash>
553556
(request_entities, request_hostnames)
554557
}
555558

559+
/// Returns the first token of a CSS selector.
560+
///
561+
/// This should only be called once `selector` has been verified to start with either a "#" or "."
562+
/// character.
563+
fn key_from_selector(selector: &str) -> Option<String> {
564+
use once_cell::sync::Lazy;
565+
use regex::Regex;
566+
567+
static RE_PLAIN_SELECTOR: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[#.][\w\\-]+").unwrap());
568+
static RE_PLAIN_SELECTOR_ESCAPED: Lazy<Regex> =
569+
Lazy::new(|| Regex::new(r"^[#.](?:\\[0-9A-Fa-f]+ |\\.|\w|-)+").unwrap());
570+
static RE_ESCAPE_SEQUENCE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\\([0-9A-Fa-f]+ |.)").unwrap());
571+
572+
// If there are no escape characters in the selector, just take the first class or id token.
573+
let mat = RE_PLAIN_SELECTOR.find(selector);
574+
if let Some(location) = mat {
575+
let key = &location.as_str();
576+
if find_char(b'\\', key.as_bytes()).is_none() {
577+
return Some((*key).into());
578+
}
579+
} else {
580+
return None;
581+
}
582+
583+
// Otherwise, the characters in the selector must be escaped.
584+
let mat = RE_PLAIN_SELECTOR_ESCAPED.find(selector);
585+
if let Some(location) = mat {
586+
let mut key = String::with_capacity(selector.len());
587+
let escaped = &location.as_str();
588+
let mut beginning = 0;
589+
let mat = RE_ESCAPE_SEQUENCE.captures_iter(escaped);
590+
for capture in mat {
591+
// Unwrap is safe because the 0th capture group is the match itself
592+
let location = capture.get(0).unwrap();
593+
key += &escaped[beginning..location.start()];
594+
beginning = location.end();
595+
// Unwrap is safe because there is a capture group specified in the regex
596+
let capture = capture.get(1).unwrap().as_str();
597+
if capture.chars().count() == 1 { // Check number of unicode characters rather than byte length
598+
key += capture;
599+
} else {
600+
// This u32 conversion can overflow
601+
let codepoint = u32::from_str_radix(&capture[..capture.len() - 1], 16).ok()?;
602+
603+
// Not all u32s are valid Unicode codepoints
604+
key += &core::char::from_u32(codepoint)?
605+
.to_string();
606+
}
607+
}
608+
Some(key + &escaped[beginning..])
609+
} else {
610+
None
611+
}
612+
}
613+
614+
#[cfg(test)]
mod key_from_selector_tests {
    use super::key_from_selector;

    /// Checks that every `(selector, expected key)` pair in `cases` holds, in order.
    fn assert_keys(cases: &[(&str, &str)]) {
        for &(selector, expected) in cases {
            assert_eq!(key_from_selector(selector).unwrap(), expected);
        }
    }

    /// Selectors without backslashes: the key is the first token, unchanged.
    #[test]
    fn no_escapes() {
        assert_keys(&[
            (r#"#selector"#, "#selector"),
            (r#"#ad-box[href="https://popads.net"]"#, "#ad-box"),
            (r#".p"#, ".p"),
            (r#".ad #ad.adblockblock"#, ".ad"),
            (r#"#container.contained"#, "#container"),
        ]);
    }

    /// Backslash-escaped literal characters are unescaped in the key.
    #[test]
    fn escaped_characters() {
        assert_keys(&[
            (r"#Meebo\:AdElement\.Root", "#Meebo:AdElement.Root"),
            (r"#\ Banner\ Ad\ -\ 590\ x\ 90", "# Banner Ad - 590 x 90"),
            (r"#\ rek", "# rek"),
            (r#"#\:rr .nH[role="main"] .mq:first-child"#, "#:rr"),
            (r#"#adspot-300x600\,300x250-pos-1"#, "#adspot-300x600,300x250-pos-1"),
            (r#"#adv_\'146\'"#, "#adv_\'146\'"),
            (r#"#oas-mpu-left\<\/div\>"#, "#oas-mpu-left</div>"),
            (r#".Trsp\(op\).Trsdu\(3s\)"#, ".Trsp(op)"),
        ]);
    }

    /// Hexadecimal escape codes (terminated by a space) are decoded to their character.
    #[test]
    fn escape_codes() {
        assert_keys(&[
            (r#"#\5f _mom_ad_12"#, "#__mom_ad_12"),
            (r#"#\5f _nq__hh[style="display:block!important"]"#, "#__nq__hh"),
            (r#"#\31 000-014-ros"#, "#1000-014-ros"),
            (r#"#\33 00X250ad"#, "#300X250ad"),
            (r#"#\5f _fixme"#, "#__fixme"),
            (r#"#\37 28ad"#, "#728ad"),
        ]);
    }

    /// Hex escapes that overflow a u32 or are not valid codepoints yield no key.
    #[test]
    fn bad_escapes() {
        let rejected = [
            r#"#\5ffffffffff overflows"#,
            r#"#\5fffffff is_too_large"#,
        ];
        for &selector in &rejected {
            assert!(key_from_selector(selector).is_none());
        }
    }
}
655+
556656
#[cfg(test)]
557657
mod cosmetic_cache_tests {
558658
use super::*;

0 commit comments

Comments
 (0)