@@ -15,6 +15,7 @@ use crate::utils::Hash;
15
15
16
16
use std:: collections:: { HashMap , HashSet } ;
17
17
18
+ use memchr:: memchr as find_char;
18
19
use serde:: { Deserialize , Serialize } ;
19
20
20
21
/// Contains cosmetic filter information intended to be used on a particular URL.
@@ -138,29 +139,31 @@ impl CosmeticFilterCache {
138
139
139
140
/// Add a filter, assuming it has already been determined to be a generic rule
140
141
fn add_generic_filter ( & mut self , rule : CosmeticFilter ) {
141
- if rule. mask . contains ( CosmeticFilterMask :: IS_CLASS_SELECTOR ) {
142
- if let Some ( key) = & rule. key {
143
- let key = key. clone ( ) ;
144
- if rule. mask . contains ( CosmeticFilterMask :: IS_SIMPLE ) {
145
- self . simple_class_rules . insert ( key) ;
142
+ if rule. selector . starts_with ( '.' ) {
143
+ if let Some ( key) = key_from_selector ( & rule. selector ) {
144
+ assert ! ( key. starts_with( '.' ) ) ;
145
+ let class = key[ 1 ..] . to_string ( ) ;
146
+ if key == rule. selector {
147
+ self . simple_class_rules . insert ( class) ;
146
148
} else {
147
- if let Some ( bucket) = self . complex_class_rules . get_mut ( & key ) {
149
+ if let Some ( bucket) = self . complex_class_rules . get_mut ( & class ) {
148
150
bucket. push ( rule. selector ) ;
149
151
} else {
150
- self . complex_class_rules . insert ( key , vec ! [ rule. selector] ) ;
152
+ self . complex_class_rules . insert ( class , vec ! [ rule. selector] ) ;
151
153
}
152
154
}
153
155
}
154
- } else if rule. mask . contains ( CosmeticFilterMask :: IS_ID_SELECTOR ) {
155
- if let Some ( key) = & rule. key {
156
- let key = key. clone ( ) ;
157
- if rule. mask . contains ( CosmeticFilterMask :: IS_SIMPLE ) {
158
- self . simple_id_rules . insert ( key) ;
156
+ } else if rule. selector . starts_with ( '#' ) {
157
+ if let Some ( key) = key_from_selector ( & rule. selector ) {
158
+ assert ! ( key. starts_with( '#' ) ) ;
159
+ let id = key[ 1 ..] . to_string ( ) ;
160
+ if key == rule. selector {
161
+ self . simple_id_rules . insert ( id) ;
159
162
} else {
160
- if let Some ( bucket) = self . complex_id_rules . get_mut ( & key ) {
163
+ if let Some ( bucket) = self . complex_id_rules . get_mut ( & id ) {
161
164
bucket. push ( rule. selector ) ;
162
165
} else {
163
- self . complex_id_rules . insert ( key , vec ! [ rule. selector] ) ;
166
+ self . complex_id_rules . insert ( id , vec ! [ rule. selector] ) ;
164
167
}
165
168
}
166
169
}
@@ -553,6 +556,103 @@ fn hostname_domain_hashes(hostname: &str, domain: &str) -> (Vec<Hash>, Vec<Hash>
553
556
( request_entities, request_hostnames)
554
557
}
555
558
559
+ /// Returns the first token of a CSS selector.
560
+ ///
561
+ /// This should only be called once `selector` has been verified to start with either a "#" or "."
562
+ /// character.
563
+ fn key_from_selector ( selector : & str ) -> Option < String > {
564
+ use once_cell:: sync:: Lazy ;
565
+ use regex:: Regex ;
566
+
567
+ static RE_PLAIN_SELECTOR : Lazy < Regex > = Lazy :: new ( || Regex :: new ( r"^[#.][\w\\-]+" ) . unwrap ( ) ) ;
568
+ static RE_PLAIN_SELECTOR_ESCAPED : Lazy < Regex > =
569
+ Lazy :: new ( || Regex :: new ( r"^[#.](?:\\[0-9A-Fa-f]+ |\\.|\w|-)+" ) . unwrap ( ) ) ;
570
+ static RE_ESCAPE_SEQUENCE : Lazy < Regex > = Lazy :: new ( || Regex :: new ( r"\\([0-9A-Fa-f]+ |.)" ) . unwrap ( ) ) ;
571
+
572
+ // If there are no escape characters in the selector, just take the first class or id token.
573
+ let mat = RE_PLAIN_SELECTOR . find ( selector) ;
574
+ if let Some ( location) = mat {
575
+ let key = & location. as_str ( ) ;
576
+ if find_char ( b'\\' , key. as_bytes ( ) ) . is_none ( ) {
577
+ return Some ( ( * key) . into ( ) ) ;
578
+ }
579
+ } else {
580
+ return None ;
581
+ }
582
+
583
+ // Otherwise, the characters in the selector must be escaped.
584
+ let mat = RE_PLAIN_SELECTOR_ESCAPED . find ( selector) ;
585
+ if let Some ( location) = mat {
586
+ let mut key = String :: with_capacity ( selector. len ( ) ) ;
587
+ let escaped = & location. as_str ( ) ;
588
+ let mut beginning = 0 ;
589
+ let mat = RE_ESCAPE_SEQUENCE . captures_iter ( escaped) ;
590
+ for capture in mat {
591
+ // Unwrap is safe because the 0th capture group is the match itself
592
+ let location = capture. get ( 0 ) . unwrap ( ) ;
593
+ key += & escaped[ beginning..location. start ( ) ] ;
594
+ beginning = location. end ( ) ;
595
+ // Unwrap is safe because there is a capture group specified in the regex
596
+ let capture = capture. get ( 1 ) . unwrap ( ) . as_str ( ) ;
597
+ if capture. chars ( ) . count ( ) == 1 { // Check number of unicode characters rather than byte length
598
+ key += capture;
599
+ } else {
600
+ // This u32 conversion can overflow
601
+ let codepoint = u32:: from_str_radix ( & capture[ ..capture. len ( ) - 1 ] , 16 ) . ok ( ) ?;
602
+
603
+ // Not all u32s are valid Unicode codepoints
604
+ key += & core:: char:: from_u32 ( codepoint) ?
605
+ . to_string ( ) ;
606
+ }
607
+ }
608
+ Some ( key + & escaped[ beginning..] )
609
+ } else {
610
+ None
611
+ }
612
+ }
613
+
614
/// Unit tests for `key_from_selector`, grouped by the kind of escaping present in the input.
#[cfg(test)]
mod key_from_selector_tests {
    use super::key_from_selector;

    /// Selectors whose leading token contains no backslash are returned as-is, cut at the first
    /// character that cannot be part of the token.
    #[test]
    fn no_escapes() {
        assert_eq!(key_from_selector(r#"#selector"#).unwrap(), "#selector");
        assert_eq!(key_from_selector(r#"#ad-box[href="https://popads.net"]"#).unwrap(), "#ad-box");
        assert_eq!(key_from_selector(r#".p"#).unwrap(), ".p");
        assert_eq!(key_from_selector(r#".ad #ad.adblockblock"#).unwrap(), ".ad");
        assert_eq!(key_from_selector(r#"#container.contained"#).unwrap(), "#container");
    }

    /// A backslash followed by a single character yields that character literally.
    #[test]
    fn escaped_characters() {
        assert_eq!(key_from_selector(r"#Meebo\:AdElement\.Root").unwrap(), "#Meebo:AdElement.Root");
        assert_eq!(
            key_from_selector(r"#\ Banner\ Ad\ -\ 590\ x\ 90").unwrap(),
            "# Banner Ad - 590 x 90"
        );
        assert_eq!(key_from_selector(r"#\ rek").unwrap(), "# rek");
        assert_eq!(
            key_from_selector(r#"#\:rr .nH[role="main"] .mq:first-child"#).unwrap(),
            "#:rr"
        );
        assert_eq!(
            key_from_selector(r#"#adspot-300x600\,300x250-pos-1"#).unwrap(),
            "#adspot-300x600,300x250-pos-1"
        );
        // Each `\'` in the input unescapes to a bare apostrophe with nothing inserted around it.
        assert_eq!(key_from_selector(r#"#adv_\'146\'"#).unwrap(), "#adv_'146'");
        assert_eq!(key_from_selector(r#"#oas-mpu-left\<\/div\>"#).unwrap(), "#oas-mpu-left</div>");
        assert_eq!(key_from_selector(r#".Trsp\(op\).Trsdu\(3s\)"#).unwrap(), ".Trsp(op)");
    }

    /// A backslash followed by hex digits and a terminating space yields the character with that
    /// codepoint; the terminating space is consumed.
    #[test]
    fn escape_codes() {
        assert_eq!(key_from_selector(r#"#\5f _mom_ad_12"#).unwrap(), "#__mom_ad_12");
        assert_eq!(
            key_from_selector(r#"#\5f _nq__hh[style="display:block!important"]"#).unwrap(),
            "#__nq__hh"
        );
        assert_eq!(key_from_selector(r#"#\31 000-014-ros"#).unwrap(), "#1000-014-ros");
        assert_eq!(key_from_selector(r#"#\33 00X250ad"#).unwrap(), "#300X250ad");
        assert_eq!(key_from_selector(r#"#\5f _fixme"#).unwrap(), "#__fixme");
        assert_eq!(key_from_selector(r#"#\37 28ad"#).unwrap(), "#728ad");
    }

    /// Hex escapes that overflow a `u32`, or that are not valid Unicode scalar values, make the
    /// whole extraction fail rather than panic.
    #[test]
    fn bad_escapes() {
        assert!(key_from_selector(r#"#\5ffffffffff overflows"#).is_none());
        assert!(key_from_selector(r#"#\5fffffff is_too_large"#).is_none());
    }
}
655
+
556
656
#[ cfg( test) ]
557
657
mod cosmetic_cache_tests {
558
658
use super :: * ;
0 commit comments