From e15c59ac6e73cc0e7bce7f823004ebae759076ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= Date: Sat, 9 May 2020 13:58:55 +0200 Subject: [PATCH 1/3] Partial match support New methods: Regex::is_partial_match Regex::is_partial_match_at --- src/bytes.rs | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++- src/ffi.rs | 5 +++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index a374452..6727d64 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -8,7 +8,7 @@ use log::debug; use pcre2_sys::{ PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, - PCRE2_NEWLINE_ANYCRLF, + PCRE2_NEWLINE_ANYCRLF, PCRE2_PARTIAL_HARD }; use thread_local::CachedThreadLocal; @@ -427,6 +427,25 @@ impl Regex { self.is_match_at(subject, 0) } + /// Returns true if and only if the regex fully or partially matches the subject string given. + /// A partial match occurs when there is a match up to the end of a subject string, + /// but more characters are needed to match the entire pattern. + /// + /// # Example + /// + /// Test if given string can be a beginning of a valid telephone number: + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let text = b"123-456-"; + /// assert!(Regex::new(r"^\d{3}-\d{3}-\d{3}")?.is_partial_match(text)?); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn is_partial_match(&self, subject: &[u8]) -> Result { + self.is_partial_match_at(subject, 0) + } + /// Returns the start and end byte range of the leftmost-first match in /// `subject`. If no match exists, then `None` is returned. /// @@ -628,6 +647,39 @@ impl Regex { Ok(unsafe { match_data.find(&self.code, subject, start, options)? }) } + /// Returns the same as is_partial_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_partial_match_at( + &self, + subject: &[u8], + start: usize, + ) -> Result { + assert!( + start <= subject.len(), + "start ({}) must be <= subject.len() ({})", + start, + subject.len() + ); + + let mut options = PCRE2_PARTIAL_HARD; + if !self.config.utf_check { + options |= PCRE2_NO_UTF_CHECK; + } + + let match_data = self.match_data(); + let mut match_data = match_data.borrow_mut(); + // SAFETY: The only unsafe PCRE2 option we potentially use here is + // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the + // `disable_utf_check` method, which propagates the safety contract to + // the caller. + Ok(unsafe { match_data.find(&self.code, subject, start, options)? }) + } + + /// Returns the same as find, but starts the search at the given /// offset. /// @@ -1150,6 +1202,18 @@ mod tests { assert!(re.is_match(b("Β")).unwrap()); } + #[test] + fn partial() { + let re = RegexBuilder::new() + .build("ab$") + .unwrap(); + + assert!(re.is_partial_match(b("a")).unwrap()); + assert!(re.is_partial_match(b("ab")).unwrap()); + assert!(!re.is_partial_match(b("abc")).unwrap()); + assert!(!re.is_partial_match(b("b")).unwrap()); + } + #[test] fn crlf() { let re = RegexBuilder::new() diff --git a/src/ffi.rs b/src/ffi.rs index 73bc39c..6cce927 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -93,7 +93,7 @@ impl Code { /// an error. pub fn jit_compile(&mut self) -> Result<(), Error> { let error_code = unsafe { - pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE) + pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_HARD) }; if error_code == 0 { self.compiled_jit = true; @@ -427,6 +427,9 @@ impl MatchData { ); if rc == PCRE2_ERROR_NOMATCH { Ok(false) + } else if rc == PCRE2_ERROR_PARTIAL && + options & (PCRE2_PARTIAL_HARD | PCRE2_PARTIAL_SOFT) != 0 { + Ok(true) } else if rc > 0 { Ok(true) } else { From 112f4674661653bad19e154d9821a8aee2c41d52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= Date: Sun, 21 Feb 2021 22:03:52 +0100 Subject: [PATCH 2/3] Addressed review comments. Improved formatting, removed duplicate code, PCRE2_JIT_PARTIAL_HARD option is now configurable and disabled by default. --- src/bytes.rs | 87 +++++++++++++++++++++++++++++++++------------------- src/ffi.rs | 15 +++++++-- 2 files changed, 69 insertions(+), 33 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 6727d64..f47c568 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -5,11 +5,7 @@ use std::ops::Index; use std::sync::Arc; use log::debug; -use pcre2_sys::{ - PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, - PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, - PCRE2_NEWLINE_ANYCRLF, PCRE2_PARTIAL_HARD -}; +use pcre2_sys::{PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, PCRE2_NEWLINE_ANYCRLF, PCRE2_PARTIAL_HARD}; use thread_local::CachedThreadLocal; use crate::error::Error; @@ -76,6 +72,8 @@ struct Config { utf_check: bool, /// use pcre2_jit_compile jit: JITChoice, + /// use JIT for partial matching + jit_partial_matching: bool, /// Match-time specific configuration knobs. match_config: MatchConfig, } @@ -102,6 +100,7 @@ impl Default for Config { utf: false, utf_check: true, jit: JITChoice::Never, + jit_partial_matching: false, match_config: MatchConfig::default(), } } @@ -156,10 +155,10 @@ impl RegexBuilder { match self.config.jit { JITChoice::Never => {} // fallthrough JITChoice::Always => { - code.jit_compile()?; + code.jit_compile(self.config.jit_partial_matching)?; } JITChoice::Attempt => { - if let Err(err) = code.jit_compile() { + if let Err(err) = code.jit_compile(self.config.jit_partial_matching) { debug!("JIT compilation failed: {}", err); } } @@ -315,6 +314,9 @@ impl RegexBuilder { /// This generally speeds up matching quite a bit. The downside is that it /// can increase the time it takes to compile a pattern. /// + /// This option enables JIT only for complete matching. + /// To enable JIT additionally for partial matching, enable `jit_partial_matching`. + /// /// If the JIT isn't available or if JIT compilation returns an error, /// then a debug message with the error will be emitted and the regex will /// otherwise silently fall back to non-JIT matching. @@ -329,6 +331,13 @@ impl RegexBuilder { self } + /// Additionally enable PCRE2's JIT for partial matching. + /// This works only together with `jit` set to true. + pub fn jit_partial_matching(&mut self, yes: bool) -> &mut RegexBuilder { + self.config.jit_partial_matching = yes; + self + } + /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is /// not enabled, then this has no effect. /// @@ -427,13 +436,15 @@ impl Regex { self.is_match_at(subject, 0) } - /// Returns true if and only if the regex fully or partially matches the subject string given. - /// A partial match occurs when there is a match up to the end of a subject string, - /// but more characters are needed to match the entire pattern. + /// Returns true if and only if the regex fully or partially matches the + /// subject string given. A partial match occurs when there is a match + /// up to the end of a subject string, but more characters are needed to + /// match the entire pattern. /// /// # Example /// /// Test if given string can be a beginning of a valid telephone number: + /// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use pcre2::bytes::Regex; @@ -615,16 +626,19 @@ impl Regex { /// Advanced or "lower level" search methods. impl Regex { + /// Returns the same as is_match, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. - pub fn is_match_at( + /// + fn is_match_at_imp( &self, subject: &[u8], start: usize, + partial: bool, ) -> Result { assert!( start <= subject.len(), @@ -637,6 +651,9 @@ impl Regex { if !self.config.utf_check { options |= PCRE2_NO_UTF_CHECK; } + if partial { + options |= PCRE2_PARTIAL_HARD; + } let match_data = self.match_data(); let mut match_data = match_data.borrow_mut(); @@ -647,6 +664,20 @@ impl Regex { Ok(unsafe { match_data.find(&self.code, subject, start, options)? }) } + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at( + &self, + subject: &[u8], + start: usize, + ) -> Result { + self.is_match_at_imp(subject, start, false) + } + /// Returns the same as is_partial_match, but starts the search at the given /// offset. /// @@ -658,28 +689,9 @@ impl Regex { subject: &[u8], start: usize, ) -> Result { - assert!( - start <= subject.len(), - "start ({}) must be <= subject.len() ({})", - start, - subject.len() - ); - - let mut options = PCRE2_PARTIAL_HARD; - if !self.config.utf_check { - options |= PCRE2_NO_UTF_CHECK; - } - - let match_data = self.match_data(); - let mut match_data = match_data.borrow_mut(); - // SAFETY: The only unsafe PCRE2 option we potentially use here is - // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the - // `disable_utf_check` method, which propagates the safety contract to - // the caller. - Ok(unsafe { match_data.find(&self.code, subject, start, options)? }) + self.is_match_at_imp(subject, start, true) } - /// Returns the same as find, but starts the search at the given /// offset. /// @@ -1311,6 +1323,19 @@ mod tests { } } + #[test] + fn jit_partial_matching() { + if is_jit_available() { + let re = RegexBuilder::new() + .jit(true) + .jit_partial_matching(true) + .build(r"[0-9][0-9][0-9]") + .unwrap(); + assert!(!re.is_match(b("12")).unwrap()); + assert!(re.is_partial_match(b("12")).unwrap()); + } + } + // Unlike jit4lyfe, this tests that everything works when requesting the // JIT only if it's available. In jit4lyfe, we require the JIT or fail. // If the JIT isn't available, then in this test, we simply don't use it. diff --git a/src/ffi.rs b/src/ffi.rs index 6cce927..eb432bf 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -89,11 +89,18 @@ impl Code { /// JIT compile this code object. /// + /// If partial is set, PCRE2_JIT_PARTIAL_HARD option flag is added + /// to generate code for partial matching. + /// /// If there was a problem performing JIT compilation, then this returns /// an error. - pub fn jit_compile(&mut self) -> Result<(), Error> { + pub fn jit_compile(&mut self, partial: bool) -> Result<(), Error> { + let mut options = PCRE2_JIT_COMPLETE; + if partial { + options |= PCRE2_JIT_PARTIAL_HARD; + } let error_code = unsafe { - pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_HARD) + pcre2_jit_compile_8(self.code, options) }; if error_code == 0 { self.compiled_jit = true; @@ -390,6 +397,10 @@ impl MatchData { /// /// This returns false if no match occurred. /// + /// If partial match was requested by PCRE2_PARTIAL_HARD or + /// PCRE2_PARTIAL_SOFT option, this returns true if either a partial match + /// or a complete match occurred. + /// /// Match offsets can be extracted via `ovector`. /// /// # Safety From b2932ca6a97a4d6bae7a3aaf9769052e2a6997c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= Date: Sun, 21 Feb 2021 22:08:28 +0100 Subject: [PATCH 3/3] More formatting. --- src/bytes.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/bytes.rs b/src/bytes.rs index f47c568..ad00ac7 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -5,7 +5,10 @@ use std::ops::Index; use std::sync::Arc; use log::debug; -use pcre2_sys::{PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, PCRE2_NEWLINE_ANYCRLF, PCRE2_PARTIAL_HARD}; +use pcre2_sys::{ + PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, + PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, PCRE2_NEWLINE_ANYCRLF, + PCRE2_PARTIAL_HARD}; use thread_local::CachedThreadLocal; use crate::error::Error;