diff --git a/src/bytes.rs b/src/bytes.rs index a374452..ad00ac7 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -7,9 +7,8 @@ use std::sync::Arc; use log::debug; use pcre2_sys::{ PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, - PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, - PCRE2_NEWLINE_ANYCRLF, -}; + PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, PCRE2_NEWLINE_ANYCRLF, + PCRE2_PARTIAL_HARD}; use thread_local::CachedThreadLocal; use crate::error::Error; @@ -76,6 +75,8 @@ struct Config { utf_check: bool, /// use pcre2_jit_compile jit: JITChoice, + /// use JIT for partial matching + jit_partial_matching: bool, /// Match-time specific configuration knobs. match_config: MatchConfig, } @@ -102,6 +103,7 @@ impl Default for Config { utf: false, utf_check: true, jit: JITChoice::Never, + jit_partial_matching: false, match_config: MatchConfig::default(), } } @@ -156,10 +158,10 @@ impl RegexBuilder { match self.config.jit { JITChoice::Never => {} // fallthrough JITChoice::Always => { - code.jit_compile()?; + code.jit_compile(self.config.jit_partial_matching)?; } JITChoice::Attempt => { - if let Err(err) = code.jit_compile() { + if let Err(err) = code.jit_compile(self.config.jit_partial_matching) { debug!("JIT compilation failed: {}", err); } } @@ -315,6 +317,9 @@ impl RegexBuilder { /// This generally speeds up matching quite a bit. The downside is that it /// can increase the time it takes to compile a pattern. /// + /// This option enables JIT only for complete matching. + /// To enable JIT additionally for partial matching, enable `jit_partial_matching`. + /// /// If the JIT isn't available or if JIT compilation returns an error, /// then a debug message with the error will be emitted and the regex will /// otherwise silently fall back to non-JIT matching. @@ -329,6 +334,13 @@ impl RegexBuilder { self } + /// Additionally enable PCRE2's JIT for partial matching. + /// This works only together with `jit` set to true. 
+ pub fn jit_partial_matching(&mut self, yes: bool) -> &mut RegexBuilder { + self.config.jit_partial_matching = yes; + self + } + /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is /// not enabled, then this has no effect. /// @@ -427,6 +439,27 @@ impl Regex { self.is_match_at(subject, 0) } + /// Returns true if and only if the regex fully or partially matches the + /// subject string given. A partial match occurs when there is a match + /// up to the end of a subject string, but more characters are needed to + /// match the entire pattern. + /// + /// # Example + /// + /// Test if given string can be a beginning of a valid telephone number: + /// + /// ```rust + /// # fn example() -> Result<(), ::pcre2::Error> { + /// use pcre2::bytes::Regex; + /// + /// let text = b"123-456-"; + /// assert!(Regex::new(r"^\d{3}-\d{3}-\d{3}")?.is_partial_match(text)?); + /// # Ok(()) }; example().unwrap() + /// ``` + pub fn is_partial_match(&self, subject: &[u8]) -> Result<bool, Error> { + self.is_partial_match_at(subject, 0) + } + /// Returns the start and end byte range of the leftmost-first match in /// `subject`. If no match exists, then `None` is returned. /// @@ -596,16 +629,19 @@ impl Regex { /// Advanced or "lower level" search methods. impl Regex { + /// Returns the same as is_match, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. 
- pub fn is_match_at( + /// + fn is_match_at_imp( &self, subject: &[u8], start: usize, + partial: bool, ) -> Result<bool, Error> { assert!( start <= subject.len(), @@ -618,6 +654,9 @@ impl Regex { if !self.config.utf_check { options |= PCRE2_NO_UTF_CHECK; } + if partial { + options |= PCRE2_PARTIAL_HARD; + } let match_data = self.match_data(); let mut match_data = match_data.borrow_mut(); @@ -628,6 +667,34 @@ impl Regex { Ok(unsafe { match_data.find(&self.code, subject, start, options)? }) } + /// Returns the same as is_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_match_at( + &self, + subject: &[u8], + start: usize, + ) -> Result<bool, Error> { + self.is_match_at_imp(subject, start, false) + } + + /// Returns the same as is_partial_match, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn is_partial_match_at( + &self, + subject: &[u8], + start: usize, + ) -> Result<bool, Error> { + self.is_match_at_imp(subject, start, true) + } + /// Returns the same as find, but starts the search at the given /// offset. 
/// @@ -1150,6 +1217,18 @@ mod tests { assert!(re.is_match(b("Β")).unwrap()); } + #[test] + fn partial() { + let re = RegexBuilder::new() + .build("ab$") + .unwrap(); + + assert!(re.is_partial_match(b("a")).unwrap()); + assert!(re.is_partial_match(b("ab")).unwrap()); + assert!(!re.is_partial_match(b("abc")).unwrap()); + assert!(!re.is_partial_match(b("b")).unwrap()); + } + #[test] fn crlf() { let re = RegexBuilder::new() @@ -1247,6 +1326,19 @@ mod tests { } } + #[test] + fn jit_partial_matching() { + if is_jit_available() { + let re = RegexBuilder::new() + .jit(true) + .jit_partial_matching(true) + .build(r"[0-9][0-9][0-9]") + .unwrap(); + assert!(!re.is_match(b("12")).unwrap()); + assert!(re.is_partial_match(b("12")).unwrap()); + } + } + // Unlike jit4lyfe, this tests that everything works when requesting the // JIT only if it's available. In jit4lyfe, we require the JIT or fail. // If the JIT isn't available, then in this test, we simply don't use it. diff --git a/src/ffi.rs b/src/ffi.rs index 73bc39c..eb432bf 100644 --- a/src/ffi.rs +++ b/src/ffi.rs @@ -89,11 +89,18 @@ impl Code { /// JIT compile this code object. /// + /// If partial is set, PCRE2_JIT_PARTIAL_HARD option flag is added + /// to generate code for partial matching. + /// /// If there was a problem performing JIT compilation, then this returns /// an error. - pub fn jit_compile(&mut self) -> Result<(), Error> { + pub fn jit_compile(&mut self, partial: bool) -> Result<(), Error> { + let mut options = PCRE2_JIT_COMPLETE; + if partial { + options |= PCRE2_JIT_PARTIAL_HARD; + } let error_code = unsafe { - pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE) + pcre2_jit_compile_8(self.code, options) }; if error_code == 0 { self.compiled_jit = true; @@ -390,6 +397,10 @@ impl MatchData { /// /// This returns false if no match occurred. 
/// + /// If partial match was requested by PCRE2_PARTIAL_HARD or + /// PCRE2_PARTIAL_SOFT option, this returns true if either a partial match + /// or a complete match occurred. + /// /// Match offsets can be extracted via `ovector`. /// /// # Safety @@ -427,6 +438,9 @@ impl MatchData { ); if rc == PCRE2_ERROR_NOMATCH { Ok(false) + } else if rc == PCRE2_ERROR_PARTIAL && + options & (PCRE2_PARTIAL_HARD | PCRE2_PARTIAL_SOFT) != 0 { + Ok(true) } else if rc > 0 { Ok(true) } else {