diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b93d06c..08fc4d3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ on: env: # Give nice tracebacks on segfaults. PYTHONFAULTHANDLER: "true" - MATURIN_VERSION: "1.8.3" + MATURIN_VERSION: "1.9.6" jobs: @@ -21,7 +21,7 @@ jobs: name: "${{ matrix.os }}: Python ${{ matrix.python-version }}" strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "3.14t"] os: ["ubuntu-latest", "macos-13", "windows-latest", "ubuntu-24.04-arm"] env: @@ -33,7 +33,7 @@ jobs: with: # We need tags to get the correct code version: fetch-depth: 0 - - uses: "actions/setup-python@v5" + - uses: "actions/setup-python@v6" with: python-version: "${{ matrix.python-version }}" - uses: "dtolnay/rust-toolchain@stable" diff --git a/CHANGELOG.md b/CHANGELOG.md index 821094c..a1d163d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## 1.0 + +* Added support for Python 3.14 and free-threaded Python. +* In order to support free-threaded Python, move the burden of thread safety onto users. + Specifically: you must not mutate byte arrays and the like that are passed to `BytesAhoCorasick` APIs while those APIs are running. + ## 0.22.2 * Update Rust dependencies. 
diff --git a/Cargo.lock b/Cargo.lock index 0da673e..65b2b8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,9 +55,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.174" +version = "0.2.176" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" [[package]] name = "memchr" @@ -88,9 +88,9 @@ checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" [[package]] name = "proc-macro2" -version = "1.0.95" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" dependencies = [ "unicode-ident", ] @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.104" +version = "2.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" dependencies = [ "proc-macro2", "quote", @@ -178,15 +178,15 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" [[package]] name = "unindent" diff --git a/README.md b/README.md index 299ab90..022b66d 100644 --- a/README.md 
+++ b/README.md @@ -64,6 +64,9 @@ You can construct a `AhoCorasick` object from any iterable (including generators You can also search `bytes`, `bytearray`, `memoryview`, and other objects supporting the Python buffer API. +> **IMPORTANT:** If you are searching a mutable buffer, you **must not mutate it in another thread** while `find_matches_as_indexes()` is running. +> Similarly, the patterns cannot be mutated while the `BytesAhoCorasick` object is being constructed. + ```python >>> patterns = [b"hello", b"world"] >>> ac = ahocorasick_rs.BytesAhoCorasick(patterns) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 357670f..c2f8610 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.89" +channel = "1.90" components = ["rustfmt", "clippy"] \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 96c5233..389c233 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,7 @@ use pyo3::{ buffer::{PyBuffer, ReadOnlyCell}, exceptions::{PyTypeError, PyValueError}, prelude::*, - types::{PyBytes, PyList, PyString}, + types::{PyList, PyString}, }; /// Search for multiple pattern strings against a single haystack string. 
@@ -258,7 +258,8 @@ impl PyAhoCorasick { let py = self_.py(); let matches = get_matches(&self_.ac_impl, haystack.as_bytes(), overlapping)?; let matches = py.detach(|| matches.collect::>().into_iter()); - let result = match self_.patterns { + + match self_.patterns { Some(ref patterns) => { PyList::new(py, matches.map(|m| patterns[m.pattern()].clone_ref(py))) } @@ -266,8 +267,7 @@ impl PyAhoCorasick { py, matches.map(|m| PyString::new(py, &haystack[m.start()..m.end()])), ), - }; - result + } } } @@ -282,7 +282,7 @@ impl<'py> TryFrom> for PyBufferBytes<'py> { // Get a PyBufferBytes from a Python object fn try_from(obj: Bound<'py, PyAny>) -> PyResult { - let buffer = PyBuffer::::get(&obj).map_err(PyErr::from)?; + let buffer = PyBuffer::::get(&obj)?; if buffer.dimensions() > 1 { return Err(PyTypeError::new_err( @@ -328,10 +328,12 @@ impl<'a> AsRef<[u8]> for PyBufferBytes<'a> { // wouldn't be able to prevent calling back into Python while holding // this reference, which might also result in a mutation). // - // This effectively means that it's only safe to hold onto the reference - // returned from this function as long as we don't release the GIL and - // don't call back into Python code while the reference is alive. - // See also https://github.com/PyO3/pyo3/issues/2824 + // In addition, in a free-threaded world there is no GIL at all to + // prevent mutation. + // + // Following the lead of `pyo3-numpy`, we deal with this by documenting + // to the user that the buffer cannot be mutated while it is passed to + // our API. See also https://github.com/PyO3/pyo3/issues/2824 unsafe { std::mem::transmute(slice) } } } @@ -348,6 +350,12 @@ impl<'a> AsRef<[u8]> for PyBufferBytes<'a> { /// finished. /// * ``matchkind``: Defaults to ``"MATCHKING_STANDARD"``. /// * ``implementation``: The underlying type of automaton to use for Aho-Corasick. 
+/// +/// IMPORTANT: If you are passing in patterns that are mutable buffers, you MUST +/// NOT mutate them in another thread while constructing this object. Doing so +/// will result in undefined behavior. Once the ``BytesAhoCorasick`` object is +/// constructed, however, they can be mutated since no references will be kept +/// to them. #[pyclass(name = "BytesAhoCorasick")] struct PyBytesAhoCorasick { ac_impl: AhoCorasick, @@ -406,35 +414,27 @@ impl PyBytesAhoCorasick { /// Return matches as tuple of (index_into_patterns, /// start_index_in_haystack, end_index_in_haystack). If ``overlapping`` is /// ``False`` (the default), don't include overlapping results. + /// + /// IMPORTANT: If you are passing in a mutable buffer, you MUST NOT mutate + /// it in another thread while this API is running. Doing so will result in + /// undefined behavior. #[pyo3(signature = (haystack, overlapping = false))] fn find_matches_as_indexes( self_: PyRef, haystack: Bound<'_, PyAny>, overlapping: bool, ) -> PyResult> { - let is_bytes = haystack.is_instance_of::(); let py = haystack.py(); let haystack_buffer = PyBufferBytes::try_from(haystack)?; let matches = get_matches(&self_.ac_impl, haystack_buffer.as_ref(), overlapping)? .map(|m| (m.pattern().as_u64(), m.start(), m.end())); - if !is_bytes { - // Note: we must collect here and not release the GIL or return an iterator - // from this function due to the safety caveat in the implementation of - // AsRef<[u8]> for PyBufferBytes, which is relevant here since the matches - // iterator is holding an AsRef reference to the haystack. - Ok(matches.collect()) - } else { - // However, if the haystack is a PyBytes, it's guaranteed to be immutable, - // so the safety caveat doesn't apply, and we can safely release the GIL - // while the matches iterator is holding a reference to the haystack. - py.detach(|| Ok(matches.collect())) - } + py.detach(|| Ok(matches.collect())) } } /// The main Python module. 
-#[pymodule] +#[pymodule(gil_used = false)] fn ahocorasick_rs(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?;