Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ on:
env:
# Give nice tracebacks on segfaults.
PYTHONFAULTHANDLER: "true"
MATURIN_VERSION: "1.8.3"
MATURIN_VERSION: "1.9.6"


jobs:
tests:
name: "${{ matrix.os }}: Python ${{ matrix.python-version }}"
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14", "3.14t"]
os: ["ubuntu-latest", "macos-13", "windows-latest", "ubuntu-24.04-arm"]

env:
Expand All @@ -33,7 +33,7 @@ jobs:
with:
# We need tags to get the correct code version:
fetch-depth: 0
- uses: "actions/setup-python@v5"
- uses: "actions/setup-python@v6"
with:
python-version: "${{ matrix.python-version }}"
- uses: "dtolnay/rust-toolchain@stable"
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 1.0

* Added support for Python 3.14 and free-threaded Python.
* In order to support free-threaded Python, the burden of thread safety has been moved onto users.
Specifically: you must not mutate byte arrays and the like that are passed to `BytesAhoCorasick` APIs while those APIs are running.

## 0.22.2

* Update Rust dependencies.
Expand Down
20 changes: 10 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ You can construct a `AhoCorasick` object from any iterable (including generators

You can also search `bytes`, `bytearray`, `memoryview`, and other objects supporting the Python buffer API.

> **IMPORTANT:** If you are searching a mutable buffer, you **must not mutate it in another thread** while `find_matches_as_indexes()` is running.
> Similarly, the patterns must not be mutated while the `BytesAhoCorasick` object is being constructed.

```python
>>> patterns = [b"hello", b"world"]
>>> ac = ahocorasick_rs.BytesAhoCorasick(patterns)
Expand Down
2 changes: 1 addition & 1 deletion rust-toolchain.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[toolchain]
channel = "1.89"
channel = "1.90"
components = ["rustfmt", "clippy"]
46 changes: 23 additions & 23 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use pyo3::{
buffer::{PyBuffer, ReadOnlyCell},
exceptions::{PyTypeError, PyValueError},
prelude::*,
types::{PyBytes, PyList, PyString},
types::{PyList, PyString},
};

/// Search for multiple pattern strings against a single haystack string.
Expand Down Expand Up @@ -258,16 +258,16 @@ impl PyAhoCorasick {
let py = self_.py();
let matches = get_matches(&self_.ac_impl, haystack.as_bytes(), overlapping)?;
let matches = py.detach(|| matches.collect::<Vec<_>>().into_iter());
let result = match self_.patterns {

match self_.patterns {
Some(ref patterns) => {
PyList::new(py, matches.map(|m| patterns[m.pattern()].clone_ref(py)))
}
_ => PyList::new(
py,
matches.map(|m| PyString::new(py, &haystack[m.start()..m.end()])),
),
};
result
}
}
}

Expand All @@ -282,7 +282,7 @@ impl<'py> TryFrom<Bound<'py, PyAny>> for PyBufferBytes<'py> {

// Get a PyBufferBytes from a Python object
fn try_from(obj: Bound<'py, PyAny>) -> PyResult<Self> {
let buffer = PyBuffer::<u8>::get(&obj).map_err(PyErr::from)?;
let buffer = PyBuffer::<u8>::get(&obj)?;

if buffer.dimensions() > 1 {
return Err(PyTypeError::new_err(
Expand Down Expand Up @@ -328,10 +328,12 @@ impl<'a> AsRef<[u8]> for PyBufferBytes<'a> {
// wouldn't be able to prevent calling back into Python while holding
// this reference, which might also result in a mutation).
//
// This effectively means that it's only safe to hold onto the reference
// returned from this function as long as we don't release the GIL and
// don't call back into Python code while the reference is alive.
// See also https://github.com/PyO3/pyo3/issues/2824
// In addition, in a free-threaded world there is no GIL at all to
// prevent mutation.
//
// Following the lead of `pyo3-numpy`, we deal with this by documenting
// to the user that the buffer cannot be mutated while it is passed to
// our API. See also https://github.com/PyO3/pyo3/issues/2824
unsafe { std::mem::transmute(slice) }
}
}
Expand All @@ -348,6 +350,12 @@ impl<'a> AsRef<[u8]> for PyBufferBytes<'a> {
/// finished.
/// * ``matchkind``: Defaults to ``"MATCHKIND_STANDARD"``.
/// * ``implementation``: The underlying type of automaton to use for Aho-Corasick.
///
/// IMPORTANT: If you are passing in patterns that are mutable buffers, you MUST
/// NOT mutate them in another thread while constructing this object. Doing so
/// will result in undefined behavior. Once the ``BytesAhoCorasick`` object is
/// constructed, however, they can be mutated since no references will be kept
/// to them.
#[pyclass(name = "BytesAhoCorasick")]
struct PyBytesAhoCorasick {
ac_impl: AhoCorasick,
Expand Down Expand Up @@ -406,35 +414,27 @@ impl PyBytesAhoCorasick {
/// Return matches as tuple of (index_into_patterns,
/// start_index_in_haystack, end_index_in_haystack). If ``overlapping`` is
/// ``False`` (the default), don't include overlapping results.
///
/// IMPORTANT: If you are passing in a mutable buffer, you MUST NOT mutate
/// it in another thread while this API is running. Doing so will result in
/// undefined behavior.
#[pyo3(signature = (haystack, overlapping = false))]
fn find_matches_as_indexes(
self_: PyRef<Self>,
haystack: Bound<'_, PyAny>,
overlapping: bool,
) -> PyResult<Vec<(u64, usize, usize)>> {
let is_bytes = haystack.is_instance_of::<PyBytes>();
let py = haystack.py();
let haystack_buffer = PyBufferBytes::try_from(haystack)?;
let matches = get_matches(&self_.ac_impl, haystack_buffer.as_ref(), overlapping)?
.map(|m| (m.pattern().as_u64(), m.start(), m.end()));

if !is_bytes {
// Note: we must collect here and not release the GIL or return an iterator
// from this function due to the safety caveat in the implementation of
// AsRef<[u8]> for PyBufferBytes, which is relevant here since the matches
// iterator is holding an AsRef reference to the haystack.
Ok(matches.collect())
} else {
// However, if the haystack is a PyBytes, it's guaranteed to be immutable,
// so the safety caveat doesn't apply, and we can safely release the GIL
// while the matches iterator is holding a reference to the haystack.
py.detach(|| Ok(matches.collect()))
}
py.detach(|| Ok(matches.collect()))
}
}

/// The main Python module.
#[pymodule]
#[pymodule(gil_used = false)]
fn ahocorasick_rs(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<PyMatchKind>()?;
m.add_class::<Implementation>()?;
Expand Down