Skip to content

Commit 4e79f10

Browse files
authored
[TST] Regex testing (#4570)
## Description of changes

_Summarize the changes made by this PR._

- Improvements & Bug fixes
  - Added a few unit tests for the literal expr impl
  - Added a basic proptest for ASCII regex patterns
- New functionality
  - N/A

## Test plan

_How are these changes tested?_

- [ ] Tests pass locally with `pytest` for Python, `yarn test` for JS, `cargo test` for Rust

## Documentation Changes

_Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs section](https://github.com/chroma-core/chroma/tree/main/docs/docs.trychroma.com)?_
1 parent cdf5e60 commit 4e79f10

File tree

4 files changed

+525
-9
lines changed

4 files changed

+525
-9
lines changed

rust/segment/src/blockfile_metadata.rs

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -981,6 +981,7 @@ mod test {
981981
blockfile_record::{
982982
RecordSegmentReader, RecordSegmentReaderCreationError, RecordSegmentWriter,
983983
},
984+
test::TestDistributedSegment,
984985
types::materialize_logs,
985986
};
986987
use chroma_blockstore::{
@@ -990,10 +991,15 @@ mod test {
990991
use chroma_cache::new_cache_for_test;
991992
use chroma_storage::{local::LocalStorage, Storage};
992993
use chroma_types::{
993-
Chunk, CollectionUuid, LogRecord, MetadataValue, Operation, OperationRecord, SegmentUuid,
994-
UpdateMetadataValue,
994+
regex::literal_expr::{LiteralExpr, NgramLiteralProvider},
995+
strategies::{ArbitraryChromaRegexTestDocumentsParameters, ChromaRegexTestDocuments},
996+
Chunk, CollectionUuid, LogRecord, MetadataValue, Operation, OperationRecord,
997+
ScalarEncoding, SegmentUuid, UpdateMetadataValue,
995998
};
999+
use proptest::prelude::any_with;
1000+
use roaring::RoaringBitmap;
9961001
use std::{collections::HashMap, str::FromStr};
1002+
use tokio::runtime::Runtime;
9971003

9981004
#[tokio::test]
9991005
async fn empty_blocks() {
@@ -1963,4 +1969,79 @@ mod test {
19631969
Some(String::from("bye").as_str())
19641970
);
19651971
}
1972+
1973+
async fn run_regex_test(test_case: ChromaRegexTestDocuments) {
1974+
let pattern = String::from(test_case.hir.clone());
1975+
let regex = regex::Regex::new(&pattern).unwrap();
1976+
let reference_results = test_case
1977+
.documents
1978+
.iter()
1979+
.enumerate()
1980+
.filter_map(|(id, doc)| regex.is_match(doc).then_some(id as u32))
1981+
.collect::<RoaringBitmap>();
1982+
let logs = test_case
1983+
.documents
1984+
.into_iter()
1985+
.enumerate()
1986+
.map(|(id, doc)| LogRecord {
1987+
log_offset: id as i64,
1988+
record: OperationRecord {
1989+
id: format!("<{id}>"),
1990+
embedding: Some(vec![id as f32; 2]),
1991+
encoding: Some(ScalarEncoding::FLOAT32),
1992+
metadata: None,
1993+
document: Some(doc),
1994+
operation: Operation::Add,
1995+
},
1996+
})
1997+
.collect::<Vec<_>>();
1998+
let mut segments = TestDistributedSegment::new_with_dimension(2);
1999+
segments.compact_log(Chunk::new(logs.into()), 0).await;
2000+
let metadata_segment_reader = MetadataSegmentReader::from_segment(
2001+
&segments.metadata_segment,
2002+
&segments.blockfile_provider,
2003+
)
2004+
.await
2005+
.expect("Metadata segment reader should be constructable");
2006+
let fts_reader = metadata_segment_reader
2007+
.full_text_index_reader
2008+
.as_ref()
2009+
.expect("Full text index reader should be present");
2010+
let literal_expression = LiteralExpr::from(test_case.hir);
2011+
let regex_results = fts_reader
2012+
.match_literal_expression(&literal_expression)
2013+
.await
2014+
.expect("Literal evaluation should not fail");
2015+
if let Some(res) = regex_results {
2016+
assert_eq!(res, reference_results);
2017+
}
2018+
}
2019+
2020+
proptest::proptest! {
    // Non-recursive patterns, 10 documents per generated case.
    #[test]
    fn test_simple_regex(
        test_case in any_with::<ChromaRegexTestDocuments>(
            ArbitraryChromaRegexTestDocumentsParameters {
                recursive_hir: false,
                total_document_count: 10,
            }
        )
    ) {
        // Each case gets its own runtime to drive the async check to completion.
        Runtime::new().unwrap().block_on(run_regex_test(test_case));
    }

    // Recursive patterns, 50 documents per generated case.
    #[test]
    fn test_composite_regex(
        test_case in any_with::<ChromaRegexTestDocuments>(
            ArbitraryChromaRegexTestDocumentsParameters {
                recursive_hir: true,
                total_document_count: 50,
            }
        )
    ) {
        // Each case gets its own runtime to drive the async check to completion.
        Runtime::new().unwrap().block_on(run_regex_test(test_case));
    }
}
19662047
}

rust/types/src/regex/hir.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use regex_syntax::hir::{self, Class, ClassUnicode, HirKind};
1+
use regex_syntax::hir::{self, Class, ClassUnicode, HirKind, Repetition};
22

33
use super::ChromaRegexError;
44

@@ -74,3 +74,31 @@ impl TryFrom<hir::Hir> for ChromaHir {
7474
}
7575
}
7676
}
77+
78+
impl From<ChromaHir> for hir::Hir {
79+
fn from(value: ChromaHir) -> Self {
80+
match value {
81+
ChromaHir::Empty => Self::empty(),
82+
ChromaHir::Literal(literal) => Self::literal(literal.into_bytes()),
83+
ChromaHir::Class(class_unicode) => Self::class(Class::Unicode(class_unicode)),
84+
ChromaHir::Repetition { min, max, sub } => Self::repetition(Repetition {
85+
min,
86+
max,
87+
greedy: false,
88+
sub: Box::new((*sub).into()),
89+
}),
90+
ChromaHir::Concat(chroma_hirs) => {
91+
Self::concat(chroma_hirs.into_iter().map(Into::into).collect())
92+
}
93+
ChromaHir::Alternation(chroma_hirs) => {
94+
Self::alternation(chroma_hirs.into_iter().map(Into::into).collect())
95+
}
96+
}
97+
}
98+
}
99+
100+
impl From<ChromaHir> for String {
101+
fn from(value: ChromaHir) -> Self {
102+
format!("{}", hir::Hir::from(value))
103+
}
104+
}

0 commit comments

Comments
 (0)