Skip to content

Commit 70cf337

Browse files
committed
add mixed AND OR test, fix buffered_union
1 parent 8028bae commit 70cf337

File tree

3 files changed

+106
-19
lines changed

3 files changed

+106
-19
lines changed

src/docset.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,16 @@ pub trait DocSet: Send {
5353

5454
/// Seeks to the target if possible and returns true if the target is in the DocSet.
5555
///
56-
/// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
57-
/// All wrapper DocSets should forward `seek_exact` to the underlying DocSet.
56+
/// DocSets that already have an efficient `seek` method don't need to implement
57+
/// `seek_into_the_danger_zone`. All wrapper DocSets should forward
58+
/// `seek_into_the_danger_zone` to the underlying DocSet.
5859
///
5960
/// ## API Behaviour
60-
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
61-
/// If `seek_exact` is returning false, a call to `doc()` may return any doc between
62-
/// the last doc that matched and target or a doc that is a valid next hit after target.
63-
/// The DocSet is considered to be in an invalid state until `seek_exact` returns true again.
61+
/// If `seek_into_the_danger_zone` returns true, a call to `doc()` has to return target.
62+
/// If `seek_into_the_danger_zone` returns false, a call to `doc()` may return any doc
63+
/// between the last doc that matched and target or a doc that is a valid next hit after
64+
/// target. The DocSet is considered to be in an invalid state until
65+
/// `seek_into_the_danger_zone` returns true again.
6466
///
6567
/// `target` needs to be equal to or larger than `doc` when in a valid state.
6668
///

src/query/mod.rs

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,81 @@ pub use self::weight::Weight;
7070

7171
#[cfg(test)]
7272
mod tests {
73+
use crate::collector::TopDocs;
74+
use crate::query::phrase_query::tests::create_index;
7375
use crate::query::QueryParser;
7476
use crate::schema::{Schema, TEXT};
75-
use crate::{Index, Term};
77+
use crate::{DocAddress, Index, Term};
78+
79+
#[test]
80+
pub fn test_mixed_intersection_and_union() -> crate::Result<()> {
81+
let index = create_index(&["a b", "a c", "a b c", "b"])?;
82+
let schema = index.schema();
83+
let text_field = schema.get_field("text").unwrap();
84+
let searcher = index.reader()?.searcher();
85+
86+
let do_search = |term: &str| {
87+
let query = QueryParser::for_index(&index, vec![text_field])
88+
.parse_query(term)
89+
.unwrap();
90+
let top_docs: Vec<(f32, DocAddress)> =
91+
searcher.search(&query, &TopDocs::with_limit(10)).unwrap();
92+
93+
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
94+
};
95+
96+
assert_eq!(do_search("a AND b"), vec![0, 2]);
97+
assert_eq!(do_search("(a OR b) AND C"), vec![2, 1]);
98+
// The intersection code has special code for more than 2 intersections
99+
// left, right + others
100+
// This will place the union in the "others" intersection so that seek_into_the_danger_zone
101+
// is called
102+
assert_eq!(
103+
do_search("(a OR b) AND (c OR a) AND (b OR c)"),
104+
vec![2, 1, 0]
105+
);
106+
107+
Ok(())
108+
}
109+
110+
#[test]
111+
pub fn test_mixed_intersection_and_union_with_skip() -> crate::Result<()> {
112+
// Test 4096 skip in BufferedUnionScorer
113+
let mut data: Vec<&str> = Vec::new();
114+
data.push("a b");
115+
let zz_data = vec!["z z"; 5000];
116+
data.extend_from_slice(&zz_data);
117+
data.extend_from_slice(&["a c"]);
118+
data.extend_from_slice(&zz_data);
119+
data.extend_from_slice(&["a b c", "b"]);
120+
let index = create_index(&data)?;
121+
let schema = index.schema();
122+
let text_field = schema.get_field("text").unwrap();
123+
let searcher = index.reader()?.searcher();
124+
125+
let do_search = |term: &str| {
126+
let query = QueryParser::for_index(&index, vec![text_field])
127+
.parse_query(term)
128+
.unwrap();
129+
let top_docs: Vec<(f32, DocAddress)> =
130+
searcher.search(&query, &TopDocs::with_limit(10)).unwrap();
131+
132+
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
133+
};
134+
135+
assert_eq!(do_search("a AND b"), vec![0, 10002]);
136+
assert_eq!(do_search("(a OR b) AND C"), vec![10002, 5001]);
137+
// The intersection code has special code for more than 2 intersections
138+
// left, right + others
139+
// The will place the union in the "others" insersection to that seek_into_the_danger_zone
140+
// is called
141+
assert_eq!(
142+
do_search("(a OR b) AND (c OR a) AND (b OR c)"),
143+
vec![10002, 5001, 0]
144+
);
145+
146+
Ok(())
147+
}
76148

77149
#[test]
78150
fn test_query_terms() {

src/query/union/buffered_union.rs

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32;
1313
// This function is similar except that it is not unstable, and
1414
// it does not keep the original vector ordering.
1515
//
16-
// Also, it does not "yield" any elements.
16+
// Elements are dropped and not yielded.
1717
fn unordered_drain_filter<T, P>(v: &mut Vec<T>, mut predicate: P)
1818
where P: FnMut(&mut T) -> bool {
1919
let mut i = 0;
@@ -123,6 +123,11 @@ impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> BufferedUnionScorer<TScorer
123123
}
124124
false
125125
}
126+
127+
fn is_in_horizon(&self, target: DocId) -> bool {
128+
let gap = target - self.offset;
129+
gap < HORIZON
130+
}
126131
}
127132

128133
impl<TScorer, TScoreCombiner> DocSet for BufferedUnionScorer<TScorer, TScoreCombiner>
@@ -148,11 +153,11 @@ where
148153
if self.doc >= target {
149154
return self.doc;
150155
}
151-
let gap = target - self.offset;
152-
if gap < HORIZON {
156+
if self.is_in_horizon(target) {
153157
// Our value is within the buffered horizon.
154158

155-
// Skipping to corresponding bucket.
159+
// Skipping to corresponding bucket.
160+
let gap = target - self.offset;
156161
let new_cursor = gap as usize / 64;
157162
for obsolete_tinyset in &mut self.bitsets[self.cursor..new_cursor] {
158163
obsolete_tinyset.clear();
@@ -198,15 +203,23 @@ where
198203
}
199204

200205
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
201-
let is_hit = self
202-
.docsets
203-
.iter_mut()
204-
.all(|docset| docset.seek_into_the_danger_zone(target));
205-
// The API requires the DocSet to be in a valid state when `seek_exact` returns true.
206-
if is_hit {
207-
self.seek(target);
206+
if self.is_in_horizon(target) {
207+
// Our value is within the buffered horizon and the docset may already have been
208+
// processed and removed
209+
self.seek(target) == target
210+
} else {
211+
let is_hit = self
212+
.docsets
213+
.iter_mut()
214+
.any(|docset| docset.seek_into_the_danger_zone(target));
215+
216+
// The API requires the DocSet to be in a valid state when `seek_into_the_danger_zone`
217+
// returns true.
218+
if is_hit {
219+
self.seek(target);
220+
}
221+
is_hit
208222
}
209-
is_hit
210223
}
211224

212225
fn doc(&self) -> DocId {

0 commit comments

Comments
 (0)