Skip to content

Reduce sorting in TopDocs #2646

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ binggan = "0.14.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
ordered-float = "5.0.0"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
test-log = "0.2.10"
Expand Down Expand Up @@ -161,9 +162,13 @@ name = "analyzer"
harness = false

[[bench]]
name = "index-bench"
name = "agg_bench"
harness = false

[[bench]]
name = "agg_bench"
name = "collector_bench"
harness = false

[[bench]]
name = "index-bench"
harness = false
2 changes: 1 addition & 1 deletion benches/agg_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
let mut index_writer = index.writer_with_num_threads(8, 200_000_000)?;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes the test to have 8 segments instead of 1

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's intentional. I believe that it should change to something other than 1, as it's definitely not realistic to have 1 segment in production.

It's very likely though that you would want the segments to be produced in some more deterministic order, so if you'd rather I revert this here, that's totally fine.

// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::OptionalDense {
Expand Down
182 changes: 182 additions & 0 deletions benches/collector_bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use tantivy::collector::TopDocs;
use tantivy::fastfield::FastValue;
use tantivy::query::AllQuery;
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use tantivy::{doc, Index, Order};

#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

/// Registers a benchmark function with the runner under its own identifier.
///
/// Expands to a `register` call whose display name is produced by
/// `stringify!`, so the reported benchmark name always matches the
/// function name exactly.
macro_rules! register {
    ($group:expr, $bench_fn:ident) => {
        $group.register(stringify!($bench_fn), move |index| {
            $bench_fn(index);
        })
    };
}

fn main() {
    // One index per cardinality flavour, so every collector benchmark is
    // exercised against each value distribution.
    let flavours = [
        ("full", Cardinality::Full),
        ("dense", Cardinality::OptionalDense),
        ("sparse", Cardinality::OptionalSparse),
        ("multivalue", Cardinality::Multivalued),
    ];
    let inputs: Vec<_> = flavours
        .into_iter()
        .map(|(name, cardinality)| (name, get_test_index_bench(cardinality).unwrap()))
        .collect();

    bench_collector(InputGroup::new_with_inputs(inputs));
}

/// Registers the `TopDocs` benchmark variants on the input group and runs them.
///
/// "shallow" variants collect from offset 0; "deep" variants skip past an
/// offset first. "small"/"large" refer to the requested limit.
fn bench_collector(mut group: InputGroup<Index>) {
    // Report peak heap usage alongside timings via the instrumented allocator.
    group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));

    register!(group, top_docs_small_shallow);
    register!(group, top_docs_small_deep);

    register!(group, top_docs_large_shallow);
    register!(group, top_docs_large_deep);

    group.run();
}

/// Searches the whole index with `AllQuery`, collecting the top `limit`
/// documents after skipping `offset`, ordered by the named fast field.
///
/// The result is passed through `black_box` so the optimizer cannot discard
/// the collection work being measured.
fn execute_top_docs<F: FastValue>(
    index: &Index,
    fast_field: &str,
    order: Order,
    offset: usize,
    limit: usize,
) {
    let searcher = index.reader().unwrap().searcher();
    let top_docs = TopDocs::with_limit(limit)
        .and_offset(offset)
        .order_by_fast_field::<F>(fast_field, order);
    black_box(searcher.search(&AllQuery, &top_docs).unwrap());
}
fn top_docs_small_deep(index: &Index) {
execute_top_docs::<u64>(index, "score", Order::Asc, 10000, 10);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: the benches all test Ascending order only from what I can see, so this function could either be removed or should add some runs in descending order as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean that you would prefer that the order: Order argument be removed? I'm fine with that, but it can also be useful to expose assumptions like that in the caller.

}
fn top_docs_small_shallow(index: &Index) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
fn top_docs_small_shallow(index: &Index) {
fn top_docs_top_10(index: &Index) {

execute_top_docs::<u64>(index, "score", Order::Asc, 0, 10);
}
fn top_docs_large_deep(index: &Index) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
fn top_docs_large_deep(index: &Index) {
fn top_docs_top_1_000_skip_10_000(index: &Index) {

execute_top_docs::<u64>(index, "score", Order::Asc, 10000, 1000);
}
/// Top 1000 hits taken from the start of the result set (no offset).
fn top_docs_large_shallow(index: &Index) {
    let (offset, limit) = (0, 1000);
    execute_top_docs::<u64>(index, "score", Order::Asc, offset, limit);
}

/// Value cardinality of the generated test index.
///
/// Mirrors the cardinality flavours used by `agg_bench` so that collector and
/// aggregation benchmarks consume comparable data sets.
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
    /// All documents contain exactly one value.
    /// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
    #[default]
    Full = 0,
    /// All documents contain at most one value.
    OptionalDense = 1,
    /// All documents may contain any number of values.
    Multivalued = 2,
    /// 1 / 20 documents has a value
    OptionalSparse = 3,
}

fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

most fields are unused

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a function of having cloned this from agg_bench, which @fulmicoton suggested.

I'll be honest: I think that we would be better off renaming agg_bench to collector_bench, and then putting the TopDocs benchmark functions in there as well. Because aggregations are also "just" collectors, and both sets of benchmarks want to consume similar datasets.

If we want to keep them in two benchmark files, then how would you feel about adding an un-published benchmarks-support crate containing this function and other support code?

let mut schema_builder = Schema::builder();
let text_fieldtype = tantivy::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];

let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();

let many_terms_data = (0..150_000)
.map(|num| format!("author{num}"))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(8, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::OptionalDense {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The value distribution is quite different for the multivalue case for the score field

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See above: this is copypasta from agg_bench.

score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::OptionalSparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::OptionalSparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
}

Ok(index)
}
38 changes: 26 additions & 12 deletions src/collector/top_collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
pub fn harvest(self) -> Vec<(T, DocAddress)> {
let segment_ord = self.segment_ord;
self.topn_computer
.into_sorted_vec()
.into_vec()
.into_iter()
.map(|comparable_doc| {
(
Expand All @@ -195,22 +195,36 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {

#[cfg(test)]
mod tests {
use std::collections::HashSet;

use ordered_float::OrderedFloat;

use super::{TopCollector, TopSegmentCollector};
use crate::DocAddress;

/// Individual segments are not sorted, and we convert their results to a set to emphasize that.
///
/// Scores are wrapped in `OrderedFloat` so the f32 values become `Eq + Hash`
/// and can live in a `HashSet`.
fn segment_results_set(
    results: Vec<(f32, DocAddress)>,
) -> HashSet<(OrderedFloat<f32>, DocAddress)> {
    let mut set = HashSet::with_capacity(results.len());
    for (score, doc) in results {
        set.insert((OrderedFloat(score), doc));
    }
    set
}

/// Collecting fewer docs than the limit must hand back every collected doc.
/// Segment results are order-insensitive (the PR drops per-segment sorting),
/// so compare as sets rather than as an ordered vec.
//
// NOTE(review): the scraped diff merged the pre- and post-change expected
// values into one span (both the old `vec![...]` form and the new
// `segment_results_set(...)` form appeared, with duplicated tuples), which
// does not compile. Reconstructed here as the post-change test.
#[test]
fn test_top_collector_not_at_capacity() {
    let mut top_collector = TopSegmentCollector::new(0, 4);
    top_collector.collect(1, 0.8);
    top_collector.collect(3, 0.2);
    top_collector.collect(5, 0.3);
    assert_eq!(
        segment_results_set(top_collector.harvest()),
        segment_results_set(vec![
            (0.2, DocAddress::new(0, 3)),
            (0.3, DocAddress::new(0, 5)),
            (0.8, DocAddress::new(0, 1)),
        ]),
    );
}

Expand All @@ -223,13 +237,13 @@ mod tests {
top_collector.collect(7, 0.9);
top_collector.collect(9, -0.2);
assert_eq!(
top_collector.harvest(),
vec![
(0.9, DocAddress::new(0, 7)),
(0.8, DocAddress::new(0, 1)),
segment_results_set(top_collector.harvest()),
segment_results_set(vec![
(0.2, DocAddress::new(0, 3)),
(0.3, DocAddress::new(0, 5)),
(0.2, DocAddress::new(0, 3))
]
(0.8, DocAddress::new(0, 1)),
(0.9, DocAddress::new(0, 7)),
]),
);
}

Expand Down
Loading