Skip to content

Commit 54696da

Browse files
authored
Merge pull request #1505 from quickwit-oss/refact-fast-field
Refact fast field
2 parents 9436049 + c5d30a5 commit 54696da

File tree

18 files changed

+745
-624
lines changed

18 files changed

+745
-624
lines changed

fastfield_codecs/benches/bench.rs

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,17 @@ extern crate test;
44

55
#[cfg(test)]
66
mod tests {
7+
use std::sync::Arc;
8+
79
use fastfield_codecs::bitpacked::BitpackedCodec;
810
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
911
use fastfield_codecs::linear::LinearCodec;
1012
use fastfield_codecs::*;
1113

1214
fn get_data() -> Vec<u64> {
15+
let mut rng = StdRng::seed_from_u64(2u64);
1316
let mut data: Vec<_> = (100..55000_u64)
14-
.map(|num| num + rand::random::<u8>() as u64)
17+
.map(|num| num + rng.gen::<u8>() as u64)
1518
.collect();
1619
data.push(99_000);
1720
data.insert(1000, 2000);
@@ -22,32 +25,59 @@ mod tests {
2225
data
2326
}
2427

28+
#[inline(never)]
2529
fn value_iter() -> impl Iterator<Item = u64> {
2630
0..20_000
2731
}
32+
fn get_reader_for_bench<Codec: FastFieldCodec>(data: &[u64]) -> Codec::Reader {
33+
let mut bytes = Vec::new();
34+
let col = VecColumn::from(&data);
35+
let normalized_header = fastfield_codecs::NormalizedHeader {
36+
num_vals: col.num_vals(),
37+
max_value: col.max_value(),
38+
};
39+
Codec::serialize(&VecColumn::from(data), &mut bytes).unwrap();
40+
Codec::open_from_bytes(OwnedBytes::new(bytes), normalized_header).unwrap()
41+
}
2842
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
29-
let mut bytes = vec![];
30-
Codec::serialize(&mut bytes, &VecColumn::from(data)).unwrap();
31-
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
43+
let col = get_reader_for_bench::<Codec>(data);
44+
b.iter(|| {
45+
let mut sum = 0u64;
46+
for pos in value_iter() {
47+
let val = col.get_val(pos as u64);
48+
sum = sum.wrapping_add(val);
49+
}
50+
sum
51+
});
52+
}
53+
54+
#[inline(never)]
55+
fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn Column>) {
3256
b.iter(|| {
3357
let mut sum = 0u64;
3458
for pos in value_iter() {
35-
let val = reader.get_val(pos as u64);
36-
debug_assert_eq!(data[pos as usize], val);
59+
let val = col.get_val(pos as u64);
3760
sum = sum.wrapping_add(val);
3861
}
3962
sum
4063
});
4164
}
65+
66+
fn bench_get_dynamic<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
67+
let col = Arc::new(get_reader_for_bench::<Codec>(data));
68+
bench_get_dynamic_helper(b, col);
69+
}
4270
fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
4371
let mut bytes = Vec::new();
4472
b.iter(|| {
4573
bytes.clear();
46-
Codec::serialize(&mut bytes, &VecColumn::from(data)).unwrap();
74+
Codec::serialize(&VecColumn::from(data), &mut bytes).unwrap();
4775
});
4876
}
4977

5078
use ownedbytes::OwnedBytes;
79+
use rand::rngs::StdRng;
80+
use rand::{Rng, SeedableRng};
5181
use test::Bencher;
5282
#[bench]
5383
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
@@ -70,22 +100,28 @@ mod tests {
70100
bench_get::<BitpackedCodec>(b, &data);
71101
}
72102
#[bench]
103+
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
104+
let data: Vec<_> = get_data();
105+
bench_get_dynamic::<BitpackedCodec>(b, &data);
106+
}
107+
#[bench]
73108
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
74109
let data: Vec<_> = get_data();
75110
bench_get::<LinearCodec>(b, &data);
76111
}
77112
#[bench]
113+
fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
114+
let data: Vec<_> = get_data();
115+
bench_get_dynamic::<LinearCodec>(b, &data);
116+
}
117+
#[bench]
78118
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
79119
let data: Vec<_> = get_data();
80120
bench_get::<BlockwiseLinearCodec>(b, &data);
81121
}
82-
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
83-
let min_value = data.iter().cloned().min().unwrap_or(0);
84-
let max_value = data.iter().cloned().max().unwrap_or(0);
85-
FastFieldStats {
86-
min_value,
87-
max_value,
88-
num_vals: data.len() as u64,
89-
}
122+
#[bench]
123+
fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
124+
let data: Vec<_> = get_data();
125+
bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
90126
}
91127
}

fastfield_codecs/src/bitpacked.rs

Lines changed: 26 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
use std::io::{self, Write};
22

3-
use common::BinarySerializable;
43
use ownedbytes::OwnedBytes;
54
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
65

6+
use crate::serialize::NormalizedHeader;
77
use crate::{Column, FastFieldCodec, FastFieldCodecType};
88

99
/// Depending on the field type, a different
@@ -12,80 +12,26 @@ use crate::{Column, FastFieldCodec, FastFieldCodecType};
1212
pub struct BitpackedReader {
1313
data: OwnedBytes,
1414
bit_unpacker: BitUnpacker,
15-
min_value_u64: u64,
16-
max_value_u64: u64,
17-
num_vals: u64,
15+
normalized_header: NormalizedHeader,
1816
}
1917

2018
impl Column for BitpackedReader {
2119
#[inline]
2220
fn get_val(&self, doc: u64) -> u64 {
23-
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
21+
self.bit_unpacker.get(doc, &self.data)
2422
}
2523
#[inline]
2624
fn min_value(&self) -> u64 {
27-
self.min_value_u64
25+
// The BitpackedReader assumes a normalized vector.
26+
0
2827
}
2928
#[inline]
3029
fn max_value(&self) -> u64 {
31-
self.max_value_u64
30+
self.normalized_header.max_value
3231
}
3332
#[inline]
3433
fn num_vals(&self) -> u64 {
35-
self.num_vals
36-
}
37-
}
38-
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
39-
bit_packer: BitPacker,
40-
write: &'a mut W,
41-
min_value: u64,
42-
num_vals: u64,
43-
amplitude: u64,
44-
num_bits: u8,
45-
}
46-
47-
impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
48-
/// Creates a new fast field serializer.
49-
///
50-
/// The serializer in fact encode the values by bitpacking
51-
/// `(val - min_value)`.
52-
///
53-
/// It requires a `min_value` and a `max_value` to compute
54-
/// compute the minimum number of bits required to encode
55-
/// values.
56-
pub fn open(
57-
write: &'a mut W,
58-
min_value: u64,
59-
max_value: u64,
60-
) -> io::Result<BitpackedSerializerLegacy<'a, W>> {
61-
assert!(min_value <= max_value);
62-
let amplitude = max_value - min_value;
63-
let num_bits = compute_num_bits(amplitude);
64-
let bit_packer = BitPacker::new();
65-
Ok(BitpackedSerializerLegacy {
66-
bit_packer,
67-
write,
68-
min_value,
69-
num_vals: 0,
70-
amplitude,
71-
num_bits,
72-
})
73-
}
74-
/// Pushes a new value to the currently open u64 fast field.
75-
#[inline]
76-
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
77-
let val_to_write: u64 = val - self.min_value;
78-
self.bit_packer
79-
.write(val_to_write, self.num_bits, &mut self.write)?;
80-
self.num_vals += 1;
81-
Ok(())
82-
}
83-
pub fn close_field(mut self) -> io::Result<()> {
84-
self.bit_packer.close(&mut self.write)?;
85-
self.min_value.serialize(&mut self.write)?;
86-
self.amplitude.serialize(&mut self.write)?;
87-
self.num_vals.serialize(&mut self.write)?;
88-
Ok(())
34+
self.normalized_header.num_vals
8935
}
9036
}
9137

@@ -98,50 +44,39 @@ impl FastFieldCodec for BitpackedCodec {
9844
type Reader = BitpackedReader;
9945

10046
/// Opens a fast field given a file.
101-
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
102-
let footer_offset = bytes.len() - 24;
103-
let (data, mut footer) = bytes.split(footer_offset);
104-
let min_value = u64::deserialize(&mut footer)?;
105-
let amplitude = u64::deserialize(&mut footer)?;
106-
let num_vals = u64::deserialize(&mut footer)?;
107-
let max_value = min_value + amplitude;
108-
let num_bits = compute_num_bits(amplitude);
47+
fn open_from_bytes(
48+
data: OwnedBytes,
49+
normalized_header: NormalizedHeader,
50+
) -> io::Result<Self::Reader> {
51+
let num_bits = compute_num_bits(normalized_header.max_value);
10952
let bit_unpacker = BitUnpacker::new(num_bits);
11053
Ok(BitpackedReader {
11154
data,
11255
bit_unpacker,
113-
min_value_u64: min_value,
114-
max_value_u64: max_value,
115-
num_vals,
56+
normalized_header,
11657
})
11758
}
11859

11960
/// Serializes data with the BitpackedFastFieldSerializer.
12061
///
121-
/// The serializer in fact encode the values by bitpacking
122-
/// `(val - min_value)`.
62+
/// The bitpacker assumes that the column has been normalized.
63+
/// i.e., it has already been shifted by its minimum value, so that its
64+
/// current minimum value is 0.
12365
///
124-
/// It requires a `min_value` and a `max_value` to compute
125-
/// compute the minimum number of bits required to encode
126-
/// values.
127-
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
128-
let mut serializer = BitpackedSerializerLegacy::open(
129-
write,
130-
fastfield_accessor.min_value(),
131-
fastfield_accessor.max_value(),
132-
)?;
133-
134-
for val in fastfield_accessor.iter() {
135-
serializer.add_val(val)?;
66+
/// Ideally, we made a shift upstream on the column so that `col.min_value() == 0`.
67+
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> {
68+
assert_eq!(column.min_value(), 0u64);
69+
let num_bits = compute_num_bits(column.max_value());
70+
let mut bit_packer = BitPacker::new();
71+
for val in column.iter() {
72+
bit_packer.write(val, num_bits, write)?;
13673
}
137-
serializer.close_field()?;
138-
74+
bit_packer.close(write)?;
13975
Ok(())
14076
}
14177

142-
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
143-
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
144-
let num_bits = compute_num_bits(amplitude);
78+
fn estimate(column: &impl Column) -> Option<f32> {
79+
let num_bits = compute_num_bits(column.max_value());
14580
let num_bits_uncompressed = 64;
14681
Some(num_bits as f32 / num_bits_uncompressed as f32)
14782
}

0 commit comments

Comments
 (0)