Skip to content

Commit c632fc0

Browse files
committed
Refactoring fast fields codecs.
This removes GCD as a standalone codec, so that all fastfield codecs now share the same normalization step (shift + GCD).
1 parent ea72cf3 commit c632fc0

File tree

18 files changed

+673
-535
lines changed

18 files changed

+673
-535
lines changed

fastfield_codecs/benches/bench.rs

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,17 @@ extern crate test;
44

55
#[cfg(test)]
66
mod tests {
7+
use std::sync::Arc;
8+
79
use fastfield_codecs::bitpacked::BitpackedCodec;
810
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
911
use fastfield_codecs::linear::LinearCodec;
1012
use fastfield_codecs::*;
1113

1214
fn get_data() -> Vec<u64> {
15+
let mut rng = StdRng::seed_from_u64(2u64);
1316
let mut data: Vec<_> = (100..55000_u64)
14-
.map(|num| num + rand::random::<u8>() as u64)
17+
.map(|num| num + rng.gen::<u8>() as u64)
1518
.collect();
1619
data.push(99_000);
1720
data.insert(1000, 2000);
@@ -22,32 +25,59 @@ mod tests {
2225
data
2326
}
2427

28+
#[inline(never)]
2529
fn value_iter() -> impl Iterator<Item = u64> {
2630
0..20_000
2731
}
32+
fn get_reader_for_bench<Codec: FastFieldCodec>(data: &[u64]) -> Codec::Reader {
33+
let mut bytes = Vec::new();
34+
let col = VecColumn::from(&data);
35+
let normalized_header = fastfield_codecs::NormalizedHeader {
36+
num_vals: col.num_vals(),
37+
max_value: col.max_value(),
38+
};
39+
Codec::serialize(&VecColumn::from(data), &mut bytes).unwrap();
40+
Codec::open_from_bytes(OwnedBytes::new(bytes), normalized_header).unwrap()
41+
}
2842
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
29-
let mut bytes = vec![];
30-
Codec::serialize(&mut bytes, &VecColumn::from(data)).unwrap();
31-
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
43+
let col = get_reader_for_bench::<Codec>(data);
44+
b.iter(|| {
45+
let mut sum = 0u64;
46+
for pos in value_iter() {
47+
let val = col.get_val(pos as u64);
48+
sum = sum.wrapping_add(val);
49+
}
50+
sum
51+
});
52+
}
53+
54+
#[inline(never)]
55+
fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn Column>) {
3256
b.iter(|| {
3357
let mut sum = 0u64;
3458
for pos in value_iter() {
35-
let val = reader.get_val(pos as u64);
36-
debug_assert_eq!(data[pos as usize], val);
59+
let val = col.get_val(pos as u64);
3760
sum = sum.wrapping_add(val);
3861
}
3962
sum
4063
});
4164
}
65+
66+
fn bench_get_dynamic<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
67+
let col = Arc::new(get_reader_for_bench::<Codec>(data));
68+
bench_get_dynamic_helper(b, col);
69+
}
4270
fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
4371
let mut bytes = Vec::new();
4472
b.iter(|| {
4573
bytes.clear();
46-
Codec::serialize(&mut bytes, &VecColumn::from(data)).unwrap();
74+
Codec::serialize(&VecColumn::from(data), &mut bytes).unwrap();
4775
});
4876
}
4977

5078
use ownedbytes::OwnedBytes;
79+
use rand::rngs::StdRng;
80+
use rand::{Rng, SeedableRng};
5181
use test::Bencher;
5282
#[bench]
5383
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
@@ -70,22 +100,28 @@ mod tests {
70100
bench_get::<BitpackedCodec>(b, &data);
71101
}
72102
#[bench]
103+
fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
104+
let data: Vec<_> = get_data();
105+
bench_get_dynamic::<BitpackedCodec>(b, &data);
106+
}
107+
#[bench]
73108
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
74109
let data: Vec<_> = get_data();
75110
bench_get::<LinearCodec>(b, &data);
76111
}
77112
#[bench]
113+
fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
114+
let data: Vec<_> = get_data();
115+
bench_get_dynamic::<LinearCodec>(b, &data);
116+
}
117+
#[bench]
78118
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
79119
let data: Vec<_> = get_data();
80120
bench_get::<BlockwiseLinearCodec>(b, &data);
81121
}
82-
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
83-
let min_value = data.iter().cloned().min().unwrap_or(0);
84-
let max_value = data.iter().cloned().max().unwrap_or(0);
85-
FastFieldStats {
86-
min_value,
87-
max_value,
88-
num_vals: data.len() as u64,
89-
}
122+
#[bench]
123+
fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
124+
let data: Vec<_> = get_data();
125+
bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
90126
}
91127
}

fastfield_codecs/src/bitpacked.rs

Lines changed: 21 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
use std::io::{self, Write};
22

3-
use common::BinarySerializable;
43
use ownedbytes::OwnedBytes;
54
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
65

6+
use crate::serialize::NormalizedHeader;
77
use crate::{Column, FastFieldCodec, FastFieldCodecType};
88

99
/// Depending on the field type, a different
@@ -12,80 +12,25 @@ use crate::{Column, FastFieldCodec, FastFieldCodecType};
1212
pub struct BitpackedReader {
1313
data: OwnedBytes,
1414
bit_unpacker: BitUnpacker,
15-
min_value_u64: u64,
16-
max_value_u64: u64,
17-
num_vals: u64,
15+
normalized_header: NormalizedHeader,
1816
}
1917

2018
impl Column for BitpackedReader {
2119
#[inline]
2220
fn get_val(&self, doc: u64) -> u64 {
23-
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
21+
self.bit_unpacker.get(doc, &self.data)
2422
}
2523
#[inline]
2624
fn min_value(&self) -> u64 {
27-
self.min_value_u64
25+
0
2826
}
2927
#[inline]
3028
fn max_value(&self) -> u64 {
31-
self.max_value_u64
29+
self.normalized_header.max_value
3230
}
3331
#[inline]
3432
fn num_vals(&self) -> u64 {
35-
self.num_vals
36-
}
37-
}
38-
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
39-
bit_packer: BitPacker,
40-
write: &'a mut W,
41-
min_value: u64,
42-
num_vals: u64,
43-
amplitude: u64,
44-
num_bits: u8,
45-
}
46-
47-
impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
48-
/// Creates a new fast field serializer.
49-
///
50-
/// The serializer in fact encode the values by bitpacking
51-
/// `(val - min_value)`.
52-
///
53-
/// It requires a `min_value` and a `max_value` to compute
54-
/// compute the minimum number of bits required to encode
55-
/// values.
56-
pub fn open(
57-
write: &'a mut W,
58-
min_value: u64,
59-
max_value: u64,
60-
) -> io::Result<BitpackedSerializerLegacy<'a, W>> {
61-
assert!(min_value <= max_value);
62-
let amplitude = max_value - min_value;
63-
let num_bits = compute_num_bits(amplitude);
64-
let bit_packer = BitPacker::new();
65-
Ok(BitpackedSerializerLegacy {
66-
bit_packer,
67-
write,
68-
min_value,
69-
num_vals: 0,
70-
amplitude,
71-
num_bits,
72-
})
73-
}
74-
/// Pushes a new value to the currently open u64 fast field.
75-
#[inline]
76-
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
77-
let val_to_write: u64 = val - self.min_value;
78-
self.bit_packer
79-
.write(val_to_write, self.num_bits, &mut self.write)?;
80-
self.num_vals += 1;
81-
Ok(())
82-
}
83-
pub fn close_field(mut self) -> io::Result<()> {
84-
self.bit_packer.close(&mut self.write)?;
85-
self.min_value.serialize(&mut self.write)?;
86-
self.amplitude.serialize(&mut self.write)?;
87-
self.num_vals.serialize(&mut self.write)?;
88-
Ok(())
33+
self.normalized_header.num_vals
8934
}
9035
}
9136

@@ -98,50 +43,34 @@ impl FastFieldCodec for BitpackedCodec {
9843
type Reader = BitpackedReader;
9944

10045
/// Opens a fast field given a file.
101-
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
102-
let footer_offset = bytes.len() - 24;
103-
let (data, mut footer) = bytes.split(footer_offset);
104-
let min_value = u64::deserialize(&mut footer)?;
105-
let amplitude = u64::deserialize(&mut footer)?;
106-
let num_vals = u64::deserialize(&mut footer)?;
107-
let max_value = min_value + amplitude;
108-
let num_bits = compute_num_bits(amplitude);
46+
fn open_from_bytes(
47+
data: OwnedBytes,
48+
normalized_header: NormalizedHeader,
49+
) -> io::Result<Self::Reader> {
50+
let num_bits = compute_num_bits(normalized_header.max_value);
10951
let bit_unpacker = BitUnpacker::new(num_bits);
11052
Ok(BitpackedReader {
11153
data,
11254
bit_unpacker,
113-
min_value_u64: min_value,
114-
max_value_u64: max_value,
115-
num_vals,
55+
normalized_header,
11656
})
11757
}
11858

11959
/// Serializes data with the BitpackedFastFieldSerializer.
12060
///
121-
/// The serializer in fact encode the values by bitpacking
122-
/// `(val - min_value)`.
123-
///
124-
/// It requires a `min_value` and a `max_value` to compute
125-
/// compute the minimum number of bits required to encode
126-
/// values.
127-
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
128-
let mut serializer = BitpackedSerializerLegacy::open(
129-
write,
130-
fastfield_accessor.min_value(),
131-
fastfield_accessor.max_value(),
132-
)?;
133-
134-
for val in fastfield_accessor.iter() {
135-
serializer.add_val(val)?;
61+
/// Ideally, the column has already been shifted upstream so that `col.min_value() == 0`.
62+
fn serialize(col: &dyn Column, write: &mut impl Write) -> io::Result<()> {
63+
let num_bits = compute_num_bits(col.max_value());
64+
let mut bit_packer = BitPacker::new();
65+
for val in col.iter() {
66+
bit_packer.write(val, num_bits, write)?;
13667
}
137-
serializer.close_field()?;
138-
68+
bit_packer.close(write)?;
13969
Ok(())
14070
}
14171

142-
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
143-
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
144-
let num_bits = compute_num_bits(amplitude);
72+
fn estimate(col: &impl Column) -> Option<f32> {
73+
let num_bits = compute_num_bits(col.max_value());
14574
let num_bits_uncompressed = 64;
14675
Some(num_bits as f32 / num_bits_uncompressed as f32)
14776
}

0 commit comments

Comments
 (0)