Skip to content

Commit c5d30a5

Browse files
committed
CR
1 parent c632fc0 commit c5d30a5

File tree

9 files changed

+94
-111
lines changed

9 files changed

+94
-111
lines changed

fastfield_codecs/src/bitpacked.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ impl Column for BitpackedReader {
2222
}
2323
#[inline]
2424
fn min_value(&self) -> u64 {
25+
// The BitpackedReader assumes a normalized vector.
2526
0
2627
}
2728
#[inline]
@@ -58,19 +59,24 @@ impl FastFieldCodec for BitpackedCodec {
5859

5960
/// Serializes data with the BitpackedFastFieldSerializer.
6061
///
62+
/// The bitpacker assumes that the column has been normalized.
63+
/// i.e. It has already been shifted by its minimum value, so that its
64+
/// current minimum value is 0.
65+
///
6166
/// The shift must be applied upstream on the column so that `column.min_value() == 0`.
62-
fn serialize(col: &dyn Column, write: &mut impl Write) -> io::Result<()> {
63-
let num_bits = compute_num_bits(col.max_value());
67+
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> {
68+
assert_eq!(column.min_value(), 0u64);
69+
let num_bits = compute_num_bits(column.max_value());
6470
let mut bit_packer = BitPacker::new();
65-
for val in col.iter() {
71+
for val in column.iter() {
6672
bit_packer.write(val, num_bits, write)?;
6773
}
6874
bit_packer.close(write)?;
6975
Ok(())
7076
}
7177

72-
fn estimate(col: &impl Column) -> Option<f32> {
73-
let num_bits = compute_num_bits(col.max_value());
78+
fn estimate(column: &impl Column) -> Option<f32> {
79+
let num_bits = compute_num_bits(column.max_value());
7480
let num_bits_uncompressed = 64;
7581
Some(num_bits as f32 / num_bits_uncompressed as f32)
7682
}

fastfield_codecs/src/blockwise_linear.rs

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,11 @@ impl FastFieldCodec for BlockwiseLinearCodec {
7171
}
7272

7373
// Estimate first_chunk and extrapolate
74-
fn estimate(fastfield_accessor: &impl crate::Column) -> Option<f32> {
75-
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE as u64 {
74+
fn estimate(column: &impl crate::Column) -> Option<f32> {
75+
if column.num_vals() < 10 * CHUNK_SIZE as u64 {
7676
return None;
7777
}
78-
let mut first_chunk: Vec<u64> = fastfield_accessor
79-
.iter()
80-
.take(CHUNK_SIZE as usize)
81-
.collect();
78+
let mut first_chunk: Vec<u64> = column.iter().take(CHUNK_SIZE as usize).collect();
8279
let line = Line::train(&VecColumn::from(&first_chunk));
8380
for (i, buffer_val) in first_chunk.iter_mut().enumerate() {
8481
let interpolated_val = line.eval(i as u64);
@@ -96,24 +93,23 @@ impl FastFieldCodec for BlockwiseLinearCodec {
9693
Block::default().serialize(&mut out).unwrap();
9794
out.len()
9895
};
99-
let num_bits = estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64
96+
let num_bits = estimated_bit_width as u64 * column.num_vals() as u64
10097
// function metadata per block
101-
+ metadata_per_block as u64 * (fastfield_accessor.num_vals() / CHUNK_SIZE as u64);
102-
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
98+
+ metadata_per_block as u64 * (column.num_vals() / CHUNK_SIZE as u64);
99+
let num_bits_uncompressed = 64 * column.num_vals();
103100
Some(num_bits as f32 / num_bits_uncompressed as f32)
104101
}
105102

106-
fn serialize(
107-
fastfield_accessor: &dyn crate::Column,
108-
wrt: &mut impl io::Write,
109-
) -> io::Result<()> {
103+
fn serialize(column: &dyn crate::Column, wrt: &mut impl io::Write) -> io::Result<()> {
104+
// The BlockwiseLinearReader assumes a normalized vector.
105+
assert_eq!(column.min_value(), 0);
110106
let mut buffer = Vec::with_capacity(CHUNK_SIZE);
111-
let num_vals = fastfield_accessor.num_vals();
107+
let num_vals = column.num_vals();
112108

113109
let num_blocks = compute_num_blocks(num_vals);
114110
let mut blocks = Vec::with_capacity(num_blocks);
115111

116-
let mut vals = fastfield_accessor.iter();
112+
let mut vals = column.iter();
117113

118114
let mut bit_packer = BitPacker::new();
119115

@@ -176,6 +172,7 @@ impl Column for BlockwiseLinearReader {
176172
}
177173

178174
fn min_value(&self) -> u64 {
175+
// The BlockwiseLinearReader assumes a normalized vector.
179176
0u64
180177
}
181178

fastfield_codecs/src/column.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,16 +34,18 @@ pub trait Column<T = u64> {
3434

3535
/// Returns the minimum value for this fast field.
3636
///
37-
/// The min value does not take in account of possible
38-
/// deleted document, and should be considered as a lower bound
39-
/// of the actual minimum value.
37+
/// This min_value may not be exact.
38+
/// For instance, the min value does not take into account possibly
39+
/// deleted documents. All values are however guaranteed to be no lower than
40+
/// `.min_value()`.
4041
fn min_value(&self) -> T;
4142

4243
/// Returns the maximum value for this fast field.
4344
///
44-
/// The max value does not take in account of possible
45-
/// deleted document, and should be considered as an upper bound
46-
/// of the actual maximum value
45+
/// This max_value may not be exact.
46+
/// For instance, the max value does not take into account possibly
47+
/// deleted documents. All values are however guaranteed to be no higher than
48+
/// `.max_value()`.
4749
fn max_value(&self) -> T;
4850

4951
fn num_vals(&self) -> u64;

fastfield_codecs/src/lib.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@ use std::io::Write;
1313
use common::BinarySerializable;
1414
use ownedbytes::OwnedBytes;
1515

16-
pub mod bitpacked;
17-
pub mod blockwise_linear;
16+
mod bitpacked;
17+
mod blockwise_linear;
1818
pub(crate) mod line;
19-
pub mod linear;
19+
mod linear;
2020

2121
mod column;
2222
mod gcd;
2323
mod serialize;
2424

2525
pub use self::column::{monotonic_map_column, Column, VecColumn};
26-
pub use self::serialize::{open, serialize, serialize_and_load, NormalizedHeader};
26+
pub use self::serialize::{estimate, open, serialize, serialize_and_load, NormalizedHeader};
2727

2828
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
2929
#[repr(u8)]
@@ -124,7 +124,7 @@ impl MonotonicallyMappableToU64 for f64 {
124124

125125
/// The FastFieldCodec trait is required on all variants
126126
/// of fast field compressions, to decide which one to choose.
127-
pub trait FastFieldCodec: 'static {
127+
trait FastFieldCodec: 'static {
128128
/// A codec needs to provide a unique name and id, which is
129129
/// used for debugging and de/serialization.
130130
const CODEC_TYPE: FastFieldCodecType;

fastfield_codecs/src/linear.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ impl Column for LinearReader {
2727

2828
#[inline]
2929
fn min_value(&self) -> u64 {
30+
// The LinearReader assumes a normalized vector.
3031
0u64
3132
}
3233

@@ -84,11 +85,11 @@ impl FastFieldCodec for LinearCodec {
8485
}
8586

8687
/// Serializes the column using the linear codec.
87-
fn serialize(fastfield_accessor: &dyn Column, write: &mut impl Write) -> io::Result<()> {
88-
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
89-
let line = Line::train(fastfield_accessor);
88+
fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()> {
89+
assert_eq!(column.min_value(), 0);
90+
let line = Line::train(column);
9091

91-
let max_offset_from_line = fastfield_accessor
92+
let max_offset_from_line = column
9293
.iter()
9394
.enumerate()
9495
.map(|(pos, actual_value)| {
@@ -106,7 +107,7 @@ impl FastFieldCodec for LinearCodec {
106107
linear_params.serialize(write)?;
107108

108109
let mut bit_packer = BitPacker::new();
109-
for (pos, actual_value) in fastfield_accessor.iter().enumerate() {
110+
for (pos, actual_value) in column.iter().enumerate() {
110111
let calculated_value = line.eval(pos as u64);
111112
let offset = actual_value.wrapping_sub(calculated_value);
112113
bit_packer.write(offset, num_bits, write)?;
@@ -120,23 +121,23 @@ impl FastFieldCodec for LinearCodec {
120121
/// where the local maxima for the deviation of the calculated value are and
121122
/// the offset to shift all values to >=0 is also unknown.
122123
#[allow(clippy::question_mark)]
123-
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
124-
if fastfield_accessor.num_vals() < 3 {
124+
fn estimate(column: &impl Column) -> Option<f32> {
125+
if column.num_vals() < 3 {
125126
return None; // disable compressor for this case
126127
}
127128

128129
// let's sample at 0%, 5%, 10% .. 90%, 95%
129-
let num_vals = fastfield_accessor.num_vals() as f32 / 100.0;
130+
let num_vals = column.num_vals() as f32 / 100.0;
130131
let sample_positions = (0..20)
131132
.map(|pos| (num_vals * pos as f32 * 5.0) as u64)
132133
.collect::<Vec<_>>();
133134

134-
let line = Line::estimate(fastfield_accessor, &sample_positions);
135+
let line = Line::estimate(column, &sample_positions);
135136

136137
let estimated_bit_width = sample_positions
137138
.into_iter()
138139
.map(|pos| {
139-
let actual_value = fastfield_accessor.get_val(pos);
140+
let actual_value = column.get_val(pos);
140141
let interpolated_val = line.eval(pos as u64);
141142
actual_value.wrapping_sub(interpolated_val)
142143
})
@@ -145,8 +146,8 @@ impl FastFieldCodec for LinearCodec {
145146
.max()
146147
.unwrap_or(0);
147148

148-
let num_bits = (estimated_bit_width as u64 * fastfield_accessor.num_vals() as u64) + 64;
149-
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
149+
let num_bits = (estimated_bit_width as u64 * column.num_vals() as u64) + 64;
150+
let num_bits_uncompressed = 64 * column.num_vals();
150151
Some(num_bits as f32 / num_bits_uncompressed as f32)
151152
}
152153
}

fastfield_codecs/src/main.rs

Lines changed: 11 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,8 @@
11
#[macro_use]
22
extern crate prettytable;
3-
use fastfield_codecs::bitpacked::BitpackedCodec;
4-
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
5-
use fastfield_codecs::linear::LinearCodec;
6-
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
3+
use fastfield_codecs::{Column, FastFieldCodecType, FastFieldStats, VecColumn};
74
use prettytable::{Cell, Row, Table};
85

9-
struct Data<'a>(&'a [u64]);
10-
11-
impl<'a> Column for Data<'a> {
12-
fn get_val(&self, position: u64) -> u64 {
13-
self.0[position as usize]
14-
}
15-
16-
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
17-
Box::new(self.0.iter().cloned())
18-
}
19-
20-
fn min_value(&self) -> u64 {
21-
*self.0.iter().min().unwrap_or(&0)
22-
}
23-
24-
fn max_value(&self) -> u64 {
25-
*self.0.iter().max().unwrap_or(&0)
26-
}
27-
28-
fn num_vals(&self) -> u64 {
29-
self.0.len() as u64
30-
}
31-
}
32-
336
fn main() {
347
let mut table = Table::new();
358

@@ -38,10 +11,9 @@ fn main() {
3811

3912
for (data, data_set_name) in get_codec_test_data_sets() {
4013
let results: Vec<(f32, f32, FastFieldCodecType)> = [
41-
serialize_with_codec::<LinearCodec>(&data),
42-
serialize_with_codec::<BlockwiseLinearCodec>(&data),
43-
serialize_with_codec::<BlockwiseLinearCodec>(&data),
44-
serialize_with_codec::<BitpackedCodec>(&data),
14+
serialize_with_codec(&data, FastFieldCodecType::Bitpacked),
15+
serialize_with_codec(&data, FastFieldCodecType::Linear),
16+
serialize_with_codec(&data, FastFieldCodecType::BlockwiseLinear),
4517
]
4618
.into_iter()
4719
.flatten()
@@ -107,15 +79,16 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
10779
data_and_names
10880
}
10981

110-
pub fn serialize_with_codec<C: FastFieldCodec>(
82+
pub fn serialize_with_codec(
11183
data: &[u64],
84+
codec_type: FastFieldCodecType,
11285
) -> Option<(f32, f32, FastFieldCodecType)> {
113-
let data = Data(data);
114-
let estimation = C::estimate(&data)?;
86+
let col = VecColumn::from(data);
87+
let estimation = fastfield_codecs::estimate(&col, codec_type)?;
11588
let mut out = Vec::new();
116-
C::serialize(&data, &mut out).unwrap();
117-
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
118-
Some((estimation, actual_compression, C::CODEC_TYPE))
89+
fastfield_codecs::serialize(&col, &mut out, &[codec_type]).ok()?;
90+
let actual_compression = out.len() as f32 / (col.num_vals() * 8) as f32;
91+
Some((estimation, actual_compression, codec_type))
11992
}
12093

12194
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

fastfield_codecs/src/serialize.rs

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,11 @@ use crate::{
3434
VecColumn, ALL_CODEC_TYPES,
3535
};
3636

37-
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
38-
// https://github.com/rust-lang/rust/pull/86176
39-
fn codec_estimation<C: FastFieldCodec, D: Column>(
40-
fastfield_accessor: &D,
41-
estimations: &mut Vec<(f32, FastFieldCodecType)>,
42-
) {
43-
if let Some(ratio) = C::estimate(fastfield_accessor) {
44-
estimations.push((ratio, C::CODEC_TYPE));
45-
}
46-
}
47-
37+
/// The normalized header gives some parameters after applying the following
38+
/// normalization of the vector:
39+
/// val -> (val - min_value) / gcd
40+
///
41+
/// By design, after normalization, `min_value = 0` and `gcd = 1`.
4842
#[derive(Debug, Copy, Clone)]
4943
pub struct NormalizedHeader {
5044
pub num_vals: u64,
@@ -160,6 +154,23 @@ fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
160154
}
161155
}
162156

157+
pub fn estimate<T: MonotonicallyMappableToU64>(
158+
typed_column: impl Column<T>,
159+
codec_type: FastFieldCodecType,
160+
) -> Option<f32> {
161+
let column = monotonic_map_column(typed_column, T::to_u64);
162+
let min_value = column.min_value();
163+
let gcd = crate::gcd::find_gcd(column.iter().map(|val| val - min_value))
164+
.filter(|gcd| gcd.get() > 1u64);
165+
let divider = DividerU64::divide_by(gcd.map(|gcd| gcd.get()).unwrap_or(1u64));
166+
let normalized_column = monotonic_map_column(&column, |val| divider.divide(val - min_value));
167+
match codec_type {
168+
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&normalized_column),
169+
FastFieldCodecType::Linear => LinearCodec::estimate(&normalized_column),
170+
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&normalized_column),
171+
}
172+
}
173+
163174
pub fn serialize<T: MonotonicallyMappableToU64>(
164175
typed_column: impl Column<T>,
165176
output: &mut impl io::Write,
@@ -188,16 +199,13 @@ fn detect_codec(
188199
) -> Option<FastFieldCodecType> {
189200
let mut estimations = Vec::new();
190201
for &codec in codecs {
191-
match codec {
192-
FastFieldCodecType::Bitpacked => {
193-
codec_estimation::<BitpackedCodec, _>(&column, &mut estimations);
194-
}
195-
FastFieldCodecType::Linear => {
196-
codec_estimation::<LinearCodec, _>(&column, &mut estimations);
197-
}
198-
FastFieldCodecType::BlockwiseLinear => {
199-
codec_estimation::<BlockwiseLinearCodec, _>(&column, &mut estimations);
200-
}
202+
let estimation_opt = match codec {
203+
FastFieldCodecType::Bitpacked => BitpackedCodec::estimate(&column),
204+
FastFieldCodecType::Linear => LinearCodec::estimate(&column),
205+
FastFieldCodecType::BlockwiseLinear => BlockwiseLinearCodec::estimate(&column),
206+
};
207+
if let Some(estimation) = estimation_opt {
208+
estimations.push((estimation, codec));
201209
}
202210
}
203211
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) {

src/fastfield/multivalued/writer.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -332,13 +332,11 @@ mod tests {
332332

333333
#[test]
334334
fn test_multivalue_get_vals() {
335-
let doc_id_mapping = DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
335+
let doc_id_mapping =
336+
DocIdMapping::from_new_id_to_old_id(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
336337
assert_eq!(doc_id_mapping.num_old_doc_ids(), 10);
337-
let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55,][..]);
338-
let multivalue_start_index = MultivalueStartIndex::new(
339-
&col,
340-
&doc_id_mapping,
341-
);
338+
let col = VecColumn::from(&[0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55][..]);
339+
let multivalue_start_index = MultivalueStartIndex::new(&col, &doc_id_mapping);
342340
assert_eq!(
343341
multivalue_start_index.iter().collect::<Vec<u64>>(),
344342
vec![0, 1, 1, 2, 3, 5, 8, 13, 21, 34, 55]
@@ -351,5 +349,4 @@ mod tests {
351349
assert_eq!(multivalue_start_index.get_val(0), 0);
352350
assert_eq!(multivalue_start_index.get_val(10), 55);
353351
}
354-
355352
}

0 commit comments

Comments
 (0)