Skip to content

Commit 00657d9

Browse files
authored
Merge pull request #1504 from quickwit-oss/move-to-fastfield-codec
Move to fastfield codec
2 parents 8e775b6 + 26876d4 commit 00657d9

File tree

19 files changed

+789
-797
lines changed

19 files changed

+789
-797
lines changed

fastfield_codecs/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
1414
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
1515
prettytable-rs = {version="0.9.0", optional= true}
1616
rand = {version="0.8.3", optional= true}
17+
fastdivide = "0.4"
18+
log = "0.4"
1719

1820
[dev-dependencies]
1921
more-asserts = "0.3.0"
@@ -23,4 +25,5 @@ rand = "0.8.3"
2325
[features]
2426
bin = ["prettytable-rs", "rand"]
2527
default = ["bin"]
28+
unstable = []
2629

fastfield_codecs/benches/bench.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ mod tests {
2727
}
2828
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
2929
let mut bytes = vec![];
30-
Codec::serialize(&mut bytes, &data).unwrap();
30+
Codec::serialize(&mut bytes, &VecColumn::from(data)).unwrap();
3131
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
3232
b.iter(|| {
3333
let mut sum = 0u64;
@@ -43,7 +43,7 @@ mod tests {
4343
let mut bytes = Vec::new();
4444
b.iter(|| {
4545
bytes.clear();
46-
Codec::serialize(&mut bytes, &data).unwrap();
46+
Codec::serialize(&mut bytes, &VecColumn::from(data)).unwrap();
4747
});
4848
}
4949

fastfield_codecs/src/column.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,11 @@ impl<'a, T: Copy + PartialOrd> Column<T> for VecColumn<'a, T> {
8181
}
8282
}
8383

84-
impl<'a, T: Copy + Ord + Default> From<&'a [T]> for VecColumn<'a, T> {
85-
fn from(values: &'a [T]) -> Self {
84+
impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T>
85+
where V: AsRef<[T]> + ?Sized
86+
{
87+
fn from(values: &'a V) -> Self {
88+
let values = values.as_ref();
8689
let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
8790
Self {
8891
values,

fastfield_codecs/src/gcd.rs

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
use std::io::{self, Write};
2+
use std::num::NonZeroU64;
3+
4+
use common::BinarySerializable;
5+
use fastdivide::DividerU64;
6+
7+
/// Parameter triple for GCD-based encoding.
///
/// The `BinarySerializable` impl for this type writes and reads the three
/// fields as consecutive `u64` values in declaration order.
#[derive(Debug, Clone, Copy)]
pub struct GCDParams {
    /// Common divisor of the encoded values.
    // NOTE(review): presumably the gcd of `value - min_value`; confirm against the codec.
    pub gcd: u64,
    /// Smallest value of the column (assumed from the name; confirm against the writer).
    pub min_value: u64,
    /// Number of values in the column (assumed from the name; confirm against the writer).
    pub num_vals: u64,
}
13+
14+
impl BinarySerializable for GCDParams {
15+
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
16+
self.gcd.serialize(writer)?;
17+
self.min_value.serialize(writer)?;
18+
self.num_vals.serialize(writer)?;
19+
Ok(())
20+
}
21+
22+
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
23+
let gcd: u64 = u64::deserialize(reader)?;
24+
let min_value: u64 = u64::deserialize(reader)?;
25+
let num_vals: u64 = u64::deserialize(reader)?;
26+
Ok(Self {
27+
gcd,
28+
min_value,
29+
num_vals,
30+
})
31+
}
32+
}
33+
34+
/// Computes the greatest common divisor of two non-zero numbers.
///
/// It is recommended, but not required, to feed values such that `large >= small`:
/// when `large < small`, the first iteration merely swaps the two operands.
fn compute_gcd(mut large: NonZeroU64, mut small: NonZeroU64) -> NonZeroU64 {
    // Euclid's algorithm. `NonZeroU64::new` yields `None` exactly when the
    // remainder reaches zero, at which point `small` holds the gcd.
    while let Some(rem) = NonZeroU64::new(large.get() % small) {
        (large, small) = (small, rem);
    }
    small
}
47+
48+
// Find GCD for iterator of numbers
49+
pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<NonZeroU64> {
50+
let mut numbers = numbers.flat_map(NonZeroU64::new);
51+
let mut gcd: NonZeroU64 = numbers.next()?;
52+
if gcd.get() == 1 {
53+
return Some(gcd);
54+
}
55+
56+
let mut gcd_divider = DividerU64::divide_by(gcd.get());
57+
for val in numbers {
58+
let remainder = val.get() - (gcd_divider.divide(val.get())) * gcd.get();
59+
if remainder == 0 {
60+
continue;
61+
}
62+
gcd = compute_gcd(val, gcd);
63+
if gcd.get() == 1 {
64+
return Some(gcd);
65+
}
66+
67+
gcd_divider = DividerU64::divide_by(gcd.get());
68+
}
69+
Some(gcd)
70+
}
71+
72+
#[cfg(test)]
mod tests {
    use std::io;
    use std::num::NonZeroU64;

    use ownedbytes::OwnedBytes;

    use crate::gcd::{compute_gcd, find_gcd};
    use crate::{FastFieldCodecType, VecColumn};

    /// Serializes `num_vals` signed multiples of 1000 (starting at -4000) with
    /// `codec_type` plus the Gcd codec, reopens the column and checks a few
    /// values and the min/max.
    fn test_fastfield_gcd_i64_with_codec(
        codec_type: FastFieldCodecType,
        num_vals: usize,
    ) -> io::Result<()> {
        let mut vals: Vec<i64> = (-4..=(num_vals as i64) - 5).map(|val| val * 1000).collect();
        let mut buffer: Vec<u8> = Vec::new();
        crate::serialize(
            VecColumn::from(&vals),
            &mut buffer,
            &[codec_type, FastFieldCodecType::Gcd],
        )?;
        let buffer = OwnedBytes::new(buffer);
        let column = crate::open::<i64>(buffer.clone())?;
        assert_eq!(column.get_val(0), -4000i64);
        assert_eq!(column.get_val(1), -3000i64);
        assert_eq!(column.get_val(2), -2000i64);
        assert_eq!(column.max_value(), (num_vals as i64 - 5) * 1000);
        assert_eq!(column.min_value(), -4000i64);

        // Replace the last value with 1001 so the values no longer share the
        // factor 1000, and serialize without the Gcd codec: the output is
        // expected to be larger than the gcd-encoded one.
        let mut buffer_without_gcd = Vec::new();
        vals.pop();
        vals.push(1001i64);
        crate::serialize(
            VecColumn::from(&vals),
            &mut buffer_without_gcd,
            &[codec_type],
        )?;
        let buffer_without_gcd = OwnedBytes::new(buffer_without_gcd);
        assert!(buffer_without_gcd.len() > buffer.len());

        Ok(())
    }

    #[test]
    fn test_fastfield_gcd_i64() -> io::Result<()> {
        for &codec_type in &[
            FastFieldCodecType::Bitpacked,
            FastFieldCodecType::BlockwiseLinear,
            FastFieldCodecType::Linear,
        ] {
            test_fastfield_gcd_i64_with_codec(codec_type, 5500)?;
        }
        Ok(())
    }

    /// Unsigned counterpart of `test_fastfield_gcd_i64_with_codec`: the values
    /// are `1000, 2000, ..., num_vals * 1000`.
    fn test_fastfield_gcd_u64_with_codec(
        codec_type: FastFieldCodecType,
        num_vals: usize,
    ) -> io::Result<()> {
        let mut vals: Vec<u64> = (1..=num_vals).map(|i| i as u64 * 1000u64).collect();
        let mut buffer: Vec<u8> = Vec::new();
        crate::serialize(
            VecColumn::from(&vals),
            &mut buffer,
            &[codec_type, FastFieldCodecType::Gcd],
        )?;
        let buffer = OwnedBytes::new(buffer);
        let column = crate::open::<u64>(buffer.clone())?;
        assert_eq!(column.get_val(0), 1000u64);
        assert_eq!(column.get_val(1), 2000u64);
        assert_eq!(column.get_val(2), 3000u64);
        assert_eq!(column.max_value(), num_vals as u64 * 1000);
        assert_eq!(column.min_value(), 1000u64);

        // Replace the last value with 1001 so the values no longer share the
        // factor 1000, and serialize without the Gcd codec: the output is
        // expected to be larger than the gcd-encoded one.
        let mut buffer_without_gcd = Vec::new();
        vals.pop();
        vals.push(1001u64);
        crate::serialize(
            VecColumn::from(&vals),
            &mut buffer_without_gcd,
            &[codec_type],
        )?;
        let buffer_without_gcd = OwnedBytes::new(buffer_without_gcd);
        assert!(buffer_without_gcd.len() > buffer.len());
        Ok(())
    }

    #[test]
    fn test_fastfield_gcd_u64() -> io::Result<()> {
        for &codec_type in &[
            FastFieldCodecType::Bitpacked,
            FastFieldCodecType::BlockwiseLinear,
            FastFieldCodecType::Linear,
        ] {
            test_fastfield_gcd_u64_with_codec(codec_type, 5500)?;
        }
        Ok(())
    }

    #[test]
    fn test_fastfield2() {
        let test_fastfield = crate::serialize_and_load(&[100u64, 200u64, 300u64]);
        assert_eq!(test_fastfield.get_val(0), 100);
        assert_eq!(test_fastfield.get_val(1), 200);
        assert_eq!(test_fastfield.get_val(2), 300);
    }

    #[test]
    fn test_compute_gcd() {
        // Checks both argument orders: `compute_gcd` must not depend on which
        // operand is the larger one.
        let test_compute_gcd_aux = |large, small, expected| {
            let large = NonZeroU64::new(large).unwrap();
            let small = NonZeroU64::new(small).unwrap();
            let expected = NonZeroU64::new(expected).unwrap();
            assert_eq!(compute_gcd(small, large), expected);
            assert_eq!(compute_gcd(large, small), expected);
        };
        test_compute_gcd_aux(1, 4, 1);
        test_compute_gcd_aux(2, 4, 2);
        test_compute_gcd_aux(10, 25, 5);
        test_compute_gcd_aux(25, 25, 25);
    }

    #[test]
    fn find_gcd_test() {
        // Zeros are ignored; an input without any non-zero value yields `None`.
        assert_eq!(find_gcd([0].into_iter()), None);
        assert_eq!(find_gcd([0, 10].into_iter()), NonZeroU64::new(10));
        assert_eq!(find_gcd([10, 0].into_iter()), NonZeroU64::new(10));
        assert_eq!(find_gcd([].into_iter()), None);
        assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), NonZeroU64::new(5));
        assert_eq!(find_gcd([15, 16, 10].into_iter()), NonZeroU64::new(1));
        assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), NonZeroU64::new(5));
        assert_eq!(find_gcd([0, 0].into_iter()), None);
    }
}

0 commit comments

Comments
 (0)