Skip to content

Commit 889a7dc

Browse files
committed
feat(query): Implement Vector Index with HNSW Algorithm
1 parent b9e4e50 commit 889a7dc

File tree

101 files changed

+7486
-113
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

101 files changed

+7486
-113
lines changed

Cargo.lock

Lines changed: 32 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -263,6 +263,7 @@ base64 = "0.22"
263263
bincode = { version = "2.0.0-rc.3", features = ["serde", "std", "alloc"] }
264264
bincode_v1 = { package = "bincode", version = "1.3.3" }
265265
bitpacking = "0.8.0"
266+
bitvec = "1.0.1"
266267
blake3 = "1.3.1"
267268
bollard = { version = "0.17" }
268269
borsh = { version = "1.2.1", features = ["derive"] }
@@ -311,6 +312,7 @@ enumflags2 = { version = "0.7.7", features = ["serde"] }
311312
ethnum = { version = "1.5.1" }
312313
faststr = "0.2"
313314
feature-set = { version = "0.1.1" }
315+
feistel-permutation-rs = "0.1.1"
314316
flatbuffers = "25" # Must use the same version with arrow-ipc
315317
foreign_vec = "0.1.0"
316318
form_urlencoded = { version = "1" }
@@ -393,7 +395,7 @@ num = "0.4.0"
393395
num-bigint = "0.4.6"
394396
num-derive = "0.4.2"
395397
num-traits = "0.2.19"
396-
num_cpus = "1.13.1"
398+
num_cpus = "1.17"
397399
object = "0.36.5"
398400
object_store_opendal = { version = "0.52.0" }
399401
once_cell = "1.15.0"
@@ -472,6 +474,7 @@ rustls-pemfile = "2"
472474
rustls-pki-types = "1"
473475
rustyline = "14"
474476
scroll = "0.12.0"
477+
self_cell = "1.2.0"
475478
semver = "1.0.14"
476479
serde = { version = "1.0.164", features = ["derive", "rc"] }
477480
serde_derive = "1"
@@ -542,6 +545,7 @@ wiremock = "0.6"
542545
wkt = "0.11.1"
543546
xorf = { version = "0.11.0", default-features = false, features = ["binary-fuse"] }
544547
xorfilter-rs = "0.5"
548+
zerocopy = "0.8.26"
545549
zip = "3.0.0"
546550
zstd = "0.12.3"
547551

src/common/metrics/src/metrics/storage.rs

Lines changed: 48 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -177,6 +177,24 @@ static BLOCK_INVERTED_INDEX_READ_MILLISECONDS: LazyLock<Histogram> = LazyLock::n
177177
static BLOCK_INVERTED_INDEX_SEARCH_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
178178
register_histogram_in_milliseconds("fuse_block_inverted_index_search_milliseconds")
179179
});
180+
static BLOCK_VECTOR_INDEX_WRITE_NUMS: LazyLock<Counter> =
181+
LazyLock::new(|| register_counter("fuse_block_vector_index_write_nums"));
182+
static BLOCK_VECTOR_INDEX_WRITE_BYTES: LazyLock<Counter> =
183+
LazyLock::new(|| register_counter("fuse_block_vector_index_write_bytes"));
184+
static BLOCK_VECTOR_INDEX_WRITE_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
185+
register_histogram_in_milliseconds("fuse_block_vector_index_write_milliseconds")
186+
});
187+
static BLOCK_VECTOR_INDEX_GENERATE_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
188+
register_histogram_in_milliseconds("fuse_block_vector_index_generate_milliseconds")
189+
});
190+
static BLOCK_VECTOR_INDEX_READ_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
191+
register_histogram_in_milliseconds("fuse_block_vector_index_read_milliseconds")
192+
});
193+
static BLOCK_VECTOR_INDEX_SEARCH_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
194+
register_histogram_in_milliseconds("fuse_block_vector_index_search_milliseconds")
195+
});
196+
static BLOCK_VECTOR_INDEX_READ_BYTES: LazyLock<Counter> =
197+
LazyLock::new(|| register_counter("fuse_block_vector_index_read_bytes"));
180198
static COMPACT_BLOCK_READ_NUMS: LazyLock<Counter> =
181199
LazyLock::new(|| register_counter("fuse_compact_block_read_nums"));
182200
static COMPACT_BLOCK_READ_BYTES: LazyLock<Counter> =
@@ -548,6 +566,10 @@ pub fn metrics_inc_block_index_write_bytes(c: u64) {
548566
BLOCK_INDEX_WRITE_BYTES.inc_by(c);
549567
}
550568

569+
pub fn metrics_inc_block_index_read_bytes(c: u64) {
570+
BLOCK_INDEX_READ_BYTES.inc_by(c);
571+
}
572+
551573
pub fn metrics_inc_block_index_write_milliseconds(c: u64) {
552574
BLOCK_INDEX_WRITE_MILLISECONDS.observe(c as f64);
553575
}
@@ -576,8 +598,32 @@ pub fn metrics_inc_block_inverted_index_search_milliseconds(c: u64) {
576598
BLOCK_INVERTED_INDEX_SEARCH_MILLISECONDS.observe(c as f64);
577599
}
578600

579-
pub fn metrics_inc_block_index_read_bytes(c: u64) {
580-
BLOCK_INDEX_READ_BYTES.inc_by(c);
601+
pub fn metrics_inc_block_vector_index_write_nums(c: u64) {
602+
BLOCK_VECTOR_INDEX_WRITE_NUMS.inc_by(c);
603+
}
604+
605+
pub fn metrics_inc_block_vector_index_write_bytes(c: u64) {
606+
BLOCK_VECTOR_INDEX_WRITE_BYTES.inc_by(c);
607+
}
608+
609+
pub fn metrics_inc_block_vector_index_write_milliseconds(c: u64) {
610+
BLOCK_VECTOR_INDEX_WRITE_MILLISECONDS.observe(c as f64);
611+
}
612+
613+
pub fn metrics_inc_block_vector_index_generate_milliseconds(c: u64) {
614+
BLOCK_VECTOR_INDEX_GENERATE_MILLISECONDS.observe(c as f64);
615+
}
616+
617+
pub fn metrics_inc_block_vector_index_read_milliseconds(c: u64) {
618+
BLOCK_VECTOR_INDEX_READ_MILLISECONDS.observe(c as f64);
619+
}
620+
621+
pub fn metrics_inc_block_vector_index_search_milliseconds(c: u64) {
622+
BLOCK_VECTOR_INDEX_SEARCH_MILLISECONDS.observe(c as f64);
623+
}
624+
625+
pub fn metrics_inc_block_vector_index_read_bytes(c: u64) {
626+
BLOCK_VECTOR_INDEX_READ_BYTES.inc_by(c);
581627
}
582628

583629
/// Compact metrics.

src/common/vector/src/distance.rs

Lines changed: 56 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -16,69 +16,101 @@ use databend_common_exception::ErrorCode;
1616
use databend_common_exception::Result;
1717
use ndarray::ArrayView;
1818

19-
pub fn cosine_distance(from: &[f32], to: &[f32]) -> Result<f32> {
20-
if from.len() != to.len() {
19+
pub fn cosine_distance(lhs: &[f32], rhs: &[f32]) -> Result<f32> {
20+
if lhs.len() != rhs.len() {
2121
return Err(ErrorCode::InvalidArgument(format!(
2222
"Vector length not equal: {:} != {:}",
23-
from.len(),
24-
to.len(),
23+
lhs.len(),
24+
rhs.len(),
2525
)));
2626
}
2727

28-
let a = ArrayView::from(from);
29-
let b = ArrayView::from(to);
28+
let a = ArrayView::from(lhs);
29+
let b = ArrayView::from(rhs);
3030
let aa_sum = (&a * &a).sum();
3131
let bb_sum = (&b * &b).sum();
3232

3333
Ok(1.0 - (&a * &b).sum() / ((aa_sum).sqrt() * (bb_sum).sqrt()))
3434
}
3535

36-
pub fn l2_distance(from: &[f32], to: &[f32]) -> Result<f32> {
37-
if from.len() != to.len() {
36+
pub fn l1_distance(lhs: &[f32], rhs: &[f32]) -> Result<f32> {
37+
if lhs.len() != rhs.len() {
3838
return Err(ErrorCode::InvalidArgument(format!(
3939
"Vector length not equal: {:} != {:}",
40-
from.len(),
41-
to.len(),
40+
lhs.len(),
41+
rhs.len(),
4242
)));
4343
}
4444

45-
Ok(from
45+
Ok(lhs
4646
.iter()
47-
.zip(to.iter())
47+
.zip(rhs.iter())
48+
.map(|(a, b)| (a - b).abs())
49+
.sum::<f32>())
50+
}
51+
52+
pub fn l2_distance(lhs: &[f32], rhs: &[f32]) -> Result<f32> {
53+
if lhs.len() != rhs.len() {
54+
return Err(ErrorCode::InvalidArgument(format!(
55+
"Vector length not equal: {:} != {:}",
56+
lhs.len(),
57+
rhs.len(),
58+
)));
59+
}
60+
61+
Ok(lhs
62+
.iter()
63+
.zip(rhs.iter())
4864
.map(|(a, b)| (a - b).powi(2))
4965
.sum::<f32>()
5066
.sqrt())
5167
}
5268

53-
pub fn cosine_distance_64(from: &[f64], to: &[f64]) -> Result<f64> {
54-
if from.len() != to.len() {
69+
pub fn cosine_distance_64(lhs: &[f64], rhs: &[f64]) -> Result<f64> {
70+
if lhs.len() != rhs.len() {
5571
return Err(ErrorCode::InvalidArgument(format!(
5672
"Vector length not equal: {:} != {:}",
57-
from.len(),
58-
to.len(),
73+
lhs.len(),
74+
rhs.len(),
5975
)));
6076
}
6177

62-
let a = ArrayView::from(from);
63-
let b = ArrayView::from(to);
78+
let a = ArrayView::from(lhs);
79+
let b = ArrayView::from(rhs);
6480
let aa_sum = (&a * &a).sum();
6581
let bb_sum = (&b * &b).sum();
6682

6783
Ok(1.0 - (&a * &b).sum() / ((aa_sum).sqrt() * (bb_sum).sqrt()))
6884
}
6985

70-
pub fn l2_distance_64(from: &[f64], to: &[f64]) -> Result<f64> {
71-
if from.len() != to.len() {
86+
pub fn l1_distance_64(lhs: &[f64], rhs: &[f64]) -> Result<f64> {
87+
if lhs.len() != rhs.len() {
88+
return Err(ErrorCode::InvalidArgument(format!(
89+
"Vector length not equal: {:} != {:}",
90+
lhs.len(),
91+
rhs.len(),
92+
)));
93+
}
94+
95+
Ok(lhs
96+
.iter()
97+
.zip(rhs.iter())
98+
.map(|(a, b)| (a - b).abs())
99+
.sum::<f64>())
100+
}
101+
102+
pub fn l2_distance_64(lhs: &[f64], rhs: &[f64]) -> Result<f64> {
103+
if lhs.len() != rhs.len() {
72104
return Err(ErrorCode::InvalidArgument(format!(
73105
"Vector length not equal: {:} != {:}",
74-
from.len(),
75-
to.len(),
106+
lhs.len(),
107+
rhs.len(),
76108
)));
77109
}
78110

79-
Ok(from
111+
Ok(lhs
80112
.iter()
81-
.zip(to.iter())
113+
.zip(rhs.iter())
82114
.map(|(a, b)| (a - b).powi(2))
83115
.sum::<f64>()
84116
.sqrt())

src/common/vector/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,5 +16,7 @@ mod distance;
1616

1717
pub use distance::cosine_distance;
1818
pub use distance::cosine_distance_64;
19+
pub use distance::l1_distance;
20+
pub use distance::l1_distance_64;
1921
pub use distance::l2_distance;
2022
pub use distance::l2_distance_64;

0 commit comments

Comments
 (0)