Skip to content

feat(query): Implement Vector Index with HNSW Algorithm #18134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 31 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ base64 = "0.22"
bincode = { version = "2.0.0-rc.3", features = ["serde", "std", "alloc"] }
bincode_v1 = { package = "bincode", version = "1.3.3" }
bitpacking = "0.8.0"
bitvec = "1.0.1"
blake3 = "1.3.1"
bollard = { version = "0.17" }
borsh = { version = "1.2.1", features = ["derive"] }
Expand Down Expand Up @@ -311,6 +312,7 @@ enumflags2 = { version = "0.7.7", features = ["serde"] }
ethnum = { version = "1.5.1" }
faststr = "0.2"
feature-set = { version = "0.1.1" }
feistel-permutation-rs = "0.1.1"
flatbuffers = "25" # Must use the same version as arrow-ipc
foreign_vec = "0.1.0"
form_urlencoded = { version = "1" }
Expand Down Expand Up @@ -393,7 +395,7 @@ num = "0.4.0"
num-bigint = "0.4.6"
num-derive = "0.4.2"
num-traits = "0.2.19"
num_cpus = "1.13.1"
num_cpus = "1.17"
object = "0.36.5"
object_store_opendal = { version = "0.52.0" }
once_cell = "1.15.0"
Expand Down Expand Up @@ -472,6 +474,7 @@ rustls-pemfile = "2"
rustls-pki-types = "1"
rustyline = "14"
scroll = "0.12.0"
self_cell = "1.2.0"
semver = "1.0.14"
serde = { version = "1.0.164", features = ["derive", "rc"] }
serde_derive = "1"
Expand Down Expand Up @@ -542,6 +545,7 @@ wiremock = "0.6"
wkt = "0.11.1"
xorf = { version = "0.11.0", default-features = false, features = ["binary-fuse"] }
xorfilter-rs = "0.5"
zerocopy = "0.8.26"
zip = "3.0.0"
zstd = "0.12.3"

Expand Down
74 changes: 72 additions & 2 deletions src/common/metrics/src/metrics/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,24 @@ static BLOCK_INVERTED_INDEX_READ_MILLISECONDS: LazyLock<Histogram> = LazyLock::n
static BLOCK_INVERTED_INDEX_SEARCH_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
register_histogram_in_milliseconds("fuse_block_inverted_index_search_milliseconds")
});
// Vector index metrics. Each static is lazily initialised on first access and
// is updated by the matching `metrics_inc_block_vector_index_*` function.

/// Counter registered as `fuse_block_vector_index_write_nums`.
static BLOCK_VECTOR_INDEX_WRITE_NUMS: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_block_vector_index_write_nums"));
/// Counter registered as `fuse_block_vector_index_write_bytes`.
static BLOCK_VECTOR_INDEX_WRITE_BYTES: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_block_vector_index_write_bytes"));
/// Histogram registered via `register_histogram_in_milliseconds` as
/// `fuse_block_vector_index_write_milliseconds`.
static BLOCK_VECTOR_INDEX_WRITE_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
register_histogram_in_milliseconds("fuse_block_vector_index_write_milliseconds")
});
/// Histogram registered via `register_histogram_in_milliseconds` as
/// `fuse_block_vector_index_generate_milliseconds`.
static BLOCK_VECTOR_INDEX_GENERATE_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
register_histogram_in_milliseconds("fuse_block_vector_index_generate_milliseconds")
});
/// Histogram registered via `register_histogram_in_milliseconds` as
/// `fuse_block_vector_index_read_milliseconds`.
static BLOCK_VECTOR_INDEX_READ_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
register_histogram_in_milliseconds("fuse_block_vector_index_read_milliseconds")
});
/// Histogram registered via `register_histogram_in_milliseconds` as
/// `fuse_block_vector_index_pruning_milliseconds`.
static BLOCK_VECTOR_INDEX_PRUNING_MILLISECONDS: LazyLock<Histogram> = LazyLock::new(|| {
register_histogram_in_milliseconds("fuse_block_vector_index_pruning_milliseconds")
});
/// Counter registered as `fuse_block_vector_index_read_bytes`.
static BLOCK_VECTOR_INDEX_READ_BYTES: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_block_vector_index_read_bytes"));
static COMPACT_BLOCK_READ_NUMS: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_compact_block_read_nums"));
static COMPACT_BLOCK_READ_BYTES: LazyLock<Counter> =
Expand Down Expand Up @@ -236,6 +254,14 @@ static BYTES_BLOCK_INVERTED_INDEX_PRUNING_BEFORE: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_bytes_block_inverted_index_pruning_before"));
static BYTES_BLOCK_INVERTED_INDEX_PRUNING_AFTER: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_bytes_block_inverted_index_pruning_after"));
// Vector index pruning counters, kept as before/after pairs for block counts
// and for byte counts.

/// Counter registered as `fuse_blocks_vector_index_pruning_before`.
static BLOCKS_VECTOR_INDEX_PRUNING_BEFORE: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_blocks_vector_index_pruning_before"));
/// Counter registered as `fuse_blocks_vector_index_pruning_after`.
static BLOCKS_VECTOR_INDEX_PRUNING_AFTER: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_blocks_vector_index_pruning_after"));
/// Counter registered as `fuse_bytes_block_vector_index_pruning_before`.
static BYTES_BLOCK_VECTOR_INDEX_PRUNING_BEFORE: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_bytes_block_vector_index_pruning_before"));
/// Counter registered as `fuse_bytes_block_vector_index_pruning_after`.
static BYTES_BLOCK_VECTOR_INDEX_PRUNING_AFTER: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_bytes_block_vector_index_pruning_after"));
static PRUNING_PREWHERE_NUMS: LazyLock<Counter> =
LazyLock::new(|| register_counter("fuse_pruning_prewhere_nums"));
static PRUNING_MILLISECONDS: LazyLock<Histogram> =
Expand Down Expand Up @@ -548,6 +574,10 @@ pub fn metrics_inc_block_index_write_bytes(c: u64) {
BLOCK_INDEX_WRITE_BYTES.inc_by(c);
}

/// Increments the `BLOCK_INDEX_READ_BYTES` metric by `c`.
pub fn metrics_inc_block_index_read_bytes(c: u64) {
BLOCK_INDEX_READ_BYTES.inc_by(c);
}

/// Records `c` as one observation on `BLOCK_INDEX_WRITE_MILLISECONDS`.
///
/// The value is cast to `f64` before being observed.
pub fn metrics_inc_block_index_write_milliseconds(c: u64) {
BLOCK_INDEX_WRITE_MILLISECONDS.observe(c as f64);
}
Expand Down Expand Up @@ -576,8 +606,32 @@ pub fn metrics_inc_block_inverted_index_search_milliseconds(c: u64) {
BLOCK_INVERTED_INDEX_SEARCH_MILLISECONDS.observe(c as f64);
}

pub fn metrics_inc_block_index_read_bytes(c: u64) {
BLOCK_INDEX_READ_BYTES.inc_by(c);
pub fn metrics_inc_block_vector_index_write_nums(c: u64) {
BLOCK_VECTOR_INDEX_WRITE_NUMS.inc_by(c);
}

/// Adds `c` to the `fuse_block_vector_index_write_bytes` counter.
pub fn metrics_inc_block_vector_index_write_bytes(c: u64) {
BLOCK_VECTOR_INDEX_WRITE_BYTES.inc_by(c);
}

/// Records `c` as one observation in the
/// `fuse_block_vector_index_write_milliseconds` histogram.
///
/// Despite the `inc` in the name, this observes a sample (cast to `f64`)
/// rather than incrementing a counter.
pub fn metrics_inc_block_vector_index_write_milliseconds(c: u64) {
BLOCK_VECTOR_INDEX_WRITE_MILLISECONDS.observe(c as f64);
}

/// Records `c` as one observation in the
/// `fuse_block_vector_index_generate_milliseconds` histogram.
///
/// Despite the `inc` in the name, this observes a sample (cast to `f64`)
/// rather than incrementing a counter.
pub fn metrics_inc_block_vector_index_generate_milliseconds(c: u64) {
BLOCK_VECTOR_INDEX_GENERATE_MILLISECONDS.observe(c as f64);
}

/// Records `c` as one observation in the
/// `fuse_block_vector_index_read_milliseconds` histogram.
///
/// Despite the `inc` in the name, this observes a sample (cast to `f64`)
/// rather than incrementing a counter.
pub fn metrics_inc_block_vector_index_read_milliseconds(c: u64) {
BLOCK_VECTOR_INDEX_READ_MILLISECONDS.observe(c as f64);
}

/// Records `c` as one observation in the
/// `fuse_block_vector_index_pruning_milliseconds` histogram.
///
/// Despite the `inc` in the name, this observes a sample (cast to `f64`)
/// rather than incrementing a counter.
pub fn metrics_inc_block_vector_index_pruning_milliseconds(c: u64) {
BLOCK_VECTOR_INDEX_PRUNING_MILLISECONDS.observe(c as f64);
}

/// Adds `c` to the `fuse_block_vector_index_read_bytes` counter.
pub fn metrics_inc_block_vector_index_read_bytes(c: u64) {
BLOCK_VECTOR_INDEX_READ_BYTES.inc_by(c);
}

/// Compact metrics.
Expand Down Expand Up @@ -670,6 +724,22 @@ pub fn metrics_inc_bytes_block_inverted_index_pruning_after(c: u64) {
BYTES_BLOCK_INVERTED_INDEX_PRUNING_AFTER.inc_by(c);
}

/// Adds `c` to the `fuse_blocks_vector_index_pruning_before` counter.
pub fn metrics_inc_blocks_vector_index_pruning_before(c: u64) {
BLOCKS_VECTOR_INDEX_PRUNING_BEFORE.inc_by(c);
}

/// Adds `c` to the `fuse_blocks_vector_index_pruning_after` counter.
pub fn metrics_inc_blocks_vector_index_pruning_after(c: u64) {
BLOCKS_VECTOR_INDEX_PRUNING_AFTER.inc_by(c);
}

/// Adds `c` to the `fuse_bytes_block_vector_index_pruning_before` counter.
pub fn metrics_inc_bytes_block_vector_index_pruning_before(c: u64) {
BYTES_BLOCK_VECTOR_INDEX_PRUNING_BEFORE.inc_by(c);
}

/// Adds `c` to the `fuse_bytes_block_vector_index_pruning_after` counter.
pub fn metrics_inc_bytes_block_vector_index_pruning_after(c: u64) {
BYTES_BLOCK_VECTOR_INDEX_PRUNING_AFTER.inc_by(c);
}

/// Adds `c` to the `fuse_pruning_prewhere_nums` counter.
pub fn metrics_inc_pruning_prewhere_nums(c: u64) {
PRUNING_PREWHERE_NUMS.inc_by(c);
}
Expand Down
80 changes: 56 additions & 24 deletions src/common/vector/src/distance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,69 +16,101 @@ use databend_common_exception::ErrorCode;
use databend_common_exception::Result;
use ndarray::ArrayView;

pub fn cosine_distance(from: &[f32], to: &[f32]) -> Result<f32> {
if from.len() != to.len() {
pub fn cosine_distance(lhs: &[f32], rhs: &[f32]) -> Result<f32> {
if lhs.len() != rhs.len() {
return Err(ErrorCode::InvalidArgument(format!(
"Vector length not equal: {:} != {:}",
from.len(),
to.len(),
lhs.len(),
rhs.len(),
)));
}

let a = ArrayView::from(from);
let b = ArrayView::from(to);
let a = ArrayView::from(lhs);
let b = ArrayView::from(rhs);
let aa_sum = (&a * &a).sum();
let bb_sum = (&b * &b).sum();

Ok(1.0 - (&a * &b).sum() / ((aa_sum).sqrt() * (bb_sum).sqrt()))
}

pub fn l2_distance(from: &[f32], to: &[f32]) -> Result<f32> {
if from.len() != to.len() {
pub fn l1_distance(lhs: &[f32], rhs: &[f32]) -> Result<f32> {
if lhs.len() != rhs.len() {
return Err(ErrorCode::InvalidArgument(format!(
"Vector length not equal: {:} != {:}",
from.len(),
to.len(),
lhs.len(),
rhs.len(),
)));
}

Ok(from
Ok(lhs
.iter()
.zip(to.iter())
.zip(rhs.iter())
.map(|(a, b)| (a - b).abs())
.sum::<f32>())
}

/// Computes the L2 (Euclidean) distance between two `f32` vectors.
///
/// The result is the square root of the sum of squared element-wise
/// differences.
///
/// # Errors
///
/// Returns `ErrorCode::InvalidArgument` when `lhs` and `rhs` differ in length.
pub fn l2_distance(lhs: &[f32], rhs: &[f32]) -> Result<f32> {
    let (lhs_len, rhs_len) = (lhs.len(), rhs.len());
    if lhs_len != rhs_len {
        let message = format!("Vector length not equal: {:} != {:}", lhs_len, rhs_len);
        return Err(ErrorCode::InvalidArgument(message));
    }

    // Accumulate with `Iterator::sum` in element order so the floating-point
    // result does not depend on how the chain is written.
    let squared_sum: f32 = lhs.iter().zip(rhs).map(|(x, y)| (x - y).powi(2)).sum();
    Ok(squared_sum.sqrt())
}

pub fn cosine_distance_64(from: &[f64], to: &[f64]) -> Result<f64> {
if from.len() != to.len() {
pub fn cosine_distance_64(lhs: &[f64], rhs: &[f64]) -> Result<f64> {
if lhs.len() != rhs.len() {
return Err(ErrorCode::InvalidArgument(format!(
"Vector length not equal: {:} != {:}",
from.len(),
to.len(),
lhs.len(),
rhs.len(),
)));
}

let a = ArrayView::from(from);
let b = ArrayView::from(to);
let a = ArrayView::from(lhs);
let b = ArrayView::from(rhs);
let aa_sum = (&a * &a).sum();
let bb_sum = (&b * &b).sum();

Ok(1.0 - (&a * &b).sum() / ((aa_sum).sqrt() * (bb_sum).sqrt()))
}

pub fn l2_distance_64(from: &[f64], to: &[f64]) -> Result<f64> {
if from.len() != to.len() {
/// Computes the L1 (Manhattan) distance between two `f64` vectors.
///
/// The result is the sum of the absolute element-wise differences.
///
/// # Errors
///
/// Returns `ErrorCode::InvalidArgument` when `lhs` and `rhs` differ in length.
pub fn l1_distance_64(lhs: &[f64], rhs: &[f64]) -> Result<f64> {
    let (lhs_len, rhs_len) = (lhs.len(), rhs.len());
    if lhs_len != rhs_len {
        let message = format!("Vector length not equal: {:} != {:}", lhs_len, rhs_len);
        return Err(ErrorCode::InvalidArgument(message));
    }

    let total: f64 = lhs.iter().zip(rhs).map(|(x, y)| (x - y).abs()).sum();
    Ok(total)
}

pub fn l2_distance_64(lhs: &[f64], rhs: &[f64]) -> Result<f64> {
if lhs.len() != rhs.len() {
return Err(ErrorCode::InvalidArgument(format!(
"Vector length not equal: {:} != {:}",
from.len(),
to.len(),
lhs.len(),
rhs.len(),
)));
}

Ok(from
Ok(lhs
.iter()
.zip(to.iter())
.zip(rhs.iter())
.map(|(a, b)| (a - b).powi(2))
.sum::<f64>()
.sqrt())
Expand Down
Loading
Loading