Dorer SQ8 dist functions [MOD-9626] #673

Merged
merged 54 commits on Jun 9, 2025
Changes from 49 commits
Commits (54)
69d63ac
add sq8
dor-forer May 11, 2025
af85432
Change to IP_AVX512F
dor-forer May 11, 2025
b215799
Change
dor-forer May 11, 2025
8b4188b
vec1
dor-forer May 11, 2025
a1d1a16
float
dor-forer May 11, 2025
b5860bb
finish
dor-forer May 11, 2025
0d07d71
now
dor-forer May 11, 2025
66c49e8
remove Choose_SQ8_Cosine_implementation_AVX512F
dor-forer May 11, 2025
aa26c71
in test
dor-forer May 11, 2025
43b58a8
alignemnt
dor-forer May 11, 2025
1e12fa3
back to bw
dor-forer May 11, 2025
984a030
back again
dor-forer May 11, 2025
c3670a8
again
dor-forer May 11, 2025
11303b7
optimization
dor-forer May 11, 2025
7474c05
more BW
dor-forer May 11, 2025
2cfd9b6
fix avx
dor-forer May 11, 2025
3cdf05e
add avx cosine test
dor-forer May 11, 2025
fc8bc7d
avx
dor-forer May 11, 2025
513839b
add impl
dor-forer May 11, 2025
f676c1b
add l2
dor-forer May 11, 2025
9a899cc
replace OPT_AVX512_F_BW_VL_VNNI
dor-forer May 11, 2025
4fa5327
align
dor-forer May 11, 2025
1379d6d
Fix avx
dor-forer May 11, 2025
f7fdb2b
add l2 sse
dor-forer May 11, 2025
4fa88b2
Remove prints
dor-forer May 11, 2025
4476833
sve2 l2
dor-forer May 11, 2025
2a7477c
add neon
dor-forer May 12, 2025
b1f502c
fix sve
dor-forer May 12, 2025
dc154b5
add sq8 cosine test
dor-forer May 12, 2025
25a9400
test utils
dor-forer May 12, 2025
9ced0be
static const
dor-forer May 12, 2025
6028dd7
format
dor-forer May 12, 2025
3c2ee11
change to uint
dor-forer May 12, 2025
5c2952c
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer May 12, 2025
ad3985e
format
dor-forer May 12, 2025
41216e6
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer May 14, 2025
76d2fdd
added fma avx2
dor-forer May 18, 2025
b47cc52
format
dor-forer May 18, 2025
6566a0b
remove opt.avx2
dor-forer May 18, 2025
d767ea9
fix OPT_AVX2 bm-spaces
dor-forer May 18, 2025
ea0ac00
pr chanes
dor-forer May 21, 2025
ef09ead
format
dor-forer May 22, 2025
7567730
change to _mm_cvtsi32_si128
dor-forer May 22, 2025
a767547
Change in the l2
dor-forer May 22, 2025
e6422dc
PR changes
dor-forer May 27, 2025
10a6098
added chunk to functions
dor-forer May 27, 2025
767e190
diff squared
dor-forer May 27, 2025
44be275
format
dor-forer May 27, 2025
3a956bf
chnage diff
dor-forer May 27, 2025
5840e3f
Remove align from tests improve sse4
dor-forer Jun 5, 2025
2a89dd8
format
dor-forer Jun 5, 2025
e562a86
applied to l2
dor-forer Jun 8, 2025
2a0b4e6
format
dor-forer Jun 8, 2025
ab18690
Remove alignment l2
dor-forer Jun 8, 2025
9 changes: 9 additions & 0 deletions cmake/x86_64InstructionFlags.cmake
@@ -17,6 +17,7 @@ CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C)
CHECK_CXX_COMPILER_FLAG(-mfma CXX_FMA)
CHECK_CXX_COMPILER_FLAG(-msse4.1 CXX_SSE4)
CHECK_CXX_COMPILER_FLAG(-msse3 CXX_SSE3)
CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)

@@ -60,10 +61,18 @@ if(CXX_AVX2)
add_compile_definitions(OPT_AVX2)
endif()

if(CXX_AVX2 AND CXX_FMA)
add_compile_definitions(OPT_AVX2_FMA)
endif()

if(CXX_AVX)
add_compile_definitions(OPT_AVX)
endif()

if(CXX_SSE4)
add_compile_definitions(OPT_SSE4)
endif()

if(CXX_SSE3)
add_compile_definitions(OPT_SSE3)
endif()
12 changes: 12 additions & 0 deletions src/VecSim/spaces/CMakeLists.txt
@@ -56,6 +56,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA)
message("Building with AVX2 and FMA")
set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
endif()

if(CXX_F16C AND CXX_FMA AND CXX_AVX)
message("Building with CXX_F16C")
set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
@@ -74,6 +80,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/SSE3.cpp)
endif()

if(CXX_SSE4)
message("Building with SSE4")
set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
endif()

if(CXX_SSE)
message("Building with SSE")
set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
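
The OPT_* compile definitions added above gate the optimized translation units at compile time. A sketch of the consuming guard pattern, inferred from the OPT_AVX2_FMA naming (the actual include site is not part of this diff):

#ifdef OPT_AVX2_FMA
// Only reachable when the toolchain accepted both -mavx2 and -mfma.
#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h"
#endif
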
37 changes: 37 additions & 0 deletions src/VecSim/spaces/IP/IP.cpp
@@ -14,6 +14,43 @@
using bfloat16 = vecsim_types::bfloat16;
using float16 = vecsim_types::float16;

float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
float min_val, float delta, float inv_norm) {
float res = 0;
for (size_t i = 0; i < dimension; i++) {
float dequantized_V2 = (pVect2v[i] * delta + min_val);
res += pVect1v[i] * dequantized_V2;
}
return res * inv_norm;
}

float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
const auto *pVect1 = static_cast<const float *>(pVect1v);
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
// pVect2 is a quantized uint8_t vector, structured as
// [quantized values (uint8_t * dim)][min_val (float)][delta (float)][inv_norm (float)].
// min_val and delta are used to dequantize the values; inv_norm is used only by the
// cosine distance.
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
Review comment (Collaborator), on lines +33 to +34: consider remodelling so the metadata is at the start of the vector.

// Compute inner product with dequantization
const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, 1.0f);
return 1.0f - res;
}

float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
const auto *pVect1 = static_cast<const float *>(pVect1v);
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);

// Get quantization parameters
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
// Compute inner product with dequantization
const float res =
FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
return 1.0f - res;
}

float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) {
auto *vec1 = (float *)pVect1;
auto *vec2 = (float *)pVect2;
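
For reference, a minimal sketch of a quantizer producing the layout the comment above describes. The helper name quantizeSQ8 and the choice to take the norm over the original floats are assumptions for illustration; this PR adds only the distance functions, not the quantizer.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical helper (not in this PR): quantize a float vector into the SQ8
// layout [uint8_t values (dim)][min_val (float)][delta (float)][inv_norm (float)].
std::vector<uint8_t> quantizeSQ8(const float *v, size_t dim) {
    float min_val = v[0], max_val = v[0], norm_sq = 0.0f;
    for (size_t i = 0; i < dim; i++) {
        min_val = std::min(min_val, v[i]);
        max_val = std::max(max_val, v[i]);
        norm_sq += v[i] * v[i];
    }
    const float delta = (max_val - min_val) / 255.0f;
    const float inv_norm = 1.0f / std::sqrt(norm_sq); // assumed: norm of the original vector
    std::vector<uint8_t> buf(dim + 3 * sizeof(float));
    for (size_t i = 0; i < dim; i++) {
        // Map each value to its nearest 8-bit level; delta == 0 means a constant vector.
        buf[i] = delta != 0.0f ? static_cast<uint8_t>(std::lround((v[i] - min_val) / delta)) : 0;
    }
    std::memcpy(buf.data() + dim, &min_val, sizeof(float));
    std::memcpy(buf.data() + dim + sizeof(float), &delta, sizeof(float));
    std::memcpy(buf.data() + dim + 2 * sizeof(float), &inv_norm, sizeof(float));
    return buf;
}
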
6 changes: 6 additions & 0 deletions src/VecSim/spaces/IP/IP.h
@@ -10,6 +10,12 @@

#include <cstdlib>

// pVect1v is an fp32 vector; pVect2v is an SQ8-quantized uint8 vector with trailing metadata
float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);

// pVect1v is an fp32 vector; pVect2v is an SQ8-quantized uint8 vector with trailing metadata
float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);

float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);

double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
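
A hypothetical call site for these declarations. It assumes a buffer already in the SQ8 layout (e.g. built by a quantizer like the quantizeSQ8 sketch above) and, for SQ8_Cosine, a pre-normalized query:

#include <cstdint>
#include <vector>

float exampleDistances(const std::vector<float> &query, const std::vector<uint8_t> &stored) {
    const size_t dim = query.size(); // `stored` holds dim bytes plus 3 trailing floats
    float ip_dist = SQ8_InnerProduct(query.data(), stored.data(), dim);
    float cos_dist = SQ8_Cosine(query.data(), stored.data(), dim);
    return ip_dist + cos_dist; // both are distances, i.e. 1 - similarity
}
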
113 changes: 113 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"

static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2,
__m256 &sum256, const __m256 &min_val_vec,
const __m256 &delta_vec) {
// Load 8 float elements from pVect1
__m256 v1 = _mm256_loadu_ps(pVect1);
pVect1 += 8;

// Load 8 uint8 elements from pVect2, convert to int32, then to float
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize and compute dot product in one step using FMA
// (val * delta) + min_val -> v2_dequant
// sum256 += v1 * v2_dequant
// Using FMA: sum256 = v1 * v2_dequant + sum256

// First, compute v2_dequant = v2_f * delta_vec + min_val_vec
__m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);

// Then, compute sum256 += v1 * v2_dequant using FMA
sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256);
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
// pVect2 is a quantized uint8_t vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float *pEnd1 = pVect1 + dimension;

// Get dequantization parameters from the end of quantized vector
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
// Create broadcast vectors for SIMD operations
__m256 min_val_vec = _mm256_set1_ps(min_val);
__m256 delta_vec = _mm256_set1_ps(delta);

__m256 sum256 = _mm256_setzero_ps();

// Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
// 16-float block, so mask loading is guaranteed to be safe.
if constexpr (residual % 8) {
__mmask8 constexpr mask = (1 << (residual % 8)) - 1;
__m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
pVect1 += residual % 8;

// Load quantized values and dequantize
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += residual % 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize using FMA: (val * delta) + min_val
__m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);

// Compute dot product with masking
sum256 = _mm256_mul_ps(v1, v2_dequant);
}

// If the remainder is >= 8, perform another step of 8 floats
if constexpr (residual >= 8) {
InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
}

// We dealt with the residual part. We are left with some multiple of 16 floats.
// In each iteration we calculate 16 floats = 512 bits.
do {
InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
} while (pVect1 < pEnd1);

return my_mm256_reduce_add_ps(sum256);
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Get dequantization parameters from the end of quantized vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
return 1.0f - ip * inv_norm;
}
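
To make the two fused operations concrete, here is a scalar equivalent of one lane of InnerProductStepSQ8_FMA (illustrative only; step_scalar is not part of the PR):

#include <cmath>
#include <cstdint>

float step_scalar(float v1, uint8_t q, float min_val, float delta, float acc) {
    // First FMA: dequantize -> q * delta + min_val (mirrors the first _mm256_fmadd_ps)
    float v2_dequant = std::fma(static_cast<float>(q), delta, min_val);
    // Second FMA: accumulate -> v1 * v2_dequant + acc (mirrors the second _mm256_fmadd_ps)
    return std::fma(v1, v2_dequant, acc);
}
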
107 changes: 107 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -0,0 +1,107 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"

static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
const __m256 &min_val_vec, const __m256 &delta_vec) {
// Load 8 float elements from pVect1
__m256 v1 = _mm256_loadu_ps(pVect1);
pVect1 += 8;

// Load 8 uint8 elements from pVect2, convert to int32, then to float
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize: (val * delta) + min_val
__m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);

// Compute dot product and add to sum
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
// pVect2 is a quantized uint8_t vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float *pEnd1 = pVect1 + dimension;

// Get dequantization parameters from the end of quantized vector
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
// Create broadcast vectors for SIMD operations
__m256 min_val_vec = _mm256_set1_ps(min_val);
__m256 delta_vec = _mm256_set1_ps(delta);

__m256 sum256 = _mm256_setzero_ps();

// Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
// 16-float block, so mask loading is guaranteed to be safe.
if constexpr (residual % 8) {
__mmask8 constexpr mask = (1 << (residual % 8)) - 1;
__m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
pVect1 += residual % 8;

// Load quantized values and dequantize
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += residual % 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize: (val * delta) + min_val
__m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);

// Compute dot product with masking
sum256 = _mm256_mul_ps(v1, v2_dequant);
}

// If the remainder is >= 8, perform another step of 8 floats
if constexpr (residual >= 8) {
InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
}

// We dealt with the residual part. We are left with some multiple of 16 floats.
// In each iteration we calculate 16 floats = 512 bits.
do {
InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
} while (pVect1 < pEnd1);

return my_mm256_reduce_add_ps(sum256);
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Get dequantization parameters from the end of quantized vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
return 1.0f - ip * inv_norm;
}
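
Both headers instantiate their kernels for every residual in 0..15 (dimension mod 16). A minimal sketch of how such residual-templated kernels are typically selected at runtime; the table-building helper below is illustrative, not the project's actual selection macro:

#include <array>
#include <cstddef>
#include <utility>

using DistFn = float (*)(const void *, const void *, size_t);

template <unsigned char... Rs>
static constexpr std::array<DistFn, sizeof...(Rs)>
makeSQ8Table(std::integer_sequence<unsigned char, Rs...>) {
    return {SQ8_InnerProductSIMD16_AVX2<Rs>...};
}

// Pick the instantiation whose residual matches dim % 16.
DistFn chooseSQ8_IP_AVX2(size_t dim) {
    static constexpr auto table = makeSQ8Table(std::make_integer_sequence<unsigned char, 16>{});
    return table[dim % 16];
}
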