From 69d63ac76248a45e696d0f7e538d3da0227ebcab Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 10:25:54 +0300 Subject: [PATCH 01/52] add sq8 --- src/VecSim/spaces/IP/IP.cpp | 43 ++ src/VecSim/spaces/IP/IP.h | 6 + src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 104 ++++ .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h | 176 ++++++ src/VecSim/spaces/IP/IP_AVX_SQ8.h | 116 ++++ src/VecSim/spaces/IP/IP_SSE_SQ8.h | 134 +++++ src/VecSim/spaces/IP/IP_SVE_SQ8.h | 149 +++++ src/VecSim/spaces/IP_space.cpp | 116 ++++ src/VecSim/spaces/IP_space.h | 7 +- src/VecSim/spaces/L2/L2.cpp | 19 + src/VecSim/spaces/L2/L2.h | 2 + src/VecSim/spaces/L2/L2_AVX_SQ8.h | 55 ++ src/VecSim/spaces/L2_space.cpp | 58 ++ src/VecSim/spaces/computer/preprocessors.h | 131 +++++ src/VecSim/spaces/functions/AVX.cpp | 7 + src/VecSim/spaces/functions/AVX.h | 1 + src/VecSim/spaces/functions/AVX2.cpp | 7 + src/VecSim/spaces/functions/AVX2.h | 1 + .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 14 + .../spaces/functions/AVX512F_BW_VL_VNNI.h | 6 + src/VecSim/spaces/functions/SSE.cpp | 13 + src/VecSim/spaces/functions/SSE.h | 2 + src/VecSim/spaces/functions/SVE.cpp | 13 + src/VecSim/spaces/functions/SVE.h | 3 + src/VecSim/spaces/functions/SVE2.cpp | 13 + src/VecSim/spaces/functions/SVE2.h | 3 + tests/benchmark/CMakeLists.txt | 2 +- tests/benchmark/benchmarks.sh | 5 + .../spaces_benchmarks/bm_spaces_sq8.cpp | 81 +++ tests/unit/test_spaces.cpp | 534 +++++++++++++++++- tests/utils/tests_utils.h | 51 ++ 31 files changed, 1869 insertions(+), 3 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_AVX2_SQ8.h create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h create mode 100644 src/VecSim/spaces/IP/IP_AVX_SQ8.h create mode 100644 src/VecSim/spaces/IP/IP_SSE_SQ8.h create mode 100644 src/VecSim/spaces/IP/IP_SVE_SQ8.h create mode 100644 src/VecSim/spaces/L2/L2_AVX_SQ8.h create mode 100644 tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp diff --git a/src/VecSim/spaces/IP/IP.cpp 
b/src/VecSim/spaces/IP/IP.cpp index 638397f0f..a1e5cb8e7 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -10,10 +10,53 @@ #include "VecSim/types/bfloat16.h" #include "VecSim/types/float16.h" #include +#include using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; + +float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, float min_val, + float delta, float inv_norm) { + float res = 0; + std::cout << "\nQuantized values: "; + for (size_t i = 0; i < dimension; i++) { + float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm; + std::cout << dequantized_V2 << ", "; + res += pVect1v[i] * dequantized_V2; + } + std::cout << "\n"; + std::cout << "res before normalization: " << res << std::endl; + return res; +} + +float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + // pVect2 is a vector of int8_t, so we need to dequantize it, normalize it and then multiply it. + // it is structured as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)] + // The last two values are used to dequantize the vector. 
+ const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Compute inner product with dequantization + const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, 1.0f); + return 1.0f - res; +} + +float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + + // Get quantization parameters + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + std::cout << "inv_norm: " << inv_norm << std::endl; + // Compute inner product with dequantization + const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm); + return 1.0f - res; +} + float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) { auto *vec1 = (float *)pVect1; auto *vec2 = (float *)pVect2; diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index a0c5f2838..7dfad24ce 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -10,6 +10,12 @@ #include +/* + pVect1v vector of type fp32 and pVect2v vector of type int8 +*/ +float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); +float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); + float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h new file mode 100644 index 000000000..6d0dd4af7 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. 
+ * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256, + const __m256 &min_val_vec, const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 (AVX2 instruction) + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: (val * delta) + min_val + // Use FMA instruction available in AVX2 for better performance + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product and add to sum (using FMA) + sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256); +} + +template // 0..15 +float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *quantized = (uint8_t *)pVect2v; + + // Get dequantization parameters from the end of quantized vector + float min = *(float *)(quantized + dimension); + float delta = *(float *)(quantized + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min); + __m256 delta_vec = _mm256_set1_ps(delta); + + const float *pEnd1 = pVect1 + dimension; + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed + if constexpr (residual % 8) { + // AVX2 doesn't have native mask loading, so we use the helper function + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + + // Load masked float elements + 
__m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load masked uint8 elements + __m128i v2_128; + if constexpr (residual % 8 <= 4) { + // Load 4 or fewer bytes + uint32_t temp = 0; + memcpy(&temp, quantized, residual % 8); + v2_128 = _mm_cvtsi32_si128(temp); + } else { + // Load 5-7 bytes + uint64_t temp = 0; + memcpy(&temp, quantized, residual % 8); + v2_128 = _mm_cvtsi64_si128(temp); + } + quantized += residual % 8; + + // Zero-extend uint8 to int32 (AVX2 instruction) + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: (val * delta) + min (using FMA) + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product with masking + sum256 = _mm256_mul_ps(v1, v2_dequant); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. + while (pVect1 < pEnd1) { + InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + } + + // Horizontal sum - AVX2 can use more efficient reduction + return 1.0f - my_mm256_reduce_add_ps(sum256); +} diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h new file mode 100644 index 000000000..6c001efcf --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include +#include + +static inline void +SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec) { + // Load 16 float elements from pVec1 + __m512 v1 = _mm512_loadu_ps(pVec1); + + // Load 16 uint8 elements from pVec2 and convert to __m512i + __m128i v2_128 = _mm_loadu_si128((__m128i*)pVec2); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + + // Convert uint8 to float + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize: (val * delta) + min_val + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product and add to sum + sum = _mm512_fmadd_ps(v1, dequantized, sum); + + // Advance pointers + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for both inner product and cosine similarity +template // 0..63 +float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { + const float *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const uint8_t *pEnd2 = pVec2 + dimension; + + // Get dequantization parameters from the end of pVec2 + const float min_val = *reinterpret_cast(pVec2 + dimension); + const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m512 min_val_vec = _mm512_set1_ps(min_val); + __m512 delta_vec = _mm512_set1_ps(delta); + + // Initialize sum accumulator + __m512 sum = _mm512_setzero_ps(); + + // Deal with remainder first + if constexpr (residual) { + if constexpr (residual < 16) { + // Handle less than 16 elements + __mmask16 mask = (1U << residual) - 1; + + // Load masked float elements + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); + + // Load masked uint8 elements + __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // 
Dequantize + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product + sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized); + } + else if constexpr (residual == 16) { + // Handle exactly 16 elements + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } + else if constexpr (residual < 32) { + // Handle 16-31 elements: process 16 and then remainder + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + + // Process remaining elements (residual - 16) + constexpr unsigned char remaining = residual - 16; + __mmask16 mask = (1U << remaining) - 1; + + // Load masked float elements + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); + + // Load masked uint8 elements + __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product + sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized); + } + else if constexpr (residual == 32) { + // Handle exactly 32 elements: process two chunks of 16 + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } + else { + // Handle more than 32 elements: process chunks of 16 until less than 16 remain + constexpr size_t full_chunks = residual / 16; + for (size_t i = 0; i < full_chunks; i++) { + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } + + // Process remaining elements (residual % 16) + constexpr unsigned char remaining = residual % 16; + if constexpr (remaining > 0) { + __mmask16 mask = (1U << remaining) - 1; + + // Load masked float elements + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); + + // Load masked uint8 elements + __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 
v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product + sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized); + } + } + + pVec1 += residual; + pVec2 += residual; + } + + // Process remaining full chunks of 16 elements + while (pVec2 < pEnd2) { + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } + + // Horizontal sum + float result = _mm512_reduce_add_ps(sum); + + // Return 1 - result as per the pattern in other implementations + return result; +} + +template // 0..63 +float SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v, + const void *pVec2v, + size_t dimension) { + // Calculate inner product using common implementation + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + std::cout << "result: " << ip << std::endl; + + // Return 1 - result as per the pattern in other implementations + return 1.0f - ip; +} + +template // 0..63 +float SQ8_CosineSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Calculate inner product using common implementation + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get the inverse norm factor stored after min_val and delta + const uint8_t *pVec2 = static_cast(pVec2v); + const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); + std::cout << "result2: " << ip << std::endl; + // Return 1 - (ip * inv_norm) as per the pattern in other implementations + return 1.0f - ip * inv_norm; +} + diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h new file mode 100644 index 000000000..38c836652 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. 
+ * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256, + const __m256 &min_val_vec, const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: (val * delta) + min_val + __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); + + // Compute dot product and add to sum + sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant)); +} + +template // 0..15 +float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *quantized = (uint8_t *)pVect2v; + + // Get dequantization parameters from the end of quantized vector + float min = *(float *)(quantized + dimension); + float delta = *(float *)(quantized + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min); + __m256 delta_vec = _mm256_set1_ps(delta); + + const float *pEnd1 = pVect1 + dimension; + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed + if constexpr (residual % 8) { + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + + // Load masked float elements + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load masked uint8 elements + __m128i v2_128; + if constexpr 
(residual % 8 <= 4) { + // Load 4 or fewer bytes directly using unaligned loads and shifts + uint32_t temp = 0; + // Direct byte-by-byte loading to avoid memcpy + switch (residual % 8) { + case 4: temp |= (uint32_t)quantized[3] << 24; + case 3: temp |= (uint32_t)quantized[2] << 16; + case 2: temp |= (uint32_t)quantized[1] << 8; + case 1: temp |= quantized[0]; + } + v2_128 = _mm_cvtsi32_si128(temp); + } else { + // Load 5-7 bytes directly using unaligned loads and shifts + uint64_t temp = 0; + // Direct byte-by-byte loading to avoid memcpy + switch (residual % 8) { + case 7: temp |= (uint64_t)quantized[6] << 48; + case 6: temp |= (uint64_t)quantized[5] << 40; + case 5: temp |= (uint64_t)quantized[4] << 32; + case 4: temp |= (uint64_t)quantized[3] << 24; + case 3: temp |= (uint64_t)quantized[2] << 16; + case 2: temp |= (uint64_t)quantized[1] << 8; + case 1: temp |= quantized[0]; + } + v2_128 = _mm_cvtsi64_si128(temp); + } + quantized += residual % 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: (val * delta) + min + __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); + + // Compute dot product with masking + sum256 = _mm256_mul_ps(v1, v2_dequant); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. 
+ do { + InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + } while (pVect1 < pEnd1); + + return 1.0f - my_mm256_reduce_add_ps(sum256); +} diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h new file mode 100644 index 000000000..deced094c --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "VecSim/spaces/space_includes.h" +#include +#include + +static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum_prod, + const __m128 &min_val_vec, const __m128 &delta_vec) { + // Load 4 float elements from pVect1 + __m128 v1 = _mm_loadu_ps(pVect1); + pVect1 += 4; + + // Load 4 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float*)pVect2))); + pVect2 += 4; + + // Convert int32 to float + __m128 v2_f = _mm_cvtepi32_ps(v2_i); + + // Dequantize: (val * delta) + min_val + __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec); + + // Compute dot product and add to sum + sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant)); +} + +template // 0..15 +float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *quantized = (uint8_t *)pVect2v; + + // Get dequantization parameters from the end of quantized vector + float min = *(float *)(quantized + dimension); + float delta = *(float *)(quantized + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m128 min_val_vec = _mm_set1_ps(min); + __m128 delta_vec = 
_mm_set1_ps(delta); + + const float *pEnd1 = pVect1 + dimension; + + __m128 sum = _mm_setzero_ps(); + + // Process residual elements if needed + if constexpr (residual) { + // Handle residual elements (1-3) + if constexpr (residual % 4) { + __m128 v1; + __m128 v2_dequant = _mm_setzero_ps(); + + if constexpr (residual % 4 == 3) { + // Load 3 floats and set the last one to 0 + v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 + v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part + + // Dequantize first value + float dequant0 = quantized[0] * delta + min; + v2_dequant = _mm_load_ss(&dequant0); + + // Dequantize next two values + float dequant_high[2] = { + quantized[1] * delta + min, + quantized[2] * delta + min + }; + v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high); + + } else if constexpr (residual % 4 == 2) { + // Load 2 floats and set the last two to 0 + v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1); + + // Dequantize two values + float dequant_high[2] = { + quantized[0] * delta + min, + quantized[1] * delta + min + }; + v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high); + + } else if constexpr (residual % 4 == 1) { + // Load 1 float and set the last three to 0 + v1 = _mm_load_ss(pVect1); + + // Dequantize one value + float dequant0 = quantized[0] * delta + min; + v2_dequant = _mm_load_ss(&dequant0); + } + + pVect1 += residual % 4; + quantized += residual % 4; + sum = _mm_mul_ps(v1, v2_dequant); + } + } + + // Process 4 elements at a time + while (pVect1 < pEnd1) { + InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec); + } + + // TmpRes must be 16 bytes aligned. 
+ float PORTABLE_ALIGN16 TmpRes[4]; + _mm_store_ps(TmpRes, sum); + float result = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + + return result; +} + +template // 0..15 +float SQ8_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_InnerProductSIMD16_SSE_IMP(pVect1v, pVect2v, dimension); +} + + +template // 0..15 +float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) { + + const uint8_t *pVect2 = static_cast(pVect2v); + // Get quantization parameters + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Compute inner product with dequantization using the common function + // We need to cast away const for the inner product function, but it doesn't modify the vectors + const float res = SQ8_InnerProductSIMD16_SSE_IMP(pVect1v, pVect2v, dimension); + + std::cout << "res before normalization sse: " << res << std::endl; + std::cout << "inv_norm: " << inv_norm << std::endl; + // For cosine, we need to account for the vector norms + // The inv_norm parameter is stored after min_val and delta in the quantized vector + return 1.0f - res * inv_norm; +} diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h new file mode 100644 index 000000000..d6c0faa3d --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ +#include "VecSim/spaces/space_includes.h" +#include +#include +#include + +static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &offset, + svfloat32_t &sum, const svfloat32_t &min_val_vec, + const svfloat32_t &delta_vec) { + svbool_t pg = svptrue_b32(); + + // Load float elements from pVect1 + svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); + + // Load uint8 elements from pVect2, convert to int32, then to float + svbool_t pg_b8 = svptrue_b8(); + svuint8_t v2_u8 = svld1_u8(pg_b8, pVect2 + offset); + + // Convert uint8 to uint32 + svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + + // Convert uint32 to float32 + svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32); + + // Dequantize: (val * delta) + min_val + svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec); + + // Compute dot product and add to sum + sum = svmla_f32_z(pg, sum, v1, v2_dequant); + + // Move to the next set of elements + offset += svcntw(); +} + +template +float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *quantized = (uint8_t *)pVect2v; + size_t offset = 0; + + // Get dequantization parameters from the end of quantized vector + float min = *(float *)(quantized + dimension); + float delta = *(float *)(quantized + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + svbool_t pg = svptrue_b32(); + svfloat32_t min_val_vec = svdup_f32(min); + svfloat32_t delta_vec = svdup_f32(delta); + + // Get the number of 32-bit elements per vector at runtime + uint64_t sve_word_count = svcntw(); + + // Multiple accumulators to increase instruction-level parallelism + svfloat32_t sum0 = svdup_f32(0.0f); + svfloat32_t sum1 = svdup_f32(0.0f); + svfloat32_t sum2 = svdup_f32(0.0f); + svfloat32_t sum3 = svdup_f32(0.0f); + + // Handle partial chunk if needed + if constexpr (partial_chunk) { + size_t remaining = dimension % 
sve_word_count; + if (remaining > 0) { + // Create predicate for the remaining elements + svbool_t pg_partial = svwhilelt_b32(0, remaining); + + // Load float elements from pVect1 with predicate + svfloat32_t v1 = svld1_f32(pg_partial, pVect1); + + // Load uint8 elements from pVect2 with predicate, convert to int32, then to float + svbool_t pg_b8_partial = svwhilelt_b8(0, remaining); + svuint8_t v2_u8 = svld1_u8(pg_b8_partial, quantized); + + // Convert uint8 to uint32 + svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + + // Convert uint32 to float32 + svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); + + // Dequantize: (val * delta) + min_val + svfloat32_t v2_dequant = svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec); + + // Compute dot product and add to sum + sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); + + // Move pointers past the partial chunk + pVect1 += remaining; + quantized += remaining; + } + } + + // Process 4 chunks at a time in the main loop + auto chunk_size = 4 * sve_word_count; + const size_t number_of_chunks = (dimension - (partial_chunk ? 
dimension % sve_word_count : 0)) / chunk_size; + + for (size_t i = 0; i < number_of_chunks; i++) { + InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec); + InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec); + InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, quantized, offset, sum3, min_val_vec, delta_vec); + } + + // Handle remaining steps (0-3) + if constexpr (additional_steps > 0) { + InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec); + } + if constexpr (additional_steps > 1) { + InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec); + } + if constexpr (additional_steps > 2) { + InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec); + } + + // Combine the accumulators + svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); + sum = svadd_f32_z(pg, sum, sum2); + sum = svadd_f32_z(pg, sum, sum3); + + // Horizontal sum of all elements in the vector + float result = svaddv_f32(pg, sum); + + return result; +} + +template +float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); +} + +template +float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect2 = static_cast(pVect2v); + + // Get quantization parameters + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Compute inner product with dequantization using the common function + const float res = SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); + + // For cosine, we need to account for the vector norms + // The inv_norm parameter is stored after min_val and delta in the quantized vector + return 1.0f - res * inv_norm; +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index a74d2e59a..497605744 100644 --- 
a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -33,6 +33,122 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; namespace spaces { + dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_InnerProduct; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + #ifdef CPU_FEATURES_ARCH_AARCH64 + + #ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_IP_implementation_SVE2(dim); + } + #endif + #ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_IP_implementation_SVE(dim); + } + #endif + // #ifdef OPT_NEON + // if (features.asimd) { + // return Choose_SQ8_IP_implementation_NEON(dim); + // } + // #endif + + #endif + + #ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
+ if (dim < 16) { + return ret_dist_func; + } + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } + #endif + #ifdef OPT_AVX + if (features.avx) { + if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + *alignment = 8 * sizeof(float); // handles 8 floats + return Choose_SQ8_IP_implementation_AVX(dim); + } + #endif + #ifdef OPT_SSE + if (features.sse) { + if (dim % 4 == 0) // no point in aligning if we have an offsetting residual + *alignment = 4 * sizeof(float); // handles 4 floats + return Choose_SQ8_IP_implementation_SSE(dim); + } + #endif + #endif // __x86_64__ + return ret_dist_func; + } + +dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_Cosine; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + #ifdef CPU_FEATURES_ARCH_AARCH64 + + #ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_Cosine_implementation_SVE2(dim); + } + #endif + #ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_Cosine_implementation_SVE(dim); + } + #endif + #ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_Cosine_implementation_NEON(dim); + } + #endif + + #endif + + #ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
+ if (dim < 16) { + return ret_dist_func; + } + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } + #endif + // #ifdef OPT_AVX + // if (features.avx) { + // if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + // *alignment = 8 * sizeof(float); // handles 8 floats + // return Choose_SQ8_Cosine_implementation_AVX(dim); + // } + // #endif + #ifdef OPT_SSE + if (features.sse) { + if (dim % 4 == 0) // no point in aligning if we have an offsetting residual + *alignment = 4 * sizeof(float); // handles 4 floats + return Choose_SQ8_Cosine_implementation_SSE(dim); + } + #endif + #endif // __x86_64__ + return ret_dist_func; + } + dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index 70aee6244..e375e8e37 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -5,11 +5,14 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #pragma once #include "VecSim/spaces/spaces.h" namespace spaces { +dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); + dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); dist_func_t IP_FP64_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, @@ -26,4 +29,6 @@ dist_func_t IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = n const void *arch_opt = nullptr); dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 395c158b8..08ea8674c 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -14,6 +14,25 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; +float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { + const auto *pVect1 = static_cast(pVect1v); + const auto *pVect2 = static_cast(pVect2v); + // pVect2 is a vector of int8_t, so we need to dequantize it, normalize it and then take the squared difference. + // it is structured as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)] + // min_val and delta are used to dequantize the vector; inv_norm is used to normalize it.
+ const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + float res = 0; + for (size_t i = 0; i < dimension; i++) { + auto dequantized_normalized_V2 = (pVect2[i] * delta + min_val) * inv_norm; + float t = pVect1[i] - dequantized_normalized_V2; + res += t * t; + } + return res; +} + float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { float *vec1 = (float *)pVect1v; float *vec2 = (float *)pVect2v; diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h index b3ac4d4c7..055e8c630 100644 --- a/src/VecSim/spaces/L2/L2.h +++ b/src/VecSim/spaces/L2/L2.h @@ -10,6 +10,8 @@ #include +float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); + float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); double FP64_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h new file mode 100644 index 000000000..e4cf82c45 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+*/ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void L2SqrStep(float *&pVect1, float *&pVect2, __m256 &sum) { + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + __m256 v2 = _mm256_loadu_ps(pVect2); + pVect2 += 8; + __m256 diff = _mm256_sub_ps(v1, v2); + // sum = _mm256_fmadd_ps(diff, diff, sum); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); +} + +template // 0..15 +float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + float *pVect2 = (float *)pVect2v; + + const float *pEnd1 = pVect1 + dimension; + + __m256 sum = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed + if constexpr (residual % 8) { + __mmask8 constexpr mask8 = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + __m256 v2 = my_mm256_maskz_loadu_ps(pVect2); + pVect2 += residual % 8; + __m256 diff = _mm256_sub_ps(v1, v2); + sum = _mm256_mul_ps(diff, diff); + } + + // If the remainder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + L2SqrStep(pVect1, pVect2, sum); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits.
+ do { + L2SqrStep(pVect1, pVect2, sum); + L2SqrStep(pVect1, pVect2, sum); + } while (pVect1 < pEnd1); + + return my_mm256_reduce_add_ps(sum); +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 5304f1f86..488e2fe5a 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -33,6 +33,64 @@ using float16 = vecsim_types::float16; namespace spaces { + dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (!alignment) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_L2Sqr; + + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + // #ifdef CPU_FEATURES_ARCH_AARCH64 + // #ifdef OPT_SVE2 + // if (features.sve2) { + // return Choose_FP32_L2_implementation_SVE2(dim); + // } + // #endif + // #ifdef OPT_SVE + // if (features.sve) { + // return Choose_FP32_L2_implementation_SVE(dim); + // } + // #endif + // #ifdef OPT_NEON + // if (features.asimd) { + // return Choose_FP32_L2_implementation_NEON(dim); + // } + // #endif + // #endif + + // #ifdef CPU_FEATURES_ARCH_X86_64 + // // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
+ + // if (dim < 16) { + // return ret_dist_func; + // } + // #ifdef OPT_AVX512F + // if (features.avx512f) { + // if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + // *alignment = 16 * sizeof(float); // handles 16 floats + // return Choose_SQ8_L2_implementation_AVX512F(dim); + // } + // #endif + // #ifdef OPT_AVX + // if (features.avx) { + // if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + // *alignment = 8 * sizeof(float); // handles 8 floats + // return Choose_SQ8_L2_implementation_AVX(dim); + // } + // #endif + // #ifdef OPT_SSE + // if (features.sse) { + // if (dim % 4 == 0) // no point in aligning if we have an offsetting residual + // *alignment = 4 * sizeof(float); // handles 4 floats + // return Choose_SQ8_L2_implementation_SSE(dim); + // } + // #endif + // #endif // __x86_64__ + return ret_dist_func; + } + dist_func_t L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (!alignment) { diff --git a/src/VecSim/spaces/computer/preprocessors.h b/src/VecSim/spaces/computer/preprocessors.h index 1045299b4..ae434ea69 100644 --- a/src/VecSim/spaces/computer/preprocessors.h +++ b/src/VecSim/spaces/computer/preprocessors.h @@ -111,3 +111,134 @@ class CosinePreprocessor : public PreprocessorInterface { spaces::normalizeVector_f normalize_func; const size_t dim; }; + +template +class QuantPreprocessor : public PreprocessorInterface { +public: + QuantPreprocessor(std::shared_ptr allocator, size_t dim, size_t bits_per_dim = 8) + : PreprocessorInterface(allocator), dim(dim), bits_per_dim(bits_per_dim), + compressed_bytes_count(calculateCompressedSize(dim)) {} + + void preprocess(const void *original_blob, void *&storage_blob, void *&query_blob, + size_t processed_bytes_count, unsigned char alignment) const override { + // Case 1: Blobs are different (one might be null, or both are allocated and processed separately) + if (storage_blob != query_blob) 
{ + // Process storage blob (compress) + if (storage_blob == nullptr) { + storage_blob = this->allocator->allocate(compressed_bytes_count); + quantize(original_blob, storage_blob); + } + + // Query blob remains uncompressed + if (query_blob == nullptr) { + query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment); + memcpy(query_blob, original_blob, processed_bytes_count); + } + } else { // Case 2: Blobs are the same or both null + if (query_blob == nullptr) { + // For query, we keep the original format + query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment); + memcpy(query_blob, original_blob, processed_bytes_count); + + // For storage, we compress + storage_blob = this->allocator->allocate(compressed_bytes_count); + quantize(original_blob, storage_blob); + } else { + // If both point to the same memory, we need to separate them + void* new_storage = this->allocator->allocate(compressed_bytes_count); + quantize(query_blob, new_storage); + storage_blob = new_storage; + } + } + } + + void preprocessForStorage(const void *original_blob, void *&blob, + size_t processed_bytes_count) const override { + if (blob == nullptr) { + blob = this->allocator->allocate(compressed_bytes_count); + quantize(original_blob, blob); + } else { + // If blob is already allocated, we need to compress in-place + void* temp = this->allocator->allocate(compressed_bytes_count); + quantize(blob, temp); + this->allocator->free_allocation(blob); + blob = temp; + } + } + + void preprocessQuery(const void *original_blob, void *&blob, size_t processed_bytes_count, + unsigned char alignment) const override { + // For query, we keep the original format + if (blob == nullptr) { + blob = this->allocator->allocate_aligned(processed_bytes_count, alignment); + memcpy(blob, original_blob, processed_bytes_count); + } + } + + void preprocessQueryInPlace(void *blob, size_t processed_bytes_count, + unsigned char alignment) const override { + // No compression 
for query vectors + assert(blob); + } + + void preprocessStorageInPlace(void *blob, size_t processed_bytes_count) const override { + assert(blob); + // Create temporary storage for compressed data + void* temp = this->allocator->allocate(compressed_bytes_count); + quantize(blob, temp); + + // Copy compressed data back to original location + // Note: This assumes blob has enough space for the compressed data + memcpy(blob, temp, compressed_bytes_count); + this->allocator->free_allocation(temp); + } + +private: + const size_t dim; + const size_t bits_per_dim; + const size_t compressed_bytes_count; + + // Calculate the size needed for the compressed vector + static size_t calculateCompressedSize(size_t dim) { + // Quantized values (int8 per dimension) + min (float32) + delta (float32) + return dim * sizeof(int8_t) + 2 * sizeof(float); + } + + // Quantize the vector from original format to compressed format + void quantize(const void *src, void *dst) const { + const DataType* src_data = static_cast(src); + + // Find min and max values in the vector + DataType min_val = src_data[0]; + DataType max_val = src_data[0]; + + for (size_t i = 0; i < dim; i++) { + DataType val = src_data[i]; + min_val = val < min_val ? val : min_val; + max_val = val > max_val ? 
val : max_val; + } + + // Calculate delta (quantization step) + float delta = (max_val - min_val) / 255.0f; + if (delta == 0){ + delta = 1.0f; // Avoid division by zero if all values are the same + } + + // Structure of compressed data: + // [quantized values (int8_t * dim)][min_val (float)][delta (float)] + int8_t* quant_values = static_cast(dst); // convert to int8_t pointer + float* params = reinterpret_cast(quant_values + dim); // convert to float pointer starting after quantized values + + // Store min and delta values for dequantization + params[0] = static_cast(min_val); + params[1] = delta; + + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (src_data[i] - min_val) / delta; + if (normalized < 0) normalized = 0; + if (normalized > 255) normalized = 255; + quant_values[i] = static_cast(normalized); + } + } +}; diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp index 7033a7c70..d0e5b6fbe 100644 --- a/src/VecSim/spaces/functions/AVX.cpp +++ b/src/VecSim/spaces/functions/AVX.cpp @@ -11,6 +11,7 @@ #include "VecSim/spaces/L2/L2_AVX_FP32.h" #include "VecSim/spaces/L2/L2_AVX_FP64.h" +#include "VecSim/spaces/IP/IP_AVX_SQ8.h" #include "VecSim/spaces/IP/IP_AVX_FP32.h" #include "VecSim/spaces/IP/IP_AVX_FP64.h" @@ -18,6 +19,12 @@ namespace spaces { #include "implementation_chooser.h" +dist_func_t Choose_SQ8_IP_implementation_AVX(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX); + return ret_dist_func; +} + dist_func_t Choose_FP32_IP_implementation_AVX(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX); diff --git a/src/VecSim/spaces/functions/AVX.h b/src/VecSim/spaces/functions/AVX.h index 16b1e4f85..7f2c38b1f 100644 --- a/src/VecSim/spaces/functions/AVX.h +++ b/src/VecSim/spaces/functions/AVX.h @@ -12,6 +12,7 @@ namespace spaces { +dist_func_t 
Choose_SQ8_IP_implementation_AVX(size_t dim); dist_func_t Choose_FP32_IP_implementation_AVX(size_t dim); dist_func_t Choose_FP64_IP_implementation_AVX(size_t dim); diff --git a/src/VecSim/spaces/functions/AVX2.cpp b/src/VecSim/spaces/functions/AVX2.cpp index bd1997a23..5e0bde6c8 100644 --- a/src/VecSim/spaces/functions/AVX2.cpp +++ b/src/VecSim/spaces/functions/AVX2.cpp @@ -10,6 +10,7 @@ #include "VecSim/spaces/IP/IP_AVX2_BF16.h" #include "VecSim/spaces/L2/L2_AVX2_BF16.h" +#include "VecSim/spaces/IP/IP_AVX2_SQ8.h" namespace spaces { @@ -27,6 +28,12 @@ dist_func_t Choose_BF16_L2_implementation_AVX2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_IP_implementation_AVX2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX2.h b/src/VecSim/spaces/functions/AVX2.h index 8ad04a8a5..06b0269de 100644 --- a/src/VecSim/spaces/functions/AVX2.h +++ b/src/VecSim/spaces/functions/AVX2.h @@ -14,5 +14,6 @@ namespace spaces { dist_func_t Choose_BF16_IP_implementation_AVX2(size_t dim); dist_func_t Choose_BF16_L2_implementation_AVX2(size_t dim); +dist_func_t Choose_SQ8_IP_implementation_AVX2(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 9ef8e0efd..ffa62375d 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -14,10 +14,13 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h" + namespace spaces { #include "implementation_chooser.h" + dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; 
CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI); @@ -54,6 +57,17 @@ dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t return ret_dist_func; } +dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI); + return ret_dist_func; +} +dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_CosineSIMD64_AVX512_BW_VL_VNNI); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 384e2549b..b6760eca9 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -12,6 +12,8 @@ namespace spaces { + + dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); @@ -20,4 +22,8 @@ dist_func_t Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) dist_func_t Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); + + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SSE.cpp b/src/VecSim/spaces/functions/SSE.cpp index 8962306db..dd218d957 100644 --- a/src/VecSim/spaces/functions/SSE.cpp +++ b/src/VecSim/spaces/functions/SSE.cpp @@ -13,11 +13,24 @@ #include "VecSim/spaces/IP/IP_SSE_FP32.h" #include "VecSim/spaces/IP/IP_SSE_FP64.h" +#include 
"VecSim/spaces/IP/IP_SSE_SQ8.h" namespace spaces { #include "implementation_chooser.h" +dist_func_t Choose_SQ8_IP_implementation_SSE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_SSE); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_Cosine_implementation_SSE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_SSE); + return ret_dist_func; +} + dist_func_t Choose_FP32_IP_implementation_SSE(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_SSE); diff --git a/src/VecSim/spaces/functions/SSE.h b/src/VecSim/spaces/functions/SSE.h index ab09de7d6..a86921a9c 100644 --- a/src/VecSim/spaces/functions/SSE.h +++ b/src/VecSim/spaces/functions/SSE.h @@ -12,6 +12,8 @@ namespace spaces { +dist_func_t Choose_SQ8_IP_implementation_SSE(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_SSE(size_t dim); dist_func_t Choose_FP32_IP_implementation_SSE(size_t dim); dist_func_t Choose_FP64_IP_implementation_SSE(size_t dim); diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index fd80512b1..39098bd8c 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -22,6 +22,7 @@ #include "VecSim/spaces/L2/L2_SVE_UINT8.h" #include "VecSim/spaces/IP/IP_SVE_UINT8.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8.h" namespace spaces { @@ -96,6 +97,18 @@ dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + #include 
"implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index a98613449..86f7a7094 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -29,4 +29,7 @@ dist_func_t Choose_UINT8_L2_implementation_SVE(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim); dist_func_t Choose_UINT8_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 4758150d0..52ba020a4 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -20,6 +20,7 @@ #include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE namespace spaces { @@ -94,6 +95,18 @@ dist_func_t Choose_UINT8_Cosine_implementation_SVE2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 248ca710b..cd3570caf 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -29,4 +29,7 @@ dist_func_t 
Choose_UINT8_L2_implementation_SVE2(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_SVE2(size_t dim); dist_func_t Choose_UINT8_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim); + } // namespace spaces diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index a5c9e7257..8a207228a 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -38,7 +38,7 @@ endif() # Spaces benchmarks # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8) +set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8) foreach(data_type IN LISTS DATA_TYPE) add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp) target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 78584130e..76389ad89 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -15,6 +15,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo spaces_fp16 echo spaces_int8 echo spaces_uint8 + echo spaces_sq8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo basics_single_fp32 @@ -25,6 +26,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_fp16 echo spaces_int8 echo spaces_uint8 + echo spaces_sq8 # Basic benchmarks @@ -91,6 +93,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_bf16 echo spaces_int8 echo spaces_uint8 + echo spaces_sq8 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then echo spaces_fp32 @@ -104,4 +107,6 @@ elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then echo spaces_int8 elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then echo spaces_uint8 +elif [ "$BM_TYPE" = "bm-spaces-sq8" ] ; then + echo spaces_sq8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp 
b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp new file mode 100644 index 000000000..197765e85 --- /dev/null +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). +*/ +#include "bm_spaces.h" +#include "utils/tests_utils.h" + +class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { + protected: + std::mt19937 rng; + size_t dim; + float *v1; + uint8_t *v2; + + public: + BM_VecSimSpaces_SQ8() { rng.seed(47); } + ~BM_VecSimSpaces_SQ8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + v1 = new float[dim]; + test_utils::populate_float_vec(v1, dim, 123); + // Allocate dim quantized bytes plus three trailing floats (min_val, delta, inv_norm) + v2 = new uint8_t[dim + sizeof(float) * 3]; + test_utils::populate_float_vec_to_sq8(v2, dim, 1234); + } + void TearDown(const ::benchmark::State &state) { + delete[] v1; + delete[] v2; + } + }; + +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; + +// AVX512_F_BW_VL_VNNI functions +#ifdef OPT_AVX512_F_BW_VL_VNNI +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32, + avx512_f_bw_vl_vnni_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, +// avx512_f_bw_vl_vnni_supported); +#endif // AVX512_F_BW_VL_VNNI + +#ifdef OPT_AVX2 +// AVX2 functions +bool avx2_supported = opt.avx2; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 32, avx2_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX2, 32, +// avx2_supported); +#endif // AVX2 + +// AVX
functions +#ifdef OPT_AVX +bool avx_supported = opt.avx; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX, 32, avx_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX, 32, +// avx_supported); +#endif // AVX +// SSE functions +#ifdef OPT_SSE +bool sse_supported = opt.sse; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, SSE, 32, sse_supported); +// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE, 32, +// sse_supported); +#endif // SSE +#endif // x86_64 + +// Naive algorithms + +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16); + +// Naive + +BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 2310782f4..2cf61cea8 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #include #include @@ -305,6 +305,177 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) { ASSERT_NEAR(dist, 0.0, 0.000001); } +void common_ip_sq8(bool should_normalize, float expected_dist) { + + size_t dim = 5; + + // Create original vectors + float v1_orig[dim], v2_orig[dim]; + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i + 1.5); + } + + // Create SQ8 compressed version of v2 + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + if (should_normalize) { + spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); + } + + // Find min and max for quantization + float min_val = v2_orig[0]; + float max_val = v2_orig[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v2_orig[i]); + max_val = std::max(max_val, v2_orig[i]); + } + + // Calculate delta and inverse norm + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + std::vector v2_compressed(compressed_size); + + // Quantize v2 + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + float *params = reinterpret_cast(quant_values + dim); + + // Store parameters + params[0] = min_val; + params[1] = delta; + + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (v2_orig[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + } + + float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim); + + // Since we're comparing identical vectors, the inner product distance should be close to + // expected + ASSERT_NEAR(dist, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance"; +} + +/* ======================== Tests SQ8 ========================= */ +TEST_F(SpacesTest, 
SQ8_ip_no_optimization_func_test) { + float expected_dist = -70.2147f; // Expected distance for identical vectors + common_ip_sq8(false, expected_dist); +} + +TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, 0.0f); } + +TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { + // create a vector with extra space for the norm + size_t dim = 5; + + // Create original vectors + float v1_orig[dim], v2_orig[dim]; + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i + 1.5); + } + + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + spaces::GetNormalizeFunc()(v1_orig, dim); + // Find min and max for quantization + float min_val = v2_orig[0]; + float max_val = v2_orig[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v2_orig[i]); + max_val = std::max(max_val, v2_orig[i]); + } + // Calculate delta and inverse norm + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + // Compress v2 + std::vector v2_compressed(compressed_size); + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + float *params = reinterpret_cast(quant_values + dim); + + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (v2_orig[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + } + // Calculate inverse norm from decompressed values + float inv_norm = 0.0f; + for (size_t i = 0; i < dim; i++) { + float decompressed_value = min_val + quant_values[i] * delta; + inv_norm += decompressed_value * decompressed_value; + } + inv_norm = 1.0f / std::sqrt(inv_norm); + // Store parameters + params[0] = min_val; + params[1] = delta; + params[2] = inv_norm; + + float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), 
dim); + ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance"; +} +TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { + // create a vector with extra space for the norm + size_t dim = 5; + + // Create original vectors + float v1_orig[dim], v2_orig[dim]; + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i + 1.5); + } + + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + spaces::GetNormalizeFunc()(v1_orig, dim); + // Find min and max for quantization + float min_val = v2_orig[0]; + float max_val = v2_orig[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v2_orig[i]); + max_val = std::max(max_val, v2_orig[i]); + } + // Calculate delta and inverse norm + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + // Compress v2 + std::vector v2_compressed(compressed_size); + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + float *params = reinterpret_cast(quant_values + dim); + + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (v2_orig[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + } + // Calculate inverse norm from decompressed values + float inv_norm = 0.0f; + for (size_t i = 0; i < dim; i++) { + float decompressed_value = min_val + quant_values[i] * delta; + inv_norm += decompressed_value * decompressed_value; + } + inv_norm = 1.0f / std::sqrt(inv_norm); + // Store parameters + params[0] = min_val; + params[1] = delta; + params[2] = inv_norm; + std::cout << "min_val: " << min_val << ", delta: " << delta << ", inv_norm: " << inv_norm + << std::endl; + + float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim); + ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine 
failed to match expected distance"; +} + /* ======================== Test Getters ======================== */ TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) { @@ -1889,3 +2060,364 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1)); + +// Helper function to create SQ8 compressed vector +std::vector CreateSQ8CompressedVector(const float *original, size_t dim) { + // Create a copy of the original vector that we can modify + std::vector vec_copy(original, original + dim); + + // Size: dim (uint8_t) + min_val (float) + delta (float) + norm (float) + size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + std::vector compressed(compressed_size); + + // Find min and max for quantization + float min_val = vec_copy[0]; + float max_val = vec_copy[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, vec_copy[i]); + max_val = std::max(max_val, vec_copy[i]); + } + + // Calculate delta + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + // Quantize vector + uint8_t *quant_values = compressed.data(); + float norm = 0.0f; + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (vec_copy[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + norm += (quant_values[i] * delta + min_val) * (quant_values[i] * delta + min_val); + } + + float inv_norm = 1.0f / std::sqrt(norm); + // Store parameters + float *params = reinterpret_cast(quant_values + dim); + params[0] = min_val; + params[1] = delta; + params[2] = inv_norm; + + return compressed; +} + +class SQ8SpacesOptimizationTest : public testing::TestWithParam {}; + +// TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { +// auto optimization = getCpuOptimizationFeatures(); +// size_t dim = GetParam(); + +// // Create 
original vectors +// std::vector v1_orig(dim); +// std::vector v2_orig(dim); +// for (size_t i = 0; i < dim; i++) { +// v1_orig[i] = float(i + 1.5); +// v2_orig[i] = float(i * 0.75 + 1.0); +// } + +// // Create SQ8 compressed version of v2 +// std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim, false); + +// auto expected_alignment = [](size_t reg_bit_size, size_t dim) { +// size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; +// return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0; +// }; + +// dist_func_t arch_opt_func; +// float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); + +// // Test different optimizations based on CPU features +// #ifdef OPT_AVX512_F_BW_VL_VNNI +// if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && +// optimization.avx512vnni) { +// unsigned char alignment = 0; +// arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); +// ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) +// << "Unexpected distance function chosen for dim " << dim; +// ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) +// << "AVX512 with dim " << dim; +// ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; +// // Unset optimizations flag, so we'll choose the next optimization. +// optimization.avx512f = optimization.avx512bw = optimization.avx512vl = +// optimization.avx512vnni = 0; +// } +// #endif + +// // Add other optimizations as needed (SVE2, SVE, NEON, etc.) 
+ +// // Test default implementation +// unsigned char alignment = 0; +// arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); +// ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; +// ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) +// << "No optimization with dim " << dim; +// ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +// } + +TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + spaces::GetNormalizeFunc()(v1_orig.data(), dim); + // spaces::GetNormalizeFunc()(v2_orig.data(), dim); + // print v1_orig + std::cout << "v1_orig: "; + for (size_t i = 0; i < dim; i++) { + std::cout << v1_orig[i] << ", "; + } + std::cout << std::endl; + std::cout << "v2_orig: "; + for (size_t i = 0; i < dim; i++) { + std::cout << v2_orig[i] << ", "; + } + std::cout << std::endl; + + // Create SQ8 compressed version of v2 + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + // print min and delta + float *params = reinterpret_cast(v2_compressed.data() + dim); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; + return (dim % elements_in_reg == 0) ? 
elements_in_reg * sizeof(uint8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); + + // Test different optimizations based on CPU features + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } + #endif + #ifdef OPT_AVX + if (optimization.avx) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; + optimization.avx = 0; + } + #endif + #ifdef OPT_SSE + if (optimization.sse) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SSE with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; + optimization.sse = 0; + } + #endif + + + // Test default implementation + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + 
ASSERT_EQ(arch_opt_func, SQ8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +// Instantiate the test suite with dimensions to test +INSTANTIATE_TEST_SUITE_P(SQ8InnerProductTest, SQ8SpacesOptimizationTest, + testing::Range(16UL, 16 * 2UL + 1)); + +TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } + + // Normalize v1 + spaces::GetNormalizeFunc()(v1_orig.data(), dim); + spaces::GetNormalizeFunc()(v2_orig.data(), dim); + + // Create SQ8 compressed version of v2 (with normalization) + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; + return (dim % elements_in_reg == 0) ? 
elements_in_reg * sizeof(uint8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim); + + #ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE2 with dim " << dim; + // We don't align SQ8 vectors with cosine distance + // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } + #endif + #ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE with dim " << dim; + // We don't align SQ8 vectors with cosine distance + // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim; + optimization.sve = 0; + } + #endif + + // Test different optimizations based on CPU features + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX512 with dim " << dim; + // We don't align SQ8 vectors with cosine distance + // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } + #endif + + #ifdef OPT_SSE + if 
(optimization.sse) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SSE with dim " << dim; + // We don't align SQ8 vectors with cosine distance + // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim; + optimization.sse = 0; + } + #endif + + // Test default implementation + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << + dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +// Instantiate the test suite with dimensions to test +INSTANTIATE_TEST_SUITE_P(SQ8CosineTest, SQ8SpacesOptimizationTest, + testing::Range(16UL, 16 * 2UL + 1)); + +// TEST_P(SQ8SpacesOptimizationTest, SQ8_full_range_test) { +// auto optimization = getCpuOptimizationFeatures(); +// constexpr size_t dim = 512; + +// // Create vectors with full range of values +// std::vector v1(dim); +// std::vector v2(dim); + +// // v1: 0..255 followed by 255..0 +// for (size_t i = 0; i < 256; i++) { +// v1[i] = static_cast(i) / 255.0f; +// v1[256 + i] = static_cast(255 - i) / 255.0f; +// } + +// // v2: 255..0 followed by 0..255 +// for (size_t i = 0; i < 256; i++) { +// v2[i] = static_cast(255 - i) / 255.0f; +// v2[256 + i] = static_cast(i) / 255.0f; +// } + +// // Create SQ8 compressed version of v2 +// std::vector v2_compressed = CreateSQ8CompressedVector(v2.data(), dim, false); + +// // Create normalized version of v1 for cosine +// std::vector v1_norm(v1); +// spaces::GetNormalizeFunc()(v1_norm.data(), dim); + +// // Create normalized SQ8 
compressed version of v2 for cosine +// std::vector v2_compressed_norm = CreateSQ8CompressedVector(v2.data(), dim, true); + +// float baseline_l2 = SQ8_L2Sqr(v1.data(), v2_compressed.data(), dim); +// float baseline_ip = SQ8_InnerProduct(v1.data(), v2_compressed.data(), dim); +// float baseline_cosine = SQ8_Cosine(v1_norm.data(), v2_compressed_norm.data(), dim); + +// dist_func_t arch_opt_func; + +// // Test different optimizations for each metric +// #ifdef OPT_AVX512F +// if (optimization.avx512f) { +// // L2 test +// arch_opt_func = Choose_SQ8_L2_implementation_AVX512F(dim); +// ASSERT_NEAR(baseline_l2, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01) +// << "L2 AVX512 with dim " << dim; + +// // IP test +// arch_opt_func = Choose_SQ8_IP_implementation_AVX512F(dim); +// ASSERT_NEAR(baseline_ip, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01) +// << "IP AVX512 with dim " << dim; + +// // Cosine test +// arch_opt_func = Choose_SQ8_Cosine_implementation_AVX512F(dim); +// ASSERT_NEAR(baseline_cosine, arch_opt_func(v1_norm.data(), v2_compressed_norm.data(), +// dim), 0.01) +// << "Cosine AVX512 with dim " << dim; + +// optimization.avx512f = 0; +// } +// #endif + +// // Add other optimizations as needed (SVE2, SVE, NEON, etc.) 
+ + +// Instantiate the test suite with dimensions to test +INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest, + testing::Range(16UL, 16 * 2UL + 1)); + +// #endif // defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64) + +// class INT8SpacesOptimizationTest : public testing::TestWithParam {}; + +// TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { +// auto optimization = getCpuOptimizationFeatures(); +// size_t dim = GetParam(); +// int8_t v1[dim]; +// int8_t v2[dim]; +// test_utils::populate_int8_vec(v1, dim diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index a1526867b..bb041b0af 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -40,6 +40,57 @@ static void populate_uint8_vec(uint8_t *v, size_t dim, int seed = 1234) { } } +static void populate_float_vec(float *v, size_t dim, int seed = 1234) { + + std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed + std::uniform_real_distribution dis(-1.0f, 1.0f); + + for (size_t i = 0; i < dim; i++) { + v[i] = dis(gen); + } +} + +static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { + + std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed + std::uniform_real_distribution dis(-1.0f, 1.0f); + std::vector vec_copy(dim); + for (size_t i = 0; i < dim; i++) { + vec_copy[i] = dis(gen); + } + + // Find min and max for quantization + float min_val = vec_copy[0]; + float max_val = vec_copy[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, vec_copy[i]); + max_val = std::max(max_val, vec_copy[i]); + } + + // Calculate delta + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + float norm = 0.0f; + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (vec_copy[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + v[i] = 
static_cast(std::round(normalized)); + norm += (v[i] * delta + min_val) * (v[i] * delta + min_val); + } + + float inv_norm = 1.0f / std::sqrt(norm); + // Store parameters + float *params = reinterpret_cast(v + dim); + params[0] = min_val; + params[1] = delta; + params[2] = inv_norm; + +} + + template float integral_compute_norm(const datatype *vec, size_t dim) { return spaces::IntegralType_ComputeNorm(vec, dim); From af854320511463f81dc9e9e0d474840253d932ff Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:15:41 +0300 Subject: [PATCH 02/52] Change to IP_AVX512F --- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h | 176 ------------------ src/VecSim/spaces/IP/IP_AVX512F_SQ8.h | 139 ++++++++++++++ .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 8 +- tests/unit/test_spaces.cpp | 11 +- 4 files changed, 148 insertions(+), 186 deletions(-) delete mode 100644 src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_SQ8.h diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h deleted file mode 100644 index 6c001efcf..000000000 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). 
- */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include -#include - -static inline void -SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, - const __m512 &min_val_vec, const __m512 &delta_vec) { - // Load 16 float elements from pVec1 - __m512 v1 = _mm512_loadu_ps(pVec1); - - // Load 16 uint8 elements from pVec2 and convert to __m512i - __m128i v2_128 = _mm_loadu_si128((__m128i*)pVec2); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - - // Convert uint8 to float - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - - // Dequantize: (val * delta) + min_val - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - - // Compute dot product and add to sum - sum = _mm512_fmadd_ps(v1, dequantized, sum); - - // Advance pointers - pVec1 += 16; - pVec2 += 16; -} - -// Common implementation for both inner product and cosine similarity -template // 0..63 -float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { - const float *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); - const uint8_t *pEnd2 = pVec2 + dimension; - - // Get dequantization parameters from the end of pVec2 - const float min_val = *reinterpret_cast(pVec2 + dimension); - const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - - // Create broadcast vectors for SIMD operations - __m512 min_val_vec = _mm512_set1_ps(min_val); - __m512 delta_vec = _mm512_set1_ps(delta); - - // Initialize sum accumulator - __m512 sum = _mm512_setzero_ps(); - - // Deal with remainder first - if constexpr (residual) { - if constexpr (residual < 16) { - // Handle less than 16 elements - __mmask16 mask = (1U << residual) - 1; - - // Load masked float elements - __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - - // Load masked uint8 elements - __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - - // 
Dequantize - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - - // Compute dot product - sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized); - } - else if constexpr (residual == 16) { - // Handle exactly 16 elements - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } - else if constexpr (residual < 32) { - // Handle 16-31 elements: process 16 and then remainder - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - - // Process remaining elements (residual - 16) - constexpr unsigned char remaining = residual - 16; - __mmask16 mask = (1U << remaining) - 1; - - // Load masked float elements - __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - - // Load masked uint8 elements - __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - - // Dequantize - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - - // Compute dot product - sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized); - } - else if constexpr (residual == 32) { - // Handle exactly 32 elements: process two chunks of 16 - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } - else { - // Handle more than 32 elements: process chunks of 16 until less than 16 remain - constexpr size_t full_chunks = residual / 16; - for (size_t i = 0; i < full_chunks; i++) { - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } - - // Process remaining elements (residual % 16) - constexpr unsigned char remaining = residual % 16; - if constexpr (remaining > 0) { - __mmask16 mask = (1U << remaining) - 1; - - // Load masked float elements - __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - - // Load masked uint8 elements - __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - __m512 
v2_f = _mm512_cvtepi32_ps(v2_512); - - // Dequantize - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - - // Compute dot product - sum = _mm512_mask_fmadd_ps(sum, mask, v1, dequantized); - } - } - - pVec1 += residual; - pVec2 += residual; - } - - // Process remaining full chunks of 16 elements - while (pVec2 < pEnd2) { - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } - - // Horizontal sum - float result = _mm512_reduce_add_ps(sum); - - // Return 1 - result as per the pattern in other implementations - return result; -} - -template // 0..63 -float SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v, - const void *pVec2v, - size_t dimension) { - // Calculate inner product using common implementation - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); - std::cout << "result: " << ip << std::endl; - - // Return 1 - result as per the pattern in other implementations - return 1.0f - ip; -} - -template // 0..63 -float SQ8_CosineSIMD64_AVX512_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, - size_t dimension) { - // Calculate inner product using common implementation - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); - - // Get the inverse norm factor stored after min_val and delta - const uint8_t *pVec2 = static_cast(pVec2v); - const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); - std::cout << "result2: " << ip << std::endl; - // Return 1 - (ip * inv_norm) as per the pattern in other implementations - return 1.0f - ip * inv_norm; -} - diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h new file mode 100644 index 000000000..7005b7a15 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. 
+ * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include +#include + +static inline void +SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec) { + // Load 16 float elements from pVec1 + __m512 v1 = _mm512_loadu_ps(pVec1); + + // Load 16 uint8 elements from pVec2 and convert to __m512i + __m128i v2_128 = _mm_loadu_si128((__m128i*)pVec2); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + + // Convert uint8 to float + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize: (val * delta) + min_val + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product and add to sum + sum = _mm512_fmadd_ps(v1, dequantized, sum); + + // Advance pointers + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for both inner product and cosine similarity +template // 0..15 +float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) { + const float *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const uint8_t *pEnd2 = pVec2 + dimension; + + // Get dequantization parameters from the end of pVec2 + const float min_val = *reinterpret_cast(pVec2 + dimension); + const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m512 min_val_vec = _mm512_set1_ps(min_val); + __m512 delta_vec = _mm512_set1_ps(delta); + + // Initialize sum accumulator + __m512 sum = _mm512_setzero_ps(); + + // Deal with remainder first + if constexpr (residual > 0) { + // Handle less than 16 elements + __mmask16 mask = (1U << residual) - 1; + + // Load masked float elements + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); + + // 
Load masked uint8 elements + __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + + // Dequantize + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product + __m512 product = _mm512_mul_ps(v1, dequantized); + + + // Apply mask to product and add to sum + sum = _mm512_mask_add_ps(sum, mask, sum, product); + + pVec1 += residual; + pVec2 += residual; + } + + // Print and compare the residual sums + float simd_residual_sum = _mm512_reduce_add_ps(sum); + std::cout << "Residual part - SIMD sum: " << simd_residual_sum + << ", Naive sum: " << naive_sum + << ", Difference: " << std::abs(simd_residual_sum - naive_sum) << std::endl; + + // Process remaining full chunks of 16 elements + while (pVec2 <= pEnd2) { + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } + + // Horizontal sum + float result = _mm512_reduce_add_ps(sum); + + // Calculate full naive sum for comparison + float full_naive_sum = naive_sum; + const float *orig_pVec1 = static_cast(pVec1v) + residual; + const uint8_t *orig_pVec2 = static_cast(pVec2v) + residual; + for (size_t i = 0; i < dimension - residual; i++) { + float dequantized = orig_pVec2[i] * delta + min_val; + full_naive_sum += orig_pVec1[i] * dequantized; + } + + std::cout << "Full calculation - SIMD sum: " << result + << ", Naive sum: " << full_naive_sum + << ", Difference: " << std::abs(result - full_naive_sum) << std::endl; + + // Return the raw inner product result + return result; +} + +template // 0..15 +float SQ8_InnerProductSIMD16_AVX512F(const void *pVec1v, + const void *pVec2v, + size_t dimension) { + // Calculate inner product using common implementation + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + + // The inner product similarity is 1 - ip + return 1.0f - ip; +} + +template // 0..15 +float SQ8_CosineSIMD16_AVX512F(const void *pVec1v, const void 
*pVec2v, + size_t dimension) { + // Get the inverse norm factor stored after min_val and delta + const uint8_t *pVec2 = static_cast(pVec2v); + const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); + + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension, inv_norm); + + // The cosine similarity is 1 - ip + return 1.0f - ip; +} + diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index ffa62375d..d06a68d02 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -14,7 +14,7 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" -#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX512F_SQ8.h" namespace spaces { @@ -57,14 +57,14 @@ dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { +dist_func_t Choose_SQ8_IP_implementation_AVX512F(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_InnerProductSIMD64_AVX512_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F); return ret_dist_func; } dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_CosineSIMD64_AVX512_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F); return ret_dist_func; } diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 2cf61cea8..6859fe30b 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2169,9 +2169,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { v2_orig[i] = float(i * 
0.75 + 1.0); } spaces::GetNormalizeFunc()(v1_orig.data(), dim); - // spaces::GetNormalizeFunc()(v2_orig.data(), dim); // print v1_orig - std::cout << "v1_orig: "; + std::cout << "v1_normalized: "; for (size_t i = 0; i < dim; i++) { std::cout << v1_orig[i] << ", "; } @@ -2196,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); // Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { + #ifdef OPT_AVX512F + if (optimization.avx512f) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) @@ -2303,12 +2302,12 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #endif // Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI + #ifdef OPT_AVX512F if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; From b215799aaeaf61c492a01a3ca0e00ae09b898dcf Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:17:12 +0300 Subject: [PATCH 03/52] Change --- src/VecSim/spaces/IP/IP_AVX512F_SQ8.h | 28 +++------------------------ 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h index 7005b7a15..c179aa0e9 100644 --- 
a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h @@ -81,35 +81,13 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi pVec2 += residual; } - // Print and compare the residual sums - float simd_residual_sum = _mm512_reduce_add_ps(sum); - std::cout << "Residual part - SIMD sum: " << simd_residual_sum - << ", Naive sum: " << naive_sum - << ", Difference: " << std::abs(simd_residual_sum - naive_sum) << std::endl; - // Process remaining full chunks of 16 elements - while (pVec2 <= pEnd2) { + do { SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } - - // Horizontal sum - float result = _mm512_reduce_add_ps(sum); - - // Calculate full naive sum for comparison - float full_naive_sum = naive_sum; - const float *orig_pVec1 = static_cast(pVec1v) + residual; - const uint8_t *orig_pVec2 = static_cast(pVec2v) + residual; - for (size_t i = 0; i < dimension - residual; i++) { - float dequantized = orig_pVec2[i] * delta + min_val; - full_naive_sum += orig_pVec1[i] * dequantized; - } - - std::cout << "Full calculation - SIMD sum: " << result - << ", Naive sum: " << full_naive_sum - << ", Difference: " << std::abs(result - full_naive_sum) << std::endl; + } while (pVec1 < pEnd2); // Return the raw inner product result - return result; + return _mm512_reduce_add_ps(sum);; } template // 0..15 From 8b4188b01450500d3e166a3450d2652b7ac92b3e Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:18:04 +0300 Subject: [PATCH 04/52] vec1 --- src/VecSim/spaces/IP/IP_AVX512F_SQ8.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h index c179aa0e9..8c32ca6a7 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h @@ -40,7 +40,7 @@ template // 0..15 float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) { const float 
*pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - const uint8_t *pEnd2 = pVec2 + dimension; + const uint8_t *pEnd1 = pVec1 + dimension; // Get dequantization parameters from the end of pVec2 const float min_val = *reinterpret_cast(pVec2 + dimension); @@ -84,7 +84,7 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi // Process remaining full chunks of 16 elements do { SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } while (pVec1 < pEnd2); + } while (pVec1 < pEnd1); // Return the raw inner product result return _mm512_reduce_add_ps(sum);; From a1d1a162f3e3df1eda4c05dc72fe49c9dffa0060 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:18:36 +0300 Subject: [PATCH 05/52] float --- src/VecSim/spaces/IP/IP_AVX512F_SQ8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h index 8c32ca6a7..36b2d0ff4 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h @@ -40,7 +40,7 @@ template // 0..15 float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) { const float *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); - const uint8_t *pEnd1 = pVec1 + dimension; + const float *pEnd1 = pVec1 + dimension; // Get dequantization parameters from the end of pVec2 const float min_val = *reinterpret_cast(pVec2 + dimension); From b5860bbc61a6f7abc5e6565a93524b658dec1e72 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:19:58 +0300 Subject: [PATCH 06/52] finish --- src/VecSim/spaces/IP_space.cpp | 6 +++--- src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h | 4 ++-- tests/unit/test_spaces.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 497605744..fc1b18aa9 100644 --- 
a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -66,11 +66,11 @@ namespace spaces { if (dim < 16) { return ret_dist_func; } - #ifdef OPT_AVX512_F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + #ifdef OPT_AVX512F + if (features.avx512f) { if (dim % 16 == 0) // no point in aligning if we have an offsetting residual *alignment = 16 * sizeof(float); // handles 16 floats - return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + return Choose_SQ8_IP_implementation_AVX512F(dim); } #endif #ifdef OPT_AVX diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index b6760eca9..c44dfe635 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -22,8 +22,8 @@ dist_func_t Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) dist_func_t Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_IP_implementation_AVX512F(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_AVX512F(size_t dim); } // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 6859fe30b..fa840655f 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2199,7 +2199,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { if (optimization.avx512f) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F(dim)) << "Unexpected distance function chosen for dim " << dim; 
ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; From 0d07d718f35f9a196ce48b7f9523fb82239f61fa Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:20:42 +0300 Subject: [PATCH 07/52] now --- src/VecSim/spaces/IP_space.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index fc1b18aa9..07b849f0c 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -124,11 +124,11 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, if (dim < 16) { return ret_dist_func; } - #ifdef OPT_AVX512_F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + #ifdef OPT_AVX512F + if (features.avx512f) { if (dim % 16 == 0) // no point in aligning if we have an offsetting residual *alignment = 16 * sizeof(float); // handles 16 floats - return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + return Choose_SQ8_Cosine_implementation_AVX512F(dim); } #endif // #ifdef OPT_AVX From 66c49e8b3dd4b8f2eb1764c64befb3ff290b9e39 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:21:25 +0300 Subject: [PATCH 08/52] remove Choose_SQ8_Cosine_implementation_AVX512F --- src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index d06a68d02..3ce3b46ad 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -62,7 +62,7 @@ dist_func_t Choose_SQ8_IP_implementation_AVX512F(size_t dim) { CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { +dist_func_t 
Choose_SQ8_Cosine_implementation_AVX512F(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F); return ret_dist_func; From aa26c717782351b857ebec52ba02780bd5ae0bd0 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:22:49 +0300 Subject: [PATCH 09/52] in test --- tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 197765e85..e5f457d9c 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -39,8 +39,8 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI -bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32, +bool avx512_f_bw_vl_vnni_supported = opt.avx512f; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F, 32, avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported); From 43b58a8e370c10d124bcd5ac84979335532a39c1 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:23:46 +0300 Subject: [PATCH 10/52] alignemnt --- tests/unit/test_spaces.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index fa840655f..a251cb28b 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2203,7 +2203,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; - 
ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; optimization.avx512f = 0; } #endif From 1e12fa34153904c032fe28f42c809f7909a48702 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:35:13 +0300 Subject: [PATCH 11/52] back to bw --- ...512F_SQ8.h => IP_AVX512F_SQ8_BW_VL_VNNI.h} | 4 ++-- src/VecSim/spaces/IP/IP_AVX_SQ8.h | 22 +++++++++++++++++-- src/VecSim/spaces/IP_space.cpp | 12 +++++----- src/VecSim/spaces/functions/AVX512F.cpp | 1 + src/VecSim/spaces/functions/AVX512F.h | 3 +++ .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 10 ++++----- .../spaces/functions/AVX512F_BW_VL_VNNI.h | 5 ++--- 7 files changed, 39 insertions(+), 18 deletions(-) rename src/VecSim/spaces/IP/{IP_AVX512F_SQ8.h => IP_AVX512F_SQ8_BW_VL_VNNI.h} (96%) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h similarity index 96% rename from src/VecSim/spaces/IP/IP_AVX512F_SQ8.h rename to src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index 36b2d0ff4..b33b3629c 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -91,7 +91,7 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi } template // 0..15 -float SQ8_InnerProductSIMD16_AVX512F(const void *pVec1v, +float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Calculate inner product using common implementation @@ -102,7 +102,7 @@ float SQ8_InnerProductSIMD16_AVX512F(const void *pVec1v, } template // 0..15 -float SQ8_CosineSIMD16_AVX512F(const void *pVec1v, const void *pVec2v, +float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Get the inverse norm factor stored after min_val and delta const uint8_t *pVec2 = static_cast(pVec2v); diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h 
b/src/VecSim/spaces/IP/IP_AVX_SQ8.h index 38c836652..b68de3c4d 100644 --- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h @@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 } template // 0..15 -float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { float *pVect1 = (float *)pVect1v; uint8_t *quantized = (uint8_t *)pVect2v; @@ -112,5 +112,23 @@ float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_ InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); } while (pVect1 < pEnd1); - return 1.0f - my_mm256_reduce_add_ps(sum256); + return my_mm256_reduce_add_ps(sum256); +} + +float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { + return SQ8_InnerProductImp<0>(pVect1v, pVect2v, dimension); +} + +template // 0..15 +float SQ8_CosineSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { + // Get dequantization parameters from the end of quantized vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + + // For cosine, we need to account for the vector norms + // The inv_norm parameter is stored after min_val and delta in the quantized vector + return 1.0f - ip * inv_norm; } diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 07b849f0c..3ba81a92e 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -66,11 +66,11 @@ namespace spaces { if (dim < 16) { return ret_dist_func; } - #ifdef OPT_AVX512F - if (features.avx512f) { + #ifdef OPT_AVX512F_BW_VL_VNNI + if (features.avx512f && features.avx512bw 
&& features.avx512vnni) { if (dim % 16 == 0) // no point in aligning if we have an offsetting residual *alignment = 16 * sizeof(float); // handles 16 floats - return Choose_SQ8_IP_implementation_AVX512F(dim); + return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #ifdef OPT_AVX @@ -124,11 +124,11 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, if (dim < 16) { return ret_dist_func; } - #ifdef OPT_AVX512F - if (features.avx512f) { + #ifdef OPT_AVX512F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { if (dim % 16 == 0) // no point in aligning if we have an offsetting residual *alignment = 16 * sizeof(float); // handles 16 floats - return Choose_SQ8_Cosine_implementation_AVX512F(dim); + return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); } #endif // #ifdef OPT_AVX diff --git a/src/VecSim/spaces/functions/AVX512F.cpp b/src/VecSim/spaces/functions/AVX512F.cpp index bcddbea91..c9124f3b4 100644 --- a/src/VecSim/spaces/functions/AVX512F.cpp +++ b/src/VecSim/spaces/functions/AVX512F.cpp @@ -16,6 +16,7 @@ #include "VecSim/spaces/IP/IP_AVX512F_FP32.h" #include "VecSim/spaces/IP/IP_AVX512F_FP64.h" + namespace spaces { #include "implementation_chooser.h" diff --git a/src/VecSim/spaces/functions/AVX512F.h b/src/VecSim/spaces/functions/AVX512F.h index 9a9e9b48a..cce00f0f1 100644 --- a/src/VecSim/spaces/functions/AVX512F.h +++ b/src/VecSim/spaces/functions/AVX512F.h @@ -20,4 +20,7 @@ dist_func_t Choose_FP16_L2_implementation_AVX512F(size_t dim); dist_func_t Choose_FP32_L2_implementation_AVX512F(size_t dim); dist_func_t Choose_FP64_L2_implementation_AVX512F(size_t dim); +dist_func_t Choose_SQ8_IP_implementation_AVX512F(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_AVX512F(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 3ce3b46ad..76809f6b5 100644 --- 
a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -14,7 +14,7 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" -#include "VecSim/spaces/IP/IP_AVX512F_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h" namespace spaces { @@ -57,14 +57,14 @@ dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_AVX512F(size_t dim) { +dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } -dist_func_t Choose_SQ8_Cosine_implementation_AVX512F(size_t dim) { +dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index c44dfe635..e2d587ef0 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -22,8 +22,7 @@ dist_func_t Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) dist_func_t Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_AVX512F(size_t dim); -dist_func_t Choose_SQ8_Cosine_implementation_AVX512F(size_t dim); - +dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // 
namespace spaces From 984a0305adf2af668f8df2fd2b817d8fb83f026e Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:38:49 +0300 Subject: [PATCH 12/52] back again --- src/VecSim/spaces/IP/IP_AVX_SQ8.h | 3 ++- tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 5 +++-- tests/unit/test_spaces.cpp | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h index b68de3c4d..c917f7787 100644 --- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h @@ -115,8 +115,9 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen return my_mm256_reduce_add_ps(sum256); } +template // 0..15 float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { - return SQ8_InnerProductImp<0>(pVect1v, pVect2v, dimension); + return SQ8_InnerProductImp(pVect1v, pVect2v, dimension); } template // 0..15 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index e5f457d9c..cbf0b7e5b 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -39,8 +39,9 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI -bool avx512_f_bw_vl_vnni_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F, 32, +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && + opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32, avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index a251cb28b..7d4ddfdd0 100644 --- a/tests/unit/test_spaces.cpp +++ 
b/tests/unit/test_spaces.cpp @@ -2199,7 +2199,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { if (optimization.avx512f) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F(dim)) + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; @@ -2307,7 +2307,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F(dim)) + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; From c3670a85a497123c0f448fb8f6f676b0ceac0d8b Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:41:46 +0300 Subject: [PATCH 13/52] again --- tests/unit/test_spaces.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 7d4ddfdd0..7ad503c0f 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2195,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); // Test different optimizations based on CPU features - #ifdef OPT_AVX512F - if (optimization.avx512f) { + #ifdef OPT_AVX512F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, 
Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) @@ -2303,8 +2303,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // Test different optimizations based on CPU features #ifdef OPT_AVX512F - if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && - optimization.avx512vnni) { + if (features.avx512f && features.avx512bw && features.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) From 11303b7e89e43f344051b513c0ab2ad52cfc591b Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:42:49 +0300 Subject: [PATCH 14/52] optimization --- tests/unit/test_spaces.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 7ad503c0f..8994db979 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2196,7 +2196,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Test different optimizations based on CPU features #ifdef OPT_AVX512F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vnni) { + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) @@ -2303,7 +2303,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // Test different optimizations based on CPU features #ifdef OPT_AVX512F - if (features.avx512f && features.avx512bw && features.avx512vnni) { + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) From 7474c059513c8756212e23313f51069b206c2342 
Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:43:53 +0300 Subject: [PATCH 15/52] more BW --- tests/unit/test_spaces.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 8994db979..aaa1c7ef8 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2302,7 +2302,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #endif // Test different optimizations based on CPU features - #ifdef OPT_AVX512F + #ifdef OPT_AVX512F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); From 2cfd9b699547b83869b670f73b6f7e005f8fd986 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:55:11 +0300 Subject: [PATCH 16/52] fix avx --- src/VecSim/spaces/IP/IP_AVX_SQ8.h | 66 +++++++++---------------------- 1 file changed, 19 insertions(+), 47 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h index c917f7787..2fbd4401f 100644 --- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h @@ -9,7 +9,7 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256, +static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, const __m256 &min_val_vec, const __m256 &delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); @@ -34,58 +34,30 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 template // 0..15 float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *quantized = (uint8_t *)pVect2v; - - // Get dequantization parameters from the end of quantized vector - float min = 
*(float *)(quantized + dimension); - float delta = *(float *)(quantized + dimension + sizeof(float)); + const float *pVect1 = static_cast(pVect1v); + // pVect2 is a quantized uint8_t vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); // Create broadcast vectors for SIMD operations - __m256 min_val_vec = _mm256_set1_ps(min); + __m256 min_val_vec = _mm256_set1_ps(min_val); __m256 delta_vec = _mm256_set1_ps(delta); - const float *pEnd1 = pVect1 + dimension; - __m256 sum256 = _mm256_setzero_ps(); - // Deal with 1-7 floats with mask loading, if needed + // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one + // 16-float block, so mask loading is guaranteed to be safe. if constexpr (residual % 8) { __mmask8 constexpr mask = (1 << (residual % 8)) - 1; - - // Load masked float elements __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - // Load masked uint8 elements - __m128i v2_128; - if constexpr (residual % 8 <= 4) { - // Load 4 or fewer bytes directly using unaligned loads and shifts - uint32_t temp = 0; - // Direct byte-by-byte loading to avoid memcpy - switch (residual % 8) { - case 4: temp |= (uint32_t)quantized[3] << 24; - case 3: temp |= (uint32_t)quantized[2] << 16; - case 2: temp |= (uint32_t)quantized[1] << 8; - case 1: temp |= quantized[0]; - } - v2_128 = _mm_cvtsi32_si128(temp); - } else { - // Load 5-7 bytes directly using unaligned loads and shifts - uint64_t temp = 0; - // Direct byte-by-byte loading to avoid memcpy - switch (residual % 8) { - case 7: temp |= (uint64_t)quantized[6] << 48; - case 6: temp |= (uint64_t)quantized[5] << 40; - case 5: temp |= (uint64_t)quantized[4] << 32; - case 4: temp |= (uint64_t)quantized[3] << 24; - case 3: temp |= 
(uint64_t)quantized[2] << 16; - case 2: temp |= (uint64_t)quantized[1] << 8; - case 1: temp |= quantized[0]; - } - v2_128 = _mm_cvtsi64_si128(temp); - } - quantized += residual % 8; + // Load quantized values and dequantize + __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + pVect2 += residual % 8; // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); @@ -93,7 +65,7 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - // Dequantize: (val * delta) + min + // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); // Compute dot product with masking @@ -102,14 +74,14 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen // If the reminder is >=8, have another step of 8 floats if constexpr (residual >= 8) { - InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); } // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. 
do { - InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); - InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); } while (pVect1 < pEnd1); return my_mm256_reduce_add_ps(sum256); @@ -117,7 +89,7 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen template // 0..15 float SQ8_InnerProductSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { - return SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + return 1.0f - SQ8_InnerProductImp(pVect1v, pVect2v, dimension); } template // 0..15 From 3cdf05ee4658c8990d44e1a7f585848b0a4b461b Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:56:25 +0300 Subject: [PATCH 17/52] add avx cosine test --- tests/unit/test_spaces.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index aaa1c7ef8..5cf9655c0 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2315,6 +2315,18 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { optimization.avx512f = 0; } #endif + #ifdef OPT_AVX + if (optimization.avx) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + // We don't align SQ8 vectors with cosine distance + // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; + optimization.avx = 0; + } #ifdef OPT_SSE if (optimization.sse) { From fc8bc7ded3d05e93951d05f24e78ab53a5e2a8d6 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 13:58:30 +0300 Subject: [PATCH 18/52] avx --- 
src/VecSim/spaces/IP_space.cpp | 14 +++++++------- src/VecSim/spaces/functions/AVX.h | 2 ++ src/VecSim/spaces/functions/AVX2.h | 1 - tests/unit/test_spaces.cpp | 1 + 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 3ba81a92e..f3d3dc07e 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -131,13 +131,13 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); } #endif - // #ifdef OPT_AVX - // if (features.avx) { - // if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - // *alignment = 8 * sizeof(float); // handles 8 floats - // return Choose_SQ8_Cosine_implementation_AVX(dim); - // } - // #endif + #ifdef OPT_AVX + if (features.avx) { + if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + *alignment = 8 * sizeof(float); // handles 8 floats + return Choose_SQ8_Cosine_implementation_AVX(dim); + } + #endif #ifdef OPT_SSE if (features.sse) { if (dim % 4 == 0) // no point in aligning if we have an offsetting residual diff --git a/src/VecSim/spaces/functions/AVX.h b/src/VecSim/spaces/functions/AVX.h index 7f2c38b1f..ccdede166 100644 --- a/src/VecSim/spaces/functions/AVX.h +++ b/src/VecSim/spaces/functions/AVX.h @@ -13,6 +13,8 @@ namespace spaces { dist_func_t Choose_SQ8_IP_implementation_AVX(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_AVX(size_t dim); + dist_func_t Choose_FP32_IP_implementation_AVX(size_t dim); dist_func_t Choose_FP64_IP_implementation_AVX(size_t dim); diff --git a/src/VecSim/spaces/functions/AVX2.h b/src/VecSim/spaces/functions/AVX2.h index 06b0269de..8ad04a8a5 100644 --- a/src/VecSim/spaces/functions/AVX2.h +++ b/src/VecSim/spaces/functions/AVX2.h @@ -14,6 +14,5 @@ namespace spaces { dist_func_t Choose_BF16_IP_implementation_AVX2(size_t dim); dist_func_t 
Choose_BF16_L2_implementation_AVX2(size_t dim); -dist_func_t Choose_SQ8_IP_implementation_AVX2(size_t dim); } // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 5cf9655c0..307bf6c5c 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2327,6 +2327,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; optimization.avx = 0; } + #endif #ifdef OPT_SSE if (optimization.sse) { From 513839b639bd77c70a46f7d925964d315e250d98 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 14:00:19 +0300 Subject: [PATCH 19/52] add impl --- src/VecSim/spaces/functions/AVX.cpp | 6 ++++++ src/VecSim/spaces/functions/AVX2.cpp | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp index d0e5b6fbe..33ef7b4dc 100644 --- a/src/VecSim/spaces/functions/AVX.cpp +++ b/src/VecSim/spaces/functions/AVX.cpp @@ -25,6 +25,12 @@ dist_func_t Choose_SQ8_IP_implementation_AVX(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_Cosine_implementation_AVX(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX); + return ret_dist_func; +} + dist_func_t Choose_FP32_IP_implementation_AVX(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX); diff --git a/src/VecSim/spaces/functions/AVX2.cpp b/src/VecSim/spaces/functions/AVX2.cpp index 5e0bde6c8..b7df68ce9 100644 --- a/src/VecSim/spaces/functions/AVX2.cpp +++ b/src/VecSim/spaces/functions/AVX2.cpp @@ -28,12 +28,6 @@ dist_func_t Choose_BF16_L2_implementation_AVX2(size_t dim) { return ret_dist_func; } -dist_func_t Choose_SQ8_IP_implementation_AVX2(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2); - return ret_dist_func; -} - #include 
"implementation_chooser_cleanup.h" } // namespace spaces From f676c1bbc8593e10ae697dc3ebe4adf41ef3eca5 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 14:14:12 +0300 Subject: [PATCH 20/52] add l2 --- .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h | 98 +++++++++++++++++++ src/VecSim/spaces/L2_space.cpp | 26 ++--- src/VecSim/spaces/L2_space.h | 4 +- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 6 ++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 1 + tests/unit/test_spaces.cpp | 86 ++++++++-------- 6 files changed, 163 insertions(+), 58 deletions(-) create mode 100644 src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h new file mode 100644 index 000000000..448388932 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+*/ +#include "VecSim/spaces/space_includes.h" + +// Helper function to perform L2 squared distance calculation for a chunk of 16 elements +static inline void +SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec, const __m512 &inv_norm_vec) { + // Load 16 float elements from pVect1 + __m512 v1 = _mm512_loadu_ps(pVect1); + + // Load 16 uint8 elements from pVect2 and convert to __m512i + __m128i v2_128 = _mm_loadu_si128((__m128i*)pVect2); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + + // Convert uint8 to float + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize: (val * delta + min_val) * inv_norm + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + dequantized = _mm512_mul_ps(dequantized, inv_norm_vec); + + // Compute difference + __m512 diff = _mm512_sub_ps(v1, dequantized); + + // Square difference and add to sum + sum = _mm512_fmadd_ps(diff, diff, sum); + + // Advance pointers + pVect1 += 16; + pVect2 += 16; +} + +template // 0..15 +float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + + // Get dequantization parameters from the end of pVect2 + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m512 min_val_vec = _mm512_set1_ps(min_val); + __m512 delta_vec = _mm512_set1_ps(delta); + __m512 inv_norm_vec = _mm512_set1_ps(inv_norm); + + // Initialize sum accumulator + __m512 sum = _mm512_setzero_ps(); + + // Handle residual elements (0 to 15) + if constexpr (residual > 0) { + // Create mask for residual elements + __mmask16 mask = (1U << residual) - 
1; + + // Load masked float elements from pVect1 + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1); + + // Load masked uint8 elements from pVect2 + __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVect2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize: (val * delta + min_val) * inv_norm + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + dequantized = _mm512_mul_ps(dequantized, inv_norm_vec); + + // Compute difference + __m512 diff = _mm512_sub_ps(v1, dequantized); + + // Square difference and add to sum (with mask) + __m512 squared = _mm512_mul_ps(diff, diff); + sum = _mm512_mask_add_ps(sum, mask, sum, squared); + + // Advance pointers + pVect1 += residual; + pVect2 += residual; + } + + // Process remaining full chunks of 16 elements + do { + SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec, inv_norm_vec); + }while (pVect1 < pEnd1); + + // Horizontal sum + float result = _mm512_reduce_add_ps(sum); + + return result; +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 488e2fe5a..ff9976fe0 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -60,19 +60,19 @@ namespace spaces { // #endif // #endif - // #ifdef CPU_FEATURES_ARCH_X86_64 - // // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. + #ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
- // if (dim < 16) { - // return ret_dist_func; - // } - // #ifdef OPT_AVX512F - // if (features.avx512f) { - // if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - // *alignment = 16 * sizeof(float); // handles 16 floats - // return Choose_SQ8_L2_implementation_AVX512F(dim); - // } - // #endif + if (dim < 16) { + return ret_dist_func; + } + #ifdef OPT_AVX512F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim); + } + #endif // #ifdef OPT_AVX // if (features.avx) { // if (dim % 8 == 0) // no point in aligning if we have an offsetting residual @@ -87,7 +87,7 @@ namespace spaces { // return Choose_SQ8_L2_implementation_SSE(dim); // } // #endif - // #endif // __x86_64__ + #endif // __x86_64__ return ret_dist_func; } diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h index 1bdd52473..a58fcd7e4 100644 --- a/src/VecSim/spaces/L2_space.h +++ b/src/VecSim/spaces/L2_space.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #pragma once #include "VecSim/spaces/spaces.h" @@ -22,4 +22,6 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t L2_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 76809f6b5..889725204 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -15,6 +15,7 @@ #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h" +#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" namespace spaces { @@ -67,6 +68,11 @@ dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t di CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } +dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} #include "implementation_chooser_cleanup.h" diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index e2d587ef0..77eff5d57 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -24,5 +24,6 @@ dist_func_t Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // namespace spaces diff --git a/tests/unit/test_spaces.cpp 
b/tests/unit/test_spaces.cpp index 307bf6c5c..664d78f25 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2106,56 +2106,54 @@ std::vector CreateSQ8CompressedVector(const float *original, size_t dim class SQ8SpacesOptimizationTest : public testing::TestWithParam {}; -// TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { -// auto optimization = getCpuOptimizationFeatures(); -// size_t dim = GetParam(); +TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); -// // Create original vectors -// std::vector v1_orig(dim); -// std::vector v2_orig(dim); -// for (size_t i = 0; i < dim; i++) { -// v1_orig[i] = float(i + 1.5); -// v2_orig[i] = float(i * 0.75 + 1.0); -// } + // Create original vectors + std::vector v1_orig(dim); + std::vector v2_orig(dim); + for (size_t i = 0; i < dim; i++) { + v1_orig[i] = float(i + 1.5); + v2_orig[i] = float(i * 0.75 + 1.0); + } -// // Create SQ8 compressed version of v2 -// std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim, false); + // Create SQ8 compressed version of v2 + std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); -// auto expected_alignment = [](size_t reg_bit_size, size_t dim) { -// size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; -// return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0; -// }; + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; + return (dim % elements_in_reg == 0) ? 
elements_in_reg * sizeof(uint8_t) : 0; + }; -// dist_func_t arch_opt_func; -// float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); - -// // Test different optimizations based on CPU features -// #ifdef OPT_AVX512_F_BW_VL_VNNI -// if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && -// optimization.avx512vnni) { -// unsigned char alignment = 0; -// arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); -// ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) -// << "Unexpected distance function chosen for dim " << dim; -// ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) -// << "AVX512 with dim " << dim; -// ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; -// // Unset optimizations flag, so we'll choose the next optimization. -// optimization.avx512f = optimization.avx512bw = optimization.avx512vl = -// optimization.avx512vnni = 0; -// } -// #endif + dist_func_t arch_opt_func; + float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); -// // Add other optimizations as needed (SVE2, SVE, NEON, etc.) + // Test different optimizations based on CPU features + #ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. 
+ optimization.avx512f = 0; + } + #endif -// // Test default implementation -// unsigned char alignment = 0; -// arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); -// ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; -// ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) -// << "No optimization with dim " << dim; -// ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; -// } + // Add other optimizations as needed (SVE2, SVE, NEON, etc.) + + // Test default implementation + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { auto optimization = getCpuOptimizationFeatures(); From 9a899ccf6ce85a8b966c9da12fd5ae8591a5ec70 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 14:23:44 +0300 Subject: [PATCH 21/52] replace OPT_AVX512_F_BW_VL_VNNI --- src/VecSim/spaces/IP_space.cpp | 4 ++-- src/VecSim/spaces/L2_space.cpp | 2 +- tests/unit/test_spaces.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index f3d3dc07e..52aa5760f 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -66,7 +66,7 @@ namespace spaces { if (dim < 16) { return ret_dist_func; } - #ifdef OPT_AVX512F_BW_VL_VNNI + #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { if (dim % 16 == 0) // no point in aligning if we have an offsetting residual *alignment = 16 * sizeof(float); // handles 16 floats @@ -124,7 +124,7 @@ dist_func_t 
Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, if (dim < 16) { return ret_dist_func; } - #ifdef OPT_AVX512F_BW_VL_VNNI + #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { if (dim % 16 == 0) // no point in aligning if we have an offsetting residual *alignment = 16 * sizeof(float); // handles 16 floats diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index ff9976fe0..4febd8057 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -66,7 +66,7 @@ namespace spaces { if (dim < 16) { return ret_dist_func; } - #ifdef OPT_AVX512F_BW_VL_VNNI + #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { if (dim % 16 == 0) // no point in aligning if we have an offsetting residual *alignment = 16 * sizeof(float); // handles 16 floats diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 664d78f25..0eb70aa16 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2193,7 +2193,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); // Test different optimizations based on CPU features - #ifdef OPT_AVX512F_BW_VL_VNNI + #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2300,7 +2300,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { #endif // Test different optimizations based on CPU features - #ifdef OPT_AVX512F_BW_VL_VNNI + #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); From 4fa53277add948335d7e33fac313c162f1b3ee01 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 
2025 14:24:37 +0300 Subject: [PATCH 22/52] align --- tests/unit/test_spaces.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 0eb70aa16..8e54f8a2a 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2138,7 +2138,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; - ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. optimization.avx512f = 0; } From 1379d6d260f789fae3009ea3c62d4debd08b8c98 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 15:59:52 +0300 Subject: [PATCH 23/52] Fix avx --- src/VecSim/spaces/IP/IP_AVX_SQ8.h | 2 + src/VecSim/spaces/L2/L2.cpp | 10 +- .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h | 8 +- src/VecSim/spaces/L2/L2_AVX_SQ8.h | 99 ++++++++++++++++--- src/VecSim/spaces/L2_space.cpp | 14 +-- src/VecSim/spaces/functions/AVX.cpp | 7 ++ src/VecSim/spaces/functions/AVX.h | 1 + tests/unit/test_spaces.cpp | 18 +++- 8 files changed, 125 insertions(+), 34 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h index 2fbd4401f..d28a13a4f 100644 --- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h @@ -67,6 +67,8 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); + v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); + // Compute dot product with masking sum256 = _mm256_mul_ps(v1, v2_dequant); diff --git a/src/VecSim/spaces/L2/L2.cpp 
b/src/VecSim/spaces/L2/L2.cpp index 08ea8674c..85e78edb2 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -10,6 +10,7 @@ #include "VecSim/types/bfloat16.h" #include "VecSim/types/float16.h" #include +#include using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; @@ -22,14 +23,17 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { // The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); float res = 0; for (size_t i = 0; i < dimension; i++) { - auto dequantized_normalized_V2 = (pVect2[i] * delta + min_val) * inv_norm; - float t = pVect1[i] - dequantized_normalized_V2; + auto dequantized_V2 = (pVect2[i] * delta + min_val); + std::cout << dequantized_V2 << " "; + float t = pVect1[i] - dequantized_V2; res += t * t; } + // The last value is used to normalize the vector. + // The normalization is done by multiplying the result by the inverse of the norm. 
+ std::cout << std::endl; return res; } diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h index 448388932..c3d06d1a3 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h @@ -11,7 +11,7 @@ // Helper function to perform L2 squared distance calculation for a chunk of 16 elements static inline void SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum, - const __m512 &min_val_vec, const __m512 &delta_vec, const __m512 &inv_norm_vec) { + const __m512 &min_val_vec, const __m512 &delta_vec) { // Load 16 float elements from pVect1 __m512 v1 = _mm512_loadu_ps(pVect1); @@ -24,7 +24,6 @@ SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum, // Dequantize: (val * delta + min_val) * inv_norm __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - dequantized = _mm512_mul_ps(dequantized, inv_norm_vec); // Compute difference __m512 diff = _mm512_sub_ps(v1, dequantized); @@ -47,12 +46,10 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 // Get dequantization parameters from the end of pVect2 const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); // Create broadcast vectors for SIMD operations __m512 min_val_vec = _mm512_set1_ps(min_val); __m512 delta_vec = _mm512_set1_ps(delta); - __m512 inv_norm_vec = _mm512_set1_ps(inv_norm); // Initialize sum accumulator __m512 sum = _mm512_setzero_ps(); @@ -72,7 +69,6 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 // Dequantize: (val * delta + min_val) * inv_norm __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - dequantized = _mm512_mul_ps(dequantized, inv_norm_vec); // Compute difference __m512 diff = 
_mm512_sub_ps(v1, dequantized); @@ -88,7 +84,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 // Process remaining full chunks of 16 elements do { - SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec, inv_norm_vec); + SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); }while (pVect1 < pEnd1); // Horizontal sum diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h index e4cf82c45..715b147f3 100644 --- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h @@ -8,21 +8,48 @@ */ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" +#include -static inline void L2SqrStep(float *&pVect1, float *&pVect2, __m256 &sum) { +static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum, + const __m256 &min_val_vec, const __m256 &delta_vec) { + // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); + + // Load 8 uint8 elements from pVect2 + __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: (val * delta) + min_val + __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); + + // Compute difference + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + + // Square difference and add to sum + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + + // Advance pointers pVect1 += 8; - __m256 v2 = _mm256_loadu_ps(pVect2); pVect2 += 8; - __m256 diff = _mm256_sub_ps(v1, v2); - // sum = _mm256_fmadd_ps(diff, diff, sum); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); } template // 0..15 -float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { float *pVect1 = (float *)pVect1v; - float *pVect2 = (float *)pVect2v; 
+ uint8_t *pVect2 = (uint8_t *)pVect2v; + float *pVect1_debug = (float *)pVect1v; + uint8_t *pVect2_debug = (uint8_t *)pVect2v; + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min_val); + __m256 delta_vec = _mm256_set1_ps(delta); const float *pEnd1 = pVect1 + dimension; @@ -30,25 +57,65 @@ float FP32_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dime // Deal with 1-7 floats with mask loading, if needed if constexpr (residual % 8) { - __mmask8 constexpr mask8 = (1 << (residual % 8)) - 1; - __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - __m256 v2 = my_mm256_maskz_loadu_ps(pVect2); + + uint8_t temp_buf[8] = {0}; + // Manually copy elements + for (size_t i = 0; i < residual % 8; i++) { + temp_buf[i] = pVect2[i]; + } + // Load from buffer + __m128i v2_128 = _mm_loadl_epi64((__m128i*)temp_buf); pVect2 += residual % 8; - __m256 diff = _mm256_sub_ps(v1, v2); + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: (val * delta) + min_val + __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); + // print debug information + // std::cout << "v2_dequant before: "; + // for (size_t i = 0; i < 8; i++) { + // std::cout << v2_dequant[i] << " "; + // } + // std::cout << std::endl; + + v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); + // std::cout << "v2_dequant after: "; + // for (size_t i = 0; i < 8; i++) { + // std::cout << v2_dequant[i] << " "; + // } + // std::cout << std::endl; + + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + 
+ sum = _mm256_mul_ps(diff, diff); + // print sum } - // If the reminder is >=8, have another step of 8 floats + // If the reminder is >= 8, have another step of 8 floats if constexpr (residual >= 8) { - L2SqrStep(pVect1, pVect2, sum); + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); } - + float naive_sum = 0; + for (size_t i = 0; i < residual; i++) { + auto dequantized_V2 = (pVect2_debug[i] * delta + min_val); + float t = pVect1_debug[i] - dequantized_V2; + naive_sum += t * t; + } + std::cout <<"residual: " << (int)residual << " " << naive_sum << " " << my_mm256_reduce_add_ps(sum) << std::endl; + // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. do { - L2SqrStep(pVect1, pVect2, sum); - L2SqrStep(pVect1, pVect2, sum); + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); } while (pVect1 < pEnd1); return my_mm256_reduce_add_ps(sum); diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 4febd8057..d7136c82f 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -73,13 +73,13 @@ namespace spaces { return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim); } #endif - // #ifdef OPT_AVX - // if (features.avx) { - // if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - // *alignment = 8 * sizeof(float); // handles 8 floats - // return Choose_SQ8_L2_implementation_AVX(dim); - // } - // #endif + #ifdef OPT_AVX + if (features.avx) { + if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + *alignment = 8 * sizeof(float); // handles 8 floats + return Choose_SQ8_L2_implementation_AVX(dim); + } + #endif // #ifdef OPT_SSE // if (features.sse) { // if (dim % 4 == 0) // no point in aligning if we have an offsetting residual diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp index 
33ef7b4dc..75ee8bf17 100644 --- a/src/VecSim/spaces/functions/AVX.cpp +++ b/src/VecSim/spaces/functions/AVX.cpp @@ -12,6 +12,7 @@ #include "VecSim/spaces/L2/L2_AVX_FP64.h" #include "VecSim/spaces/IP/IP_AVX_SQ8.h" +#include "VecSim/spaces/L2/L2_AVX_SQ8.h" #include "VecSim/spaces/IP/IP_AVX_FP32.h" #include "VecSim/spaces/IP/IP_AVX_FP64.h" @@ -31,6 +32,12 @@ dist_func_t Choose_SQ8_Cosine_implementation_AVX(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_L2_implementation_AVX(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX); + return ret_dist_func; +} + dist_func_t Choose_FP32_IP_implementation_AVX(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX); diff --git a/src/VecSim/spaces/functions/AVX.h b/src/VecSim/spaces/functions/AVX.h index ccdede166..416c8d5f8 100644 --- a/src/VecSim/spaces/functions/AVX.h +++ b/src/VecSim/spaces/functions/AVX.h @@ -14,6 +14,7 @@ namespace spaces { dist_func_t Choose_SQ8_IP_implementation_AVX(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_AVX(size_t dim); +dist_func_t Choose_SQ8_L2_implementation_AVX(size_t dim); dist_func_t Choose_FP32_IP_implementation_AVX(size_t dim); dist_func_t Choose_FP64_IP_implementation_AVX(size_t dim); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 8e54f8a2a..2477a5fc9 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -435,6 +435,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); // Find min and max for quantization float min_val = v2_orig[0]; float max_val = v2_orig[0]; @@ -2117,7 +2118,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { v1_orig[i] = float(i + 1.5); 
v2_orig[i] = float(i * 0.75 + 1.0); } - + // Create SQ8 compressed version of v2 std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); @@ -2128,7 +2129,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { dist_func_t arch_opt_func; float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); - + std::cout << "baseline: " << baseline << std::endl; // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { @@ -2143,6 +2144,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { optimization.avx512f = 0; } #endif + #ifdef OPT_AVX + if (optimization.avx) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; + // Unset avx flag as well, so we'll choose the next optimization (SSE). + optimization.avx = 0; + } + #endif // Add other optimizations as needed (SVE2, SVE, NEON, etc.) 
From f7fdb2b25cbe5b9ee4c6d0064922541f23183afe Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 16:27:26 +0300 Subject: [PATCH 24/52] add l2 sse --- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 30 ++--- src/VecSim/spaces/L2/L2_SSE_SQ8.h | 126 ++++++++++++++++++ src/VecSim/spaces/L2_space.cpp | 14 +- src/VecSim/spaces/functions/SSE.cpp | 7 + src/VecSim/spaces/functions/SSE.h | 2 + .../spaces_benchmarks/bm_spaces_sq8.cpp | 70 +++++----- tests/unit/test_spaces.cpp | 13 ++ 7 files changed, 198 insertions(+), 64 deletions(-) create mode 100644 src/VecSim/spaces/L2/L2_SSE_SQ8.h diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 6d0dd4af7..df2f134f1 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -36,14 +36,14 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 template // 0..15 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { float *pVect1 = (float *)pVect1v; - uint8_t *quantized = (uint8_t *)pVect2v; + uint8_t *pVect2 = (uint8_t *)pVect2v; // Get dequantization parameters from the end of quantized vector - float min = *(float *)(quantized + dimension); - float delta = *(float *)(quantized + dimension + sizeof(float)); + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); // Create broadcast vectors for SIMD operations - __m256 min_val_vec = _mm256_set1_ps(min); + __m256 min_val_vec = _mm256_set1_ps(min_val); __m256 delta_vec = _mm256_set1_ps(delta); const float *pEnd1 = pVect1 + dimension; @@ -60,19 +60,8 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size pVect1 += residual % 8; // Load masked uint8 elements - __m128i v2_128; - if constexpr (residual % 8 <= 4) { - // Load 4 or fewer bytes - uint32_t temp = 0; - memcpy(&temp, quantized, residual % 8); - v2_128 = _mm_cvtsi32_si128(temp); - 
} else { - // Load 5-7 bytes - uint64_t temp = 0; - memcpy(&temp, quantized, residual % 8); - v2_128 = _mm_cvtsi64_si128(temp); - } - quantized += residual % 8; + __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + pVect2 += residual % 8; // Zero-extend uint8 to int32 (AVX2 instruction) __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); @@ -82,6 +71,7 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size // Dequantize: (val * delta) + min (using FMA) __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); // Compute dot product with masking sum256 = _mm256_mul_ps(v1, v2_dequant); @@ -89,14 +79,14 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size // If the reminder is >=8, have another step of 8 floats if constexpr (residual >= 8) { - InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); } // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. while (pVect1 < pEnd1) { - InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); - InnerProductStepSQ8(pVect1, quantized, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); } // Horizontal sum - AVX2 can use more efficient reduction diff --git a/src/VecSim/spaces/L2/L2_SSE_SQ8.h b/src/VecSim/spaces/L2/L2_SSE_SQ8.h new file mode 100644 index 000000000..89cd7db1a --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SSE_SQ8.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. 
+ * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). +*/ +#include "VecSim/spaces/space_includes.h" +#include + +static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum, + const __m128 &min_val_vec, const __m128 &delta_vec) { + // Load 4 float elements from pVect1 + __m128 v1 = _mm_loadu_ps(pVect1); + pVect1 += 4; + + // Load 4 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float*)pVect2))); + pVect2 += 4; + + // Convert int32 to float + __m128 v2_f = _mm_cvtepi32_ps(v2_i); + + // Dequantize: (val * delta) + min_val + __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec); + + // Compute difference + __m128 diff = _mm_sub_ps(v1, v2_dequant); + + // Square difference and add to sum + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); +} + +template // 0..15 +float SQ8_L2SqrSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *pVect2 = (uint8_t *)pVect2v; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m128 min_val_vec = _mm_set1_ps(min_val); + __m128 delta_vec = _mm_set1_ps(delta); + + const float *pEnd1 = pVect1 + dimension; + + __m128 sum = _mm_setzero_ps(); + + // Process residual elements if needed + if constexpr (residual) { + // Handle residual elements (1-3) + if constexpr (residual % 4) { + __m128 v1; + __m128 v2_dequant = _mm_setzero_ps(); + + if constexpr (residual % 4 == 3) { + // Load 3 floats and set the last one to 0 + v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 + v1 = _mm_loadh_pi(v1, (__m64 
*)(pVect1 + 1)); // load 2 more floats into high part + + // Dequantize first value + float dequant0 = pVect2[0] * delta + min_val; + v2_dequant = _mm_load_ss(&dequant0); + + // Dequantize next two values + float dequant_high[2] = { + pVect2[1] * delta + min_val, + pVect2[2] * delta + min_val + }; + v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high); + + } else if constexpr (residual % 4 == 2) { + // Load 2 floats and set the last two to 0 + v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1); + + // Dequantize two values + float dequant_high[2] = { + pVect2[0] * delta + min_val, + pVect2[1] * delta + min_val + }; + v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high); + + } else if constexpr (residual % 4 == 1) { + // Load 1 float and set the last three to 0 + v1 = _mm_load_ss(pVect1); + + // Dequantize one value + float dequant0 = pVect2[0] * delta + min_val; + v2_dequant = _mm_load_ss(&dequant0); + } + + pVect1 += residual % 4; + pVect2 += residual % 4; + + // Compute difference + __m128 diff = _mm_sub_ps(v1, v2_dequant); + + // Square difference and initialize sum + sum = _mm_mul_ps(diff, diff); + } + + // Process remaining blocks of 4 elements based on residual + if constexpr (residual >= 12) + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + if constexpr (residual >= 8) + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + if constexpr (residual >= 4) + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + } + + // Process 16 elements at a time (4 elements per step, 4 steps) + while (pVect1 < pEnd1) { + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + } + + // TmpRes must be 16 bytes aligned + float PORTABLE_ALIGN16 TmpRes[4]; + _mm_store_ps(TmpRes, sum); + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; +} diff --git 
a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index d7136c82f..cf142924e 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -80,13 +80,13 @@ namespace spaces { return Choose_SQ8_L2_implementation_AVX(dim); } #endif - // #ifdef OPT_SSE - // if (features.sse) { - // if (dim % 4 == 0) // no point in aligning if we have an offsetting residual - // *alignment = 4 * sizeof(float); // handles 4 floats - // return Choose_SQ8_L2_implementation_SSE(dim); - // } - // #endif + #ifdef OPT_SSE + if (features.sse) { + if (dim % 4 == 0) // no point in aligning if we have an offsetting residual + *alignment = 4 * sizeof(float); // handles 4 floats + return Choose_SQ8_L2_implementation_SSE(dim); + } + #endif #endif // __x86_64__ return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/SSE.cpp b/src/VecSim/spaces/functions/SSE.cpp index dd218d957..f08395fab 100644 --- a/src/VecSim/spaces/functions/SSE.cpp +++ b/src/VecSim/spaces/functions/SSE.cpp @@ -10,6 +10,7 @@ #include "VecSim/spaces/L2/L2_SSE_FP32.h" #include "VecSim/spaces/L2/L2_SSE_FP64.h" +#include "VecSim/spaces/L2/L2_SSE_SQ8.h" #include "VecSim/spaces/IP/IP_SSE_FP32.h" #include "VecSim/spaces/IP/IP_SSE_FP64.h" @@ -31,6 +32,12 @@ dist_func_t Choose_SQ8_Cosine_implementation_SSE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_L2_implementation_SSE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_SSE); + return ret_dist_func; +} + dist_func_t Choose_FP32_IP_implementation_SSE(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_SSE); diff --git a/src/VecSim/spaces/functions/SSE.h b/src/VecSim/spaces/functions/SSE.h index a86921a9c..d7ee3349e 100644 --- a/src/VecSim/spaces/functions/SSE.h +++ b/src/VecSim/spaces/functions/SSE.h @@ -14,6 +14,8 @@ namespace spaces { dist_func_t Choose_SQ8_IP_implementation_SSE(size_t dim); dist_func_t 
Choose_SQ8_Cosine_implementation_SSE(size_t dim); +dist_func_t Choose_SQ8_L2_implementation_SSE(size_t dim); + dist_func_t Choose_FP32_IP_implementation_SSE(size_t dim); dist_func_t Choose_FP64_IP_implementation_SSE(size_t dim); diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index cbf0b7e5b..5d7a6bb7b 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -5,69 +5,65 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #include "bm_spaces.h" #include "utils/tests_utils.h" class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { - protected: - std::mt19937 rng; - size_t dim; - float *v1; - uint8_t *v2; - - public: +protected: + std::mt19937 rng; + size_t dim; + float *v1; + uint8_t *v2; + +public: BM_VecSimSpaces_SQ8() { rng.seed(47); } - ~BM_VecSimSpaces_SQ8() = default; - - void SetUp(const ::benchmark::State &state) { - dim = state.range(0); - v1 = new float[dim]; - test_utils::populate_float_vec(v1, dim, 123); - // Allocate vector with extra space for min, delta and cosine calculations - v2 = new uint8_t[dim + sizeof(float) * 3]; - test_utils::populate_float_vec_to_sq8(v2, dim, 1234); - } - void TearDown(const ::benchmark::State &state) { - delete v1; - delete v2; - } - }; + ~BM_VecSimSpaces_SQ8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + v1 = new float[dim]; + test_utils::populate_float_vec(v1, dim, 123); + // Allocate vector with extra space for min, delta and cosine calculations + v2 = new uint8_t[dim + sizeof(float) * 3]; + test_utils::populate_float_vec_to_sq8(v2, dim, 1234); + } + void TearDown(const ::benchmark::State &state) { + delete v1; + delete v2; + } +}; #ifdef CPU_FEATURES_ARCH_X86_64 cpu_features::X86Features opt 
= cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI -bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && - opt.avx512vl && opt.avx512vnni; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 32, +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16, avx512_f_bw_vl_vnni_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, -// avx512_f_bw_vl_vnni_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16, + avx512_f_bw_vl_vnni_supported); #endif // AVX512_F_BW_VL_VNNI #ifdef AVX2 // AVX2 functions bool avx2_supported = opt.avx2; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 32, avx2_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX2, 32, -// avx2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported); #endif // AVX2 // AVX functions #ifdef OPT_AVX bool avx_supported = opt.avx; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, AVX, 32, avx_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX, 32, -// avx_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX, 16, avx_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX, 16, avx_supported); #endif // AVX // SSE functions #ifdef OPT_SSE bool sse_supported = opt.sse; -INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8, SQ8, SSE, 32, sse_supported); -// INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE, 32, -// sse_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SSE, 16, sse_supported); 
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE, 16, sse_supported); #endif // SSE #endif // x86_64 diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 2477a5fc9..d1b854073 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2157,6 +2157,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { optimization.avx = 0; } #endif + #ifdef OPT_SSE + if (optimization.sse) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SSE with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; + // Unset sse flag as well, so we'll choose the next optimization (default). + optimization.sse = 0; + } + #endif // Add other optimizations as needed (SVE2, SVE, NEON, etc.) 
From 4fa88b2a3b15526ee4327b390e1c0f9d78346529 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 16:31:59 +0300 Subject: [PATCH 25/52] Remove prints --- src/VecSim/spaces/IP/IP_SSE_SQ8.h | 5 +---- src/VecSim/spaces/L2/L2_AVX_SQ8.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h index deced094c..05b31da8d 100644 --- a/src/VecSim/spaces/IP/IP_SSE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h @@ -114,7 +114,6 @@ float SQ8_InnerProductSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_ return 1.0f - SQ8_InnerProductSIMD16_SSE_IMP(pVect1v, pVect2v, dimension); } - template // 0..15 float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) { @@ -125,9 +124,7 @@ float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dime // Compute inner product with dequantization using the common function // We need to cast away const for the inner product function, but it doesn't modify the vectors const float res = SQ8_InnerProductSIMD16_SSE_IMP(pVect1v, pVect2v, dimension); - - std::cout << "res before normalization sse: " << res << std::endl; - std::cout << "inv_norm: " << inv_norm << std::endl; + // For cosine, we need to account for the vector norms // The inv_norm parameter is stored after min_val and delta in the quantized vector return 1.0f - res * inv_norm; diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h index 715b147f3..0d21d6476 100644 --- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h @@ -109,7 +109,6 @@ float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimen float t = pVect1_debug[i] - dequantized_V2; naive_sum += t * t; } - std::cout <<"residual: " << (int)residual << " " << naive_sum << " " << my_mm256_reduce_add_ps(sum) << std::endl; // We dealt with the residual part. We are left with some multiple of 16 floats. 
// In each iteration we calculate 16 floats = 512 bits. From 4476833b4673279f8dd63da8ca3c95f3717972dc Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 11 May 2025 17:17:26 +0300 Subject: [PATCH 26/52] sve2 l2 --- src/VecSim/spaces/IP/IP.cpp | 3 - src/VecSim/spaces/L2/L2_SVE_SQ8.h | 134 +++++++++++++++++++++++++++ src/VecSim/spaces/L2_space.cpp | 42 ++++----- src/VecSim/spaces/functions/SVE.cpp | 7 ++ src/VecSim/spaces/functions/SVE.h | 1 + src/VecSim/spaces/functions/SVE2.cpp | 7 ++ src/VecSim/spaces/functions/SVE2.h | 1 + tests/unit/test_spaces.cpp | 134 +++++++++++---------------- 8 files changed, 223 insertions(+), 106 deletions(-) create mode 100644 src/VecSim/spaces/L2/L2_SVE_SQ8.h diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index a1e5cb8e7..fd666341a 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -19,14 +19,11 @@ using float16 = vecsim_types::float16; float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, float min_val, float delta, float inv_norm) { float res = 0; - std::cout << "\nQuantized values: "; for (size_t i = 0; i < dimension; i++) { float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm; std::cout << dequantized_V2 << ", "; res += pVect1v[i] * dequantized_V2; } - std::cout << "\n"; - std::cout << "res before normalization: " << res << std::endl; return res; } diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h new file mode 100644 index 000000000..e52fe5e21 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+*/ +#include "VecSim/spaces/space_includes.h" +#include + +// Helper function to perform L2 squared distance calculation for a chunk of elements +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, + svfloat32_t &sum, const svfloat32_t &min_val_vec, + const svfloat32_t &delta_vec) { + svbool_t pg = svptrue_b32(); + + // Load float elements from pVect1 + svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); + + // Load uint8 elements from pVect2, convert to int32, then to float + svbool_t pg_b8 = svptrue_b8(); + svuint8_t v2_u8 = svld1_u8(pg_b8, pVect2 + offset); + + // Convert uint8 to uint32 + svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + + // Convert uint32 to float32 + svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32); + + // Dequantize: (val * delta) + min_val + svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec); + + // Compute difference + svfloat32_t diff = svsub_f32_z(pg, v1, v2_dequant); + + // Square difference and add to sum + sum = svmla_f32_z(pg, sum, diff, diff); + + // Move to the next set of elements + offset += svcntw(); +} + +template +float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *pVect2 = (uint8_t *)pVect2v; + size_t offset = 0; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + svbool_t pg = svptrue_b32(); + svfloat32_t min_val_vec = svdup_f32(min_val); + svfloat32_t delta_vec = svdup_f32(delta); + + // Get the number of 32-bit elements per vector at runtime + uint64_t sve_word_count = svcntw(); + + // Multiple accumulators to increase instruction-level parallelism + svfloat32_t sum0 = svdup_f32(0.0f); + svfloat32_t sum1 = svdup_f32(0.0f); + svfloat32_t sum2 = 
svdup_f32(0.0f); + svfloat32_t sum3 = svdup_f32(0.0f); + + // Handle partial chunk if needed + if constexpr (partial_chunk) { + size_t remaining = dimension % sve_word_count; + if (remaining > 0) { + // Create predicate for the remaining elements + svbool_t pg_partial = svwhilelt_b32(0, remaining); + + // Load float elements from pVect1 with predicate + svfloat32_t v1 = svld1_f32(pg_partial, pVect1); + + // Load uint8 elements from pVect2 with predicate, convert to int32, then to float + svbool_t pg_b8_partial = svwhilelt_b8(0, remaining); + svuint8_t v2_u8 = svld1_u8(pg_b8_partial, pVect2); + + // Convert uint8 to uint32 + svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + + // Convert uint32 to float32 + svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); + + // Dequantize: (val * delta) + min_val + svfloat32_t v2_dequant = svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec); + + // Compute difference + svfloat32_t diff = svsub_f32_z(pg_partial, v1, v2_dequant); + + // Square difference and add to sum + sum0 = svmla_f32_z(pg_partial, sum0, diff, diff); + + // Move pointers past the partial chunk + offset += remaining; + } + } + // Handle remaining steps (0-3) + if constexpr (additional_steps > 0) { + L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); + } + if constexpr (additional_steps > 1) { + L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); + } + if constexpr (additional_steps > 2) { + L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); + } + + + // Process 4 chunks at a time in the main loop + auto chunk_size = 4 * sve_word_count; + size_t number_of_chunks = dimension / chunk_size; + + for (size_t i = 0; i < number_of_chunks; i++) { + L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, offset, sum3, 
min_val_vec, delta_vec); + } + + + // Combine the accumulators + svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); + sum = svadd_f32_z(pg, sum, sum2); + sum = svadd_f32_z(pg, sum, sum3); + + // Horizontal sum of all elements in the vector + float result = svaddv_f32(pg, sum); + + return result; +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index cf142924e..6e4086f74 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -38,31 +38,31 @@ namespace spaces { if (!alignment) { alignment = &dummy_alignment; } - + dist_func_t ret_dist_func = SQ8_L2Sqr; - + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); - // #ifdef CPU_FEATURES_ARCH_AARCH64 - // #ifdef OPT_SVE2 - // if (features.sve2) { - // return Choose_FP32_L2_implementation_SVE2(dim); - // } - // #endif - // #ifdef OPT_SVE - // if (features.sve) { - // return Choose_FP32_L2_implementation_SVE(dim); - // } - // #endif - // #ifdef OPT_NEON - // if (features.asimd) { - // return Choose_FP32_L2_implementation_NEON(dim); - // } - // #endif - // #endif - + #ifdef CPU_FEATURES_ARCH_AARCH64 + #ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_L2_implementation_SVE2(dim); + } + #endif + #ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_L2_implementation_SVE(dim); + } + #endif + #ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_L2_implementation_NEON(dim); + } + #endif + #endif + #ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
- + if (dim < 16) { return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index 39098bd8c..d4952ef38 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -23,6 +23,7 @@ #include "VecSim/spaces/L2/L2_SVE_UINT8.h" #include "VecSim/spaces/IP/IP_SVE_UINT8.h" #include "VecSim/spaces/IP/IP_SVE_SQ8.h" +#include "VecSim/spaces/L2/L2_SVE_SQ8.h" namespace spaces { @@ -109,6 +110,12 @@ dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 86f7a7094..a24dfe326 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -31,5 +31,6 @@ dist_func_t Choose_UINT8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 52ba020a4..c5f1626f9 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -21,6 +21,7 @@ #include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8.h" // SVE2 implementation is identical to SVE namespace spaces { @@ -107,6 +108,12 @@ dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim) { return ret_dist_func; } 
+dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index cd3570caf..57f1b8694 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -31,5 +31,6 @@ dist_func_t Choose_UINT8_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim); } // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index d1b854073..b24d17782 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2171,7 +2171,32 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { } #endif - // Add other optimizations as needed (SVE2, SVE, NEON, etc.) + #ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). 
+ optimization.sve2 = 0; + } + #endif + #ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } + #endif // Test default implementation unsigned char alignment = 0; @@ -2256,6 +2281,32 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { optimization.sse = 0; } #endif + #ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } + #endif + #ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). 
+ optimization.sve = 0; + } + #endif // Test default implementation @@ -2376,84 +2427,3 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } - -// Instantiate the test suite with dimensions to test -INSTANTIATE_TEST_SUITE_P(SQ8CosineTest, SQ8SpacesOptimizationTest, - testing::Range(16UL, 16 * 2UL + 1)); - -// TEST_P(SQ8SpacesOptimizationTest, SQ8_full_range_test) { -// auto optimization = getCpuOptimizationFeatures(); -// constexpr size_t dim = 512; - -// // Create vectors with full range of values -// std::vector v1(dim); -// std::vector v2(dim); - -// // v1: 0..255 followed by 255..0 -// for (size_t i = 0; i < 256; i++) { -// v1[i] = static_cast(i) / 255.0f; -// v1[256 + i] = static_cast(255 - i) / 255.0f; -// } - -// // v2: 255..0 followed by 0..255 -// for (size_t i = 0; i < 256; i++) { -// v2[i] = static_cast(255 - i) / 255.0f; -// v2[256 + i] = static_cast(i) / 255.0f; -// } - -// // Create SQ8 compressed version of v2 -// std::vector v2_compressed = CreateSQ8CompressedVector(v2.data(), dim, false); - -// // Create normalized version of v1 for cosine -// std::vector v1_norm(v1); -// spaces::GetNormalizeFunc()(v1_norm.data(), dim); - -// // Create normalized SQ8 compressed version of v2 for cosine -// std::vector v2_compressed_norm = CreateSQ8CompressedVector(v2.data(), dim, true); - -// float baseline_l2 = SQ8_L2Sqr(v1.data(), v2_compressed.data(), dim); -// float baseline_ip = SQ8_InnerProduct(v1.data(), v2_compressed.data(), dim); -// float baseline_cosine = SQ8_Cosine(v1_norm.data(), v2_compressed_norm.data(), dim); - -// dist_func_t arch_opt_func; - -// // Test different optimizations for each metric -// #ifdef OPT_AVX512F -// if (optimization.avx512f) { -// // L2 test -// arch_opt_func = Choose_SQ8_L2_implementation_AVX512F(dim); -// ASSERT_NEAR(baseline_l2, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01) -// << "L2 AVX512 with dim " << 
dim; - -// // IP test -// arch_opt_func = Choose_SQ8_IP_implementation_AVX512F(dim); -// ASSERT_NEAR(baseline_ip, arch_opt_func(v1.data(), v2_compressed.data(), dim), 0.01) -// << "IP AVX512 with dim " << dim; - -// // Cosine test -// arch_opt_func = Choose_SQ8_Cosine_implementation_AVX512F(dim); -// ASSERT_NEAR(baseline_cosine, arch_opt_func(v1_norm.data(), v2_compressed_norm.data(), -// dim), 0.01) -// << "Cosine AVX512 with dim " << dim; - -// optimization.avx512f = 0; -// } -// #endif - -// // Add other optimizations as needed (SVE2, SVE, NEON, etc.) - - -// Instantiate the test suite with dimensions to test -INSTANTIATE_TEST_SUITE_P(SQ8OptFuncs, SQ8SpacesOptimizationTest, - testing::Range(16UL, 16 * 2UL + 1)); - -// #endif // defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64) - -// class INT8SpacesOptimizationTest : public testing::TestWithParam {}; - -// TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { -// auto optimization = getCpuOptimizationFeatures(); -// size_t dim = GetParam(); -// int8_t v1[dim]; -// int8_t v2[dim]; -// test_utils::populate_int8_vec(v1, dim From 2a7477c67d1cbe14c68cc2d346a409af6ee73fad Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 09:46:08 +0300 Subject: [PATCH 27/52] add neon --- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 128 ++++++++++++++++++ src/VecSim/spaces/IP_space.cpp | 10 +- src/VecSim/spaces/L2/L2_NEON_SQ8.h | 112 +++++++++++++++ src/VecSim/spaces/functions/NEON.cpp | 20 +++ src/VecSim/spaces/functions/NEON.h | 4 + .../spaces_benchmarks/bm_spaces_sq8.cpp | 20 +++ tests/unit/test_spaces.cpp | 13 ++ 7 files changed, 302 insertions(+), 5 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_NEON_SQ8.h create mode 100644 src/VecSim/spaces/L2/L2_NEON_SQ8.h diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h new file mode 100644 index 000000000..a95f6da20 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2006-Present, Redis 
Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). +*/ +#include "VecSim/spaces/space_includes.h" +#include + +static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum, + const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { + // Load 4 float elements from pVect1 + float32x4_t v1 = vld1q_f32(pVect1); + pVect1 += 4; + + // Load 4 uint8 elements from pVect2 + uint8x8_t v2_u8 = vld1_u8(pVect2); + pVect2 += 4; + + // Convert uint8 to uint32 + uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); + + // Convert uint32 to float32 + float32x4_t v2_f = vcvtq_f32_u32(v2_u32); + + // Dequantize: (val * delta) + min_val + float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec); + + // Compute dot product and add to sum + sum = vmlaq_f32(sum, v1, v2_dequant); +} + +template // 0..15 +float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *pVect2 = (uint8_t *)pVect2v; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + float32x4_t min_val_vec = vdupq_n_f32(min_val); + float32x4_t delta_vec = vdupq_n_f32(delta); + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + + // Process 16 elements at a time in the main loop + for (size_t i = 0; i < num_of_chunks; i++) { + InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, sum1, min_val_vec, 
delta_vec); + InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, sum3, min_val_vec, delta_vec); + } + + // Handle remaining complete 4-float blocks within residual + if constexpr (residual >= 4) { + InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + } + if constexpr (residual >= 8) { + InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + } + if constexpr (residual >= 12) { + InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + } + + // Handle final residual elements (0-3 elements) + constexpr size_t final_residual = residual % 4; + if constexpr (final_residual > 0) { + float32x4_t v1 = vdupq_n_f32(0.0f); + float32x4_t v2_dequant = vdupq_n_f32(0.0f); + + if constexpr (final_residual >= 1) { + v1 = vld1q_lane_f32(pVect1, v1, 0); + float dequant0 = pVect2[0] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0); + } + if constexpr (final_residual >= 2) { + v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); + float dequant1 = pVect2[1] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1); + } + if constexpr (final_residual >= 3) { + v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); + float dequant2 = pVect2[2] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2); + } + + sum3 = vmlaq_f32(sum3, v1, v2_dequant); + } + + // Combine all four sum accumulators + float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); + + // Horizontal sum of the 4 elements in the combined NEON register + float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); + float32x2_t summed = vpadd_f32(sum_halves, sum_halves); + float sum = vget_lane_f32(summed, 0); + + return sum; +} + +template // 0..15 +float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); +} + +template // 0..15 +float 
SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect2 = static_cast(pVect2v); + + // Get quantization parameters + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Compute inner product with dequantization using the common function + const float res = SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); + + // For cosine, we need to account for the vector norms + // The inv_norm parameter is stored after min_val and delta in the quantized vector + return 1.0f - res * inv_norm; +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 52aa5760f..93609475d 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -53,11 +53,11 @@ namespace spaces { return Choose_SQ8_IP_implementation_SVE(dim); } #endif - // #ifdef OPT_NEON - // if (features.asimd) { - // return Choose_SQ8_IP_implementation_NEON(dim); - // } - // #endif + #ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_IP_implementation_NEON(dim); + } + #endif #endif diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8.h b/src/VecSim/spaces/L2/L2_NEON_SQ8.h new file mode 100644 index 000000000..617389cbb --- /dev/null +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+*/ +#include "VecSim/spaces/space_includes.h" +#include + +static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum, + const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { + // Load 4 float elements from pVect1 + float32x4_t v1 = vld1q_f32(pVect1); + pVect1 += 4; + + // Load 4 uint8 elements from pVect2 + uint8x8_t v2_u8 = vld1_u8(pVect2); + pVect2 += 4; + + // Convert uint8 to uint32 + uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); + + // Convert uint32 to float32 + float32x4_t v2_f = vcvtq_f32_u32(v2_u32); + + // Dequantize: (val * delta) + min_val + float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec); + + // Compute difference + float32x4_t diff = vsubq_f32(v1, v2_dequant); + + // Square difference and add to sum + sum = vmlaq_f32(sum, diff, diff); +} + +template // 0..15 +float SQ8_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + float *pVect1 = (float *)pVect1v; + uint8_t *pVect2 = (uint8_t *)pVect2v; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + float32x4_t min_val_vec = vdupq_n_f32(min_val); + float32x4_t delta_vec = vdupq_n_f32(delta); + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + + // Process 16 elements at a time in the main loop + for (size_t i = 0; i < num_of_chunks; i++) { + L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum3, min_val_vec, delta_vec); + } + + // Handle remaining complete 4-float blocks within residual + if 
constexpr (residual >= 4) { + L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + } + if constexpr (residual >= 8) { + L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + } + if constexpr (residual >= 12) { + L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + } + + // Handle final residual elements (0-3 elements) + constexpr size_t final_residual = residual % 4; + if constexpr (final_residual > 0) { + float32x4_t v1 = vdupq_n_f32(0.0f); + float32x4_t v2_dequant = vdupq_n_f32(0.0f); + + if constexpr (final_residual >= 1) { + v1 = vld1q_lane_f32(pVect1, v1, 0); + float dequant0 = pVect2[0] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0); + } + if constexpr (final_residual >= 2) { + v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); + float dequant1 = pVect2[1] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1); + } + if constexpr (final_residual >= 3) { + v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); + float dequant2 = pVect2[2] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2); + } + + float32x4_t diff = vsubq_f32(v1, v2_dequant); + sum3 = vmlaq_f32(sum3, diff, diff); + } + + // Combine all four sum accumulators + float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); + + // Horizontal sum of the 4 elements in the combined NEON register + float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); + float32x2_t summed = vpadd_f32(sum_halves, sum_halves); + float sum = vget_lane_f32(summed, 0); + + return sum; +} diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp index bd15c6577..debfa90c0 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -15,6 +15,8 @@ #include "VecSim/spaces/IP/IP_NEON_UINT8.h" #include "VecSim/spaces/L2/L2_NEON_FP64.h" #include "VecSim/spaces/IP/IP_NEON_FP64.h" +#include "VecSim/spaces/L2/L2_NEON_SQ8.h" +#include 
"VecSim/spaces/IP/IP_NEON_SQ8.h" namespace spaces { @@ -79,6 +81,24 @@ dist_func_t Choose_FP64_L2_implementation_NEON(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_NEON); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_NEON); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_NEON); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 7da0de6b8..4478cc149 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -26,4 +26,8 @@ dist_func_t Choose_FP32_L2_implementation_NEON(size_t dim); dist_func_t Choose_FP64_IP_implementation_NEON(size_t dim); dist_func_t Choose_FP64_L2_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); + } // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 5d7a6bb7b..03e9d5477 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -34,6 +34,26 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { } }; +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; + +// NEON implementation for ARMv8-a +#ifdef OPT_NEON +bool neon_supported = opt.asimd; // ARMv8-a always supports NEON 
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported); +#endif +// SVE implementation +#ifdef OPT_SVE +bool sve_supported = opt.sve; // Check for SVE support +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported); +#endif +// SVE2 implementation +#ifdef OPT_SVE2 +bool sve2_supported = opt.sve2; // Check for SVE2 support +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported); +#endif +#endif // AARCH64 + #ifdef CPU_FEATURES_ARCH_X86_64 cpu_features::X86Features opt = cpu_features::GetX86Info().features; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index b24d17782..6f88bff62 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2376,6 +2376,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { optimization.sve = 0; } #endif + #ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "NEON with dim " << dim; + // We don't align SQ8 vectors with cosine distance + // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim; + optimization.asimd = 0; + } + #endif // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI From b1f502c18a20aa0edd6d2cb33fe33589ae62fce4 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 10:33:11 +0300 Subject: [PATCH 28/52] fix sve --- src/VecSim/spaces/IP/IP.cpp | 2 -- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 43 +++++++++++++------------------ src/VecSim/spaces/L2/L2.cpp | 2 -- src/VecSim/spaces/L2/L2_SVE_SQ8.h | 16 +++--------- tests/unit/test_spaces.cpp | 41 +++++++++++++++++++---------- 5 files changed, 49 insertions(+), 55 deletions(-) diff --git 
a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index fd666341a..395e69dce 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -21,7 +21,6 @@ float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, s float res = 0; for (size_t i = 0; i < dimension; i++) { float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm; - std::cout << dequantized_V2 << ", "; res += pVect1v[i] * dequantized_V2; } return res; @@ -48,7 +47,6 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - std::cout << "inv_norm: " << inv_norm << std::endl; // Compute inner product with dequantization const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm); return 1.0f - res; diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index d6c0faa3d..bc80a8785 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -19,12 +19,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &of // Load float elements from pVect1 svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - // Load uint8 elements from pVect2, convert to int32, then to float - svbool_t pg_b8 = svptrue_b8(); - svuint8_t v2_u8 = svld1_u8(pg_b8, pVect2 + offset); - // Convert uint8 to uint32 - svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32); @@ -42,12 +38,12 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &of template float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { float 
*pVect1 = (float *)pVect1v; - uint8_t *quantized = (uint8_t *)pVect2v; + uint8_t *pVect2 = (uint8_t *)pVect2v; size_t offset = 0; // Get dequantization parameters from the end of quantized vector - float min = *(float *)(quantized + dimension); - float delta = *(float *)(quantized + dimension + sizeof(float)); + float min = *(float *)(pVect2 + dimension); + float delta = *(float *)(pVect2 + dimension + sizeof(float)); // Create broadcast vectors for SIMD operations svbool_t pg = svptrue_b32(); @@ -68,17 +64,15 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz size_t remaining = dimension % sve_word_count; if (remaining > 0) { // Create predicate for the remaining elements - svbool_t pg_partial = svwhilelt_b32(0, remaining); - + svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); + // Load float elements from pVect1 with predicate svfloat32_t v1 = svld1_f32(pg_partial, pVect1); - // Load uint8 elements from pVect2 with predicate, convert to int32, then to float - svbool_t pg_b8_partial = svwhilelt_b8(0, remaining); - svuint8_t v2_u8 = svld1_u8(pg_b8_partial, quantized); - - // Convert uint8 to uint32 - svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + + // load 8-bit bytes from pVect2+offset and zero-extend each into a 32-bit lane + svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit :contentReference[oaicite:0]{index=0} + // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); @@ -90,8 +84,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); // Move pointers past the partial chunk - pVect1 += remaining; - quantized += remaining; + offset += remaining; } } @@ -100,21 +93,21 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz const size_t number_of_chunks = (dimension - (partial_chunk ? 
dimension % sve_word_count : 0)) / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec); - InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec); - InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec); - InnerProductStep(pVect1, quantized, offset, sum3, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec); } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - InnerProductStep(pVect1, quantized, offset, sum0, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); } if constexpr (additional_steps > 1) { - InnerProductStep(pVect1, quantized, offset, sum1, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); } if constexpr (additional_steps > 2) { - InnerProductStep(pVect1, quantized, offset, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); } // Combine the accumulators diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 85e78edb2..a8a1f5040 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -27,13 +27,11 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { float res = 0; for (size_t i = 0; i < dimension; i++) { auto dequantized_V2 = (pVect2[i] * delta + min_val); - std::cout << dequantized_V2 << " "; float t = pVect1[i] - dequantized_V2; res += t * t; } // The last value is used to normalize the vector. // The normalization is done by multiplying the result by the inverse of the norm. 
- std::cout << std::endl; return res; } diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h index e52fe5e21..2cfdb15ad 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h @@ -10,7 +10,7 @@ #include // Helper function to perform L2 squared distance calculation for a chunk of elements -static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, +static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, size_t &offset, svfloat32_t &sum, const svfloat32_t &min_val_vec, const svfloat32_t &delta_vec) { svbool_t pg = svptrue_b32(); @@ -18,12 +18,8 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_ // Load float elements from pVect1 svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - // Load uint8 elements from pVect2, convert to int32, then to float - svbool_t pg_b8 = svptrue_b8(); - svuint8_t v2_u8 = svld1_u8(pg_b8, pVect2 + offset); - // Convert uint8 to uint32 - svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32); @@ -70,17 +66,13 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi size_t remaining = dimension % sve_word_count; if (remaining > 0) { // Create predicate for the remaining elements - svbool_t pg_partial = svwhilelt_b32(0, remaining); + svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); // Load float elements from pVect1 with predicate svfloat32_t v1 = svld1_f32(pg_partial, pVect1); // Load uint8 elements from pVect2 with predicate, convert to int32, then to float - svbool_t pg_b8_partial = svwhilelt_b8(0, remaining); - svuint8_t v2_u8 = svld1_u8(pg_b8_partial, pVect2); - - // Convert uint8 to uint32 - svuint32_t v2_u32 = svzext_u32(svreinterpret_u32_u8(v2_u8)); + svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); // 
Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 6f88bff62..a977be3b0 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -470,8 +470,6 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { params[0] = min_val; params[1] = delta; params[2] = inv_norm; - std::cout << "min_val: " << min_val << ", delta: " << delta << ", inv_norm: " << inv_norm - << std::endl; float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim); ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance"; @@ -2129,7 +2127,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { dist_func_t arch_opt_func; float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); - std::cout << "baseline: " << baseline << std::endl; // Test different optimizations based on CPU features #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { @@ -2197,6 +2194,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { optimization.sve = 0; } #endif + #ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "NEON with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. 
+ optimization.asimd = 0; + } + #endif + // Test default implementation unsigned char alignment = 0; @@ -2219,17 +2230,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { v2_orig[i] = float(i * 0.75 + 1.0); } spaces::GetNormalizeFunc()(v1_orig.data(), dim); - // print v1_orig - std::cout << "v1_normalized: "; - for (size_t i = 0; i < dim; i++) { - std::cout << v1_orig[i] << ", "; - } - std::cout << std::endl; - std::cout << "v2_orig: "; - for (size_t i = 0; i < dim; i++) { - std::cout << v2_orig[i] << ", "; - } - std::cout << std::endl; // Create SQ8 compressed version of v2 std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); @@ -2307,6 +2307,19 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { optimization.sve = 0; } #endif + #ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "NEON with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. 
+ optimization.asimd = 0; + } + #endif // Test default implementation From dc154b5fa4ea49fbad29160bf1be6f5925d59b0d Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 10:42:33 +0300 Subject: [PATCH 29/52] add sq8 cosine test --- tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 03e9d5477..ddee91c49 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -41,16 +41,19 @@ cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; #ifdef OPT_NEON bool neon_supported = opt.asimd; // ARMv8-a always supports NEON INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported); #endif // SVE implementation #ifdef OPT_SVE bool sve_supported = opt.sve; // Check for SVE support INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported); #endif // SVE2 implementation #ifdef OPT_SVE2 bool sve2_supported = opt.sve2; // Check for SVE2 support INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported); #endif #endif // AARCH64 From 25a9400ba6593b6c92aae2672aa553402c8780fc Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 13:34:31 +0300 Subject: [PATCH 30/52] test utils --- src/VecSim/spaces/IP/IP.cpp | 7 +- src/VecSim/spaces/IP/IP.h | 8 +- src/VecSim/spaces/L2/L2.cpp | 2 - src/VecSim/spaces/L2/L2.h | 1 + src/VecSim/spaces/L2/L2_AVX_SQ8.h | 25 +--- src/VecSim/spaces/L2_space.h | 2 +- src/VecSim/spaces/computer/preprocessors.h | 131 --------------------- tests/unit/test_spaces.cpp | 
78 ++---------- tests/utils/tests_utils.h | 39 +++--- 9 files changed, 43 insertions(+), 250 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 395e69dce..d93671058 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -10,7 +10,6 @@ #include "VecSim/types/bfloat16.h" #include "VecSim/types/float16.h" #include -#include using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; @@ -20,16 +19,16 @@ float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, s float delta, float inv_norm) { float res = 0; for (size_t i = 0; i < dimension; i++) { - float dequantized_V2 = (pVect2v[i] * delta + min_val) * inv_norm; + float dequantized_V2 = (pVect2v[i] * delta + min_val); res += pVect1v[i] * dequantized_V2; } - return res; + return res * inv_norm; } float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - // pVect2 is a vector of int8_t, so we need to dequantize it, normalize it and then multiply it. + // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply it. // it is structured as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)] // The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 7dfad24ce..d4796cbd6 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -5,15 +5,15 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #pragma once #include -/* - pVect1v vector of type fp32 and pVect2v vector of type int8 -*/ +// pVect1v vector of type fp32 and pVect2v vector of type uint8 float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension); + +// pVect1v vector of type fp32 and pVect2v vector of type uint8 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension); float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index a8a1f5040..42f219409 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -30,8 +30,6 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { float t = pVect1[i] - dequantized_V2; res += t * t; } - // The last value is used to normalize the vector. - // The normalization is done by multiplying the result by the inverse of the norm. return res; } diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h index 055e8c630..6f1d25927 100644 --- a/src/VecSim/spaces/L2/L2.h +++ b/src/VecSim/spaces/L2/L2.h @@ -10,6 +10,7 @@ #include +// pVect1v vector of type fp32 and pVect2v vector of type uint8 float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h index 0d21d6476..53034df0e 100644 --- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h @@ -8,7 +8,6 @@ */ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -#include static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum, const __m256 &min_val_vec, const __m256 &delta_vec) { @@ -42,8 +41,6 @@ template // 0..15 float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { float *pVect1 = (float *)pVect1v; uint8_t *pVect2 = (uint8_t *)pVect2v; - 
float *pVect1_debug = (float *)pVect1v; - uint8_t *pVect2_debug = (uint8_t *)pVect2v; // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); @@ -78,38 +75,22 @@ float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimen // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); - // print debug information - // std::cout << "v2_dequant before: "; - // for (size_t i = 0; i < 8; i++) { - // std::cout << v2_dequant[i] << " "; - // } - // std::cout << std::endl; + v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); - // std::cout << "v2_dequant after: "; - // for (size_t i = 0; i < 8; i++) { - // std::cout << v2_dequant[i] << " "; - // } - // std::cout << std::endl; __m256 diff = _mm256_sub_ps(v1, v2_dequant); sum = _mm256_mul_ps(diff, diff); - // print sum + } // If the reminder is >= 8, have another step of 8 floats if constexpr (residual >= 8) { L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); } - float naive_sum = 0; - for (size_t i = 0; i < residual; i++) { - auto dequantized_V2 = (pVect2_debug[i] * delta + min_val); - float t = pVect1_debug[i] - dequantized_V2; - naive_sum += t * t; - } - + // We dealt with the residual part. We are left with some multiple of 16 floats. // In each iteration we calculate 16 floats = 512 bits. do { diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h index a58fcd7e4..c26757be4 100644 --- a/src/VecSim/spaces/L2_space.h +++ b/src/VecSim/spaces/L2_space.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
- */ +*/ #pragma once #include "VecSim/spaces/spaces.h" diff --git a/src/VecSim/spaces/computer/preprocessors.h b/src/VecSim/spaces/computer/preprocessors.h index ae434ea69..1045299b4 100644 --- a/src/VecSim/spaces/computer/preprocessors.h +++ b/src/VecSim/spaces/computer/preprocessors.h @@ -111,134 +111,3 @@ class CosinePreprocessor : public PreprocessorInterface { spaces::normalizeVector_f normalize_func; const size_t dim; }; - -template -class QuantPreprocessor : public PreprocessorInterface { -public: - QuantPreprocessor(std::shared_ptr allocator, size_t dim, size_t bits_per_dim = 8) - : PreprocessorInterface(allocator), dim(dim), bits_per_dim(bits_per_dim), - compressed_bytes_count(calculateCompressedSize(dim)) {} - - void preprocess(const void *original_blob, void *&storage_blob, void *&query_blob, - size_t processed_bytes_count, unsigned char alignment) const override { - // Case 1: Blobs are different (one might be null, or both are allocated and processed separately) - if (storage_blob != query_blob) { - // Process storage blob (compress) - if (storage_blob == nullptr) { - storage_blob = this->allocator->allocate(compressed_bytes_count); - quantize(original_blob, storage_blob); - } - - // Query blob remains uncompressed - if (query_blob == nullptr) { - query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment); - memcpy(query_blob, original_blob, processed_bytes_count); - } - } else { // Case 2: Blobs are the same or both null - if (query_blob == nullptr) { - // For query, we keep the original format - query_blob = this->allocator->allocate_aligned(processed_bytes_count, alignment); - memcpy(query_blob, original_blob, processed_bytes_count); - - // For storage, we compress - storage_blob = this->allocator->allocate(compressed_bytes_count); - quantize(original_blob, storage_blob); - } else { - // If both point to the same memory, we need to separate them - void* new_storage = this->allocator->allocate(compressed_bytes_count); - 
quantize(query_blob, new_storage); - storage_blob = new_storage; - } - } - } - - void preprocessForStorage(const void *original_blob, void *&blob, - size_t processed_bytes_count) const override { - if (blob == nullptr) { - blob = this->allocator->allocate(compressed_bytes_count); - quantize(original_blob, blob); - } else { - // If blob is already allocated, we need to compress in-place - void* temp = this->allocator->allocate(compressed_bytes_count); - quantize(blob, temp); - this->allocator->free_allocation(blob); - blob = temp; - } - } - - void preprocessQuery(const void *original_blob, void *&blob, size_t processed_bytes_count, - unsigned char alignment) const override { - // For query, we keep the original format - if (blob == nullptr) { - blob = this->allocator->allocate_aligned(processed_bytes_count, alignment); - memcpy(blob, original_blob, processed_bytes_count); - } - } - - void preprocessQueryInPlace(void *blob, size_t processed_bytes_count, - unsigned char alignment) const override { - // No compression for query vectors - assert(blob); - } - - void preprocessStorageInPlace(void *blob, size_t processed_bytes_count) const override { - assert(blob); - // Create temporary storage for compressed data - void* temp = this->allocator->allocate(compressed_bytes_count); - quantize(blob, temp); - - // Copy compressed data back to original location - // Note: This assumes blob has enough space for the compressed data - memcpy(blob, temp, compressed_bytes_count); - this->allocator->free_allocation(temp); - } - -private: - const size_t dim; - const size_t bits_per_dim; - const size_t compressed_bytes_count; - - // Calculate the size needed for the compressed vector - static size_t calculateCompressedSize(size_t dim) { - // Quantized values (int8 per dimension) + min (float32) + delta (float32) - return dim * sizeof(int8_t) + 2 * sizeof(float); - } - - // Quantize the vector from original format to compressed format - void quantize(const void *src, void *dst) const { 
- const DataType* src_data = static_cast(src); - - // Find min and max values in the vector - DataType min_val = src_data[0]; - DataType max_val = src_data[0]; - - for (size_t i = 0; i < dim; i++) { - DataType val = src_data[i]; - min_val = val < min_val ? val : min_val; - max_val = val > max_val ? val : max_val; - } - - // Calculate delta (quantization step) - float delta = (max_val - min_val) / 255.0f; - if (delta == 0){ - delta = 1.0f; // Avoid division by zero if all values are the same - } - - // Structure of compressed data: - // [quantized values (int8_t * dim)][min_val (float)][delta (float)] - int8_t* quant_values = static_cast(dst); // convert to int8_t pointer - float* params = reinterpret_cast(quant_values + dim); // convert to float pointer starting after quantized values - - // Store min and delta values for dequantization - params[0] = static_cast(min_val); - params[1] = delta; - - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float normalized = (src_data[i] - min_val) / delta; - if (normalized < 0) normalized = 0; - if (normalized > 255) normalized = 255; - quant_values[i] = static_cast(normalized); - } - } -}; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index a977be3b0..0374a774b 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -316,43 +316,16 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { v2_orig[i] = float(i + 1.5); } - // Create SQ8 compressed version of v2 - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); + std::vector v2_compressed(compressed_size); if (should_normalize) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); } - - // Find min and max for quantization - float min_val = v2_orig[0]; - float max_val = v2_orig[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, v2_orig[i]); - max_val = 
std::max(max_val, v2_orig[i]); - } - - // Calculate delta and inverse norm - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero - - std::vector v2_compressed(compressed_size); - - // Quantize v2 - uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); - float *params = reinterpret_cast(quant_values + dim); - - // Store parameters - params[0] = min_val; - params[1] = delta; - - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float normalized = (v2_orig[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); - } + + // Create SQ8 compressed version of v2 + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data()); float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim); @@ -380,47 +353,20 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); - spaces::GetNormalizeFunc()(v1_orig, dim); - // Find min and max for quantization - float min_val = v2_orig[0]; - float max_val = v2_orig[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, v2_orig[i]); - max_val = std::max(max_val, v2_orig[i]); - } - // Calculate delta and inverse norm - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero - - // Compress v2 std::vector v2_compressed(compressed_size); - uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); - float *params = reinterpret_cast(quant_values + dim); - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float normalized = (v2_orig[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, 
normalized)); - quant_values[i] = static_cast(std::round(normalized)); - } - // Calculate inverse norm from decompressed values - float inv_norm = 0.0f; - for (size_t i = 0; i < dim; i++) { - float decompressed_value = min_val + quant_values[i] * delta; - inv_norm += decompressed_value * decompressed_value; - } - inv_norm = 1.0f / std::sqrt(inv_norm); - // Store parameters - params[0] = min_val; - params[1] = delta; - params[2] = inv_norm; + spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); + + // Create SQ8 compressed version of v2 + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data()); float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance"; } + TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm size_t dim = 5; diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index bb041b0af..1485d332f 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -50,44 +50,43 @@ static void populate_float_vec(float *v, size_t dim, int seed = 1234) { } } -static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { - - std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed - std::uniform_real_distribution dis(-1.0f, 1.0f); - std::vector vec_copy(dim); - for (size_t i = 0; i < dim; i++) { - vec_copy[i] = dis(gen); - } +static void quantize_float_vec_to_uint8(float *v, size_t dim, uint8_t *qv, int seed = 1234) { - // Find min and max for quantization - float min_val = vec_copy[0]; - float max_val = vec_copy[0]; + float min_val = v[0]; + float max_val = v[0]; for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, vec_copy[i]); - max_val = std::max(max_val, vec_copy[i]); + min_val = 
std::min(min_val, v[i]); + max_val = std::max(max_val, v[i]); } - // Calculate delta float delta = (max_val - min_val) / 255.0f; if (delta == 0) delta = 1.0f; // Avoid division by zero - float norm = 0.0f; // Quantize each value for (size_t i = 0; i < dim; i++) { - float normalized = (vec_copy[i] - min_val) / delta; + float normalized = (v[i] - min_val) / delta; normalized = std::max(0.0f, std::min(255.0f, normalized)); - v[i] = static_cast(std::round(normalized)); - norm += (v[i] * delta + min_val) * (v[i] * delta + min_val); + qv[i] = static_cast(std::round(normalized)); + norm += (qv[i] * delta + min_val) * (qv[i] * delta + min_val); } - float inv_norm = 1.0f / std::sqrt(norm); // Store parameters - float *params = reinterpret_cast(v + dim); + float *params = reinterpret_cast(qv + dim); params[0] = min_val; params[1] = delta; params[2] = inv_norm; +} + +static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { + std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed + std::uniform_real_distribution dis(-1.0f, 1.0f); + std::vector vec(dim); + for (size_t i = 0; i < dim; i++) { + vec[i] = dis(gen); + } + quantize_float_vec_to_uint8(vec.data(), dim, v, seed); } From 9ced0be4649f6b03cf3361254b7975dfbb7a1a69 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 13:39:46 +0300 Subject: [PATCH 31/52] static const --- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 6 +++--- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 6 +++--- src/VecSim/spaces/IP/IP_SSE_SQ8.h | 6 +++--- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 6 +++--- src/VecSim/spaces/L2/L2_AVX_SQ8.h | 6 +++--- src/VecSim/spaces/L2/L2_NEON_SQ8.h | 6 +++--- src/VecSim/spaces/L2/L2_SSE_SQ8.h | 6 +++--- src/VecSim/spaces/L2/L2_SVE_SQ8.h | 7 +++---- 8 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index df2f134f1..67e10bad1 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ 
b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -9,7 +9,7 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 &sum256, +static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, const __m256 &min_val_vec, const __m256 &delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); @@ -35,8 +35,8 @@ static inline void InnerProductStepSQ8(float *&pVect1, uint8_t *&pVect2, __m256 template // 0..15 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *pVect2 = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index a95f6da20..cafe2cab4 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -9,7 +9,7 @@ #include "VecSim/spaces/space_includes.h" #include -static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum, +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { // Load 4 float elements from pVect1 float32x4_t v1 = vld1q_f32(pVect1); @@ -34,8 +34,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, float32x4_ template // 0..15 float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *pVect2 = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); // Get dequantization parameters from the end of 
quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h index 05b31da8d..f7bae253e 100644 --- a/src/VecSim/spaces/IP/IP_SSE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h @@ -10,7 +10,7 @@ #include #include -static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum_prod, +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod, const __m128 &min_val_vec, const __m128 &delta_vec) { // Load 4 float elements from pVect1 __m128 v1 = _mm_loadu_ps(pVect1); @@ -32,8 +32,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, __m128 &su template // 0..15 float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *quantized = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const uint8_t *quantized = static_cast(pVect2v); // Get dequantization parameters from the end of quantized vector float min = *(float *)(quantized + dimension); diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index bc80a8785..bbbe328d7 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -11,7 +11,7 @@ #include #include -static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &offset, +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, svfloat32_t &sum, const svfloat32_t &min_val_vec, const svfloat32_t &delta_vec) { svbool_t pg = svptrue_b32(); @@ -37,8 +37,8 @@ static inline void InnerProductStep(float *&pVect1, uint8_t *&pVect2, size_t &of template float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *pVect2 = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const 
uint8_t *pVect2 = static_cast(pVect2v); size_t offset = 0; // Get dequantization parameters from the end of quantized vector diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h index 53034df0e..be7e77fba 100644 --- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h @@ -9,7 +9,7 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum, +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum, const __m256 &min_val_vec, const __m256 &delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); @@ -39,8 +39,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m256 &sum, template // 0..15 float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *pVect2 = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8.h b/src/VecSim/spaces/L2/L2_NEON_SQ8.h index 617389cbb..24f6047a7 100644 --- a/src/VecSim/spaces/L2/L2_NEON_SQ8.h +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8.h @@ -9,7 +9,7 @@ #include "VecSim/spaces/space_includes.h" #include -static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum, +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { // Load 4 float elements from pVect1 float32x4_t v1 = vld1q_f32(pVect1); @@ -37,8 +37,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, float32x4_t &sum, template // 0..15 float 
SQ8_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *pVect2 = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); diff --git a/src/VecSim/spaces/L2/L2_SSE_SQ8.h b/src/VecSim/spaces/L2/L2_SSE_SQ8.h index 89cd7db1a..ded00b166 100644 --- a/src/VecSim/spaces/L2/L2_SSE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SSE_SQ8.h @@ -9,7 +9,7 @@ #include "VecSim/spaces/space_includes.h" #include -static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum, +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum, const __m128 &min_val_vec, const __m128 &delta_vec) { // Load 4 float elements from pVect1 __m128 v1 = _mm_loadu_ps(pVect1); @@ -34,8 +34,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, __m128 &sum, template // 0..15 float SQ8_L2SqrSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *pVect2 = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h index 2cfdb15ad..8f76ce56f 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h @@ -9,8 +9,7 @@ #include "VecSim/spaces/space_includes.h" #include -// Helper function to perform L2 squared distance calculation for a chunk of elements -static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, size_t &offset, +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, svfloat32_t &sum, const svfloat32_t 
&min_val_vec, const svfloat32_t &delta_vec) { svbool_t pg = svptrue_b32(); @@ -39,8 +38,8 @@ static inline void L2SqrStep(float *&pVect1, uint8_t *&pVect2, size_t &offset, template float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - float *pVect1 = (float *)pVect1v; - uint8_t *pVect2 = (uint8_t *)pVect2v; + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); size_t offset = 0; // Get dequantization parameters from the end of quantized vector From 6028dd7ed870bcc4988198b58bded62cec2e7a06 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 15:24:07 +0300 Subject: [PATCH 32/52] format --- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 32 +-- .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 39 ++-- src/VecSim/spaces/IP/IP_AVX_SQ8.h | 35 ++- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_SSE_SQ8.h | 46 ++-- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 63 ++--- src/VecSim/spaces/IP_space.cpp | 221 +++++++++--------- src/VecSim/spaces/IP_space.h | 2 +- src/VecSim/spaces/L2/L2.cpp | 4 +- .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h | 19 +- src/VecSim/spaces/L2/L2_AVX_SQ8.h | 33 ++- src/VecSim/spaces/L2/L2_SVE_SQ8.h | 50 ++-- src/VecSim/spaces/L2_space.cpp | 110 ++++----- src/VecSim/spaces/functions/AVX512F.cpp | 1 - .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 1 - .../spaces/functions/AVX512F_BW_VL_VNNI.h | 4 +- tests/unit/test_bf16.cpp | 2 +- tests/unit/test_spaces.cpp | 94 ++++---- tests/utils/tests_utils.h | 1 - 19 files changed, 372 insertions(+), 387 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 67e10bad1..78151bf44 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -5,30 +5,30 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
- */ +*/ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, - const __m256 &min_val_vec, const __m256 &delta_vec) { + const __m256 &min_val_vec, const __m256 &delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); pVect1 += 8; - + // Load 8 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); pVect2 += 8; - + // Zero-extend uint8 to int32 (AVX2 instruction) __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - + // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - + // Dequantize: (val * delta) + min_val // Use FMA instruction available in AVX2 for better performance __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); - + // Compute dot product and add to sum (using FMA) sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256); } @@ -41,7 +41,7 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - + // Create broadcast vectors for SIMD operations __m256 min_val_vec = _mm256_set1_ps(min_val); __m256 delta_vec = _mm256_set1_ps(delta); @@ -54,25 +54,25 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size if constexpr (residual % 8) { // AVX2 doesn't have native mask loading, so we use the helper function __mmask8 constexpr mask = (1 << (residual % 8)) - 1; - + // Load masked float elements __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - + // Load masked uint8 elements - __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); pVect2 += residual % 
8; - + // Zero-extend uint8 to int32 (AVX2 instruction) __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - + // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - + // Dequantize: (val * delta) + min (using FMA) __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); - + // Compute dot product with masking sum256 = _mm256_mul_ps(v1, v2_dequant); } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index b33b3629c..8bc0569da 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -5,20 +5,19 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). - */ +*/ #pragma once #include "VecSim/spaces/space_includes.h" #include #include -static inline void -SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, - const __m512 &min_val_vec, const __m512 &delta_vec) { +static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec) { // Load 16 float elements from pVec1 __m512 v1 = _mm512_loadu_ps(pVec1); // Load 16 uint8 elements from pVec2 and convert to __m512i - __m128i v2_128 = _mm_loadu_si128((__m128i*)pVec2); + __m128i v2_128 = _mm_loadu_si128((__m128i *)pVec2); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); // Convert uint8 to float @@ -37,7 +36,8 @@ SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, // Common implementation for both inner product and cosine similarity template // 0..15 -float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, float inv_norm = 1.0f) { +float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, + 
float inv_norm = 1.0f) { const float *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); const float *pEnd1 = pVec1 + dimension; @@ -62,56 +62,53 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); // Load masked uint8 elements - __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); + __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - // Dequantize __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - + // Compute dot product __m512 product = _mm512_mul_ps(v1, dequantized); - // Apply mask to product and add to sum sum = _mm512_mask_add_ps(sum, mask, sum, product); - + pVec1 += residual; pVec2 += residual; } - + // Process remaining full chunks of 16 elements do { SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); } while (pVec1 < pEnd1); // Return the raw inner product result - return _mm512_reduce_add_ps(sum);; + return _mm512_reduce_add_ps(sum); + ; } template // 0..15 -float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, - const void *pVec2v, - size_t dimension) { +float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { // Calculate inner product using common implementation float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); - + // The inner product similarity is 1 - ip return 1.0f - ip; } template // 0..15 float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, - size_t dimension) { + size_t dimension) { // Get the inverse norm factor stored after min_val and delta const uint8_t *pVec2 = static_cast(pVec2v); const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); - + // Calculate inner product using common implementation with normalization float ip = SQ8_InnerProductImp(pVec1v, 
pVec2v, dimension, inv_norm); - + // The cosine similarity is 1 - ip return 1.0f - ip; } - diff --git a/src/VecSim/spaces/IP/IP_AVX_SQ8.h b/src/VecSim/spaces/IP/IP_AVX_SQ8.h index d28a13a4f..385f7967e 100644 --- a/src/VecSim/spaces/IP/IP_AVX_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX_SQ8.h @@ -5,29 +5,29 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). - */ +*/ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, - const __m256 &min_val_vec, const __m256 &delta_vec) { + const __m256 &min_val_vec, const __m256 &delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); pVect1 += 8; - + // Load 8 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); pVect2 += 8; - + // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - + // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - + // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); - + // Compute dot product and add to sum sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant)); } @@ -38,7 +38,7 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen // pVect2 is a quantized uint8_t vector const uint8_t *pVect2 = static_cast(pVect2v); const float *pEnd1 = pVect1 + dimension; - + // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); @@ -54,22 +54,21 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen 
__mmask8 constexpr mask = (1 << (residual % 8)) - 1; __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - + // Load quantized values and dequantize - __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); pVect2 += residual % 8; - + // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - + // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - + // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); - - + // Compute dot product with masking sum256 = _mm256_mul_ps(v1, v2_dequant); } @@ -99,10 +98,10 @@ float SQ8_CosineSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dime // Get dequantization parameters from the end of quantized vector const uint8_t *pVect2 = static_cast(pVect2v); const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - + // Calculate inner product using common implementation with normalization float ip = SQ8_InnerProductImp(pVect1v, pVect2v, dimension); - + // For cosine, we need to account for the vector norms // The inv_norm parameter is stored after min_val and delta in the quantized vector return 1.0f - ip * inv_norm; diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index cafe2cab4..b2529439c 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -10,7 +10,7 @@ #include static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, - const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { + const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { // Load 4 float elements from pVect1 float32x4_t v1 = vld1q_f32(pVect1); pVect1 += 4; diff --git a/src/VecSim/spaces/IP/IP_SSE_SQ8.h b/src/VecSim/spaces/IP/IP_SSE_SQ8.h index f7bae253e..a28f2cf12 100644 
--- a/src/VecSim/spaces/IP/IP_SSE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). - */ +*/ #include "VecSim/spaces/space_includes.h" #include #include @@ -15,17 +15,17 @@ static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2 // Load 4 float elements from pVect1 __m128 v1 = _mm_loadu_ps(pVect1); pVect1 += 4; - + // Load 4 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float*)pVect2))); + __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2))); pVect2 += 4; - + // Convert int32 to float __m128 v2_f = _mm_cvtepi32_ps(v2_i); - + // Dequantize: (val * delta) + min_val __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec); - + // Compute dot product and add to sum sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant)); } @@ -38,7 +38,7 @@ float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, s // Get dequantization parameters from the end of quantized vector float min = *(float *)(quantized + dimension); float delta = *(float *)(quantized + dimension + sizeof(float)); - + // Create broadcast vectors for SIMD operations __m128 min_val_vec = _mm_set1_ps(min); __m128 delta_vec = _mm_set1_ps(delta); @@ -53,43 +53,37 @@ float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, s if constexpr (residual % 4) { __m128 v1; __m128 v2_dequant = _mm_setzero_ps(); - + if constexpr (residual % 4 == 3) { // Load 3 floats and set the last one to 0 - v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 + v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part - + // Dequantize first value 
float dequant0 = quantized[0] * delta + min; v2_dequant = _mm_load_ss(&dequant0); - + // Dequantize next two values - float dequant_high[2] = { - quantized[1] * delta + min, - quantized[2] * delta + min - }; + float dequant_high[2] = {quantized[1] * delta + min, quantized[2] * delta + min}; v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high); - + } else if constexpr (residual % 4 == 2) { // Load 2 floats and set the last two to 0 v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1); - + // Dequantize two values - float dequant_high[2] = { - quantized[0] * delta + min, - quantized[1] * delta + min - }; + float dequant_high[2] = {quantized[0] * delta + min, quantized[1] * delta + min}; v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high); - + } else if constexpr (residual % 4 == 1) { // Load 1 float and set the last three to 0 v1 = _mm_load_ss(pVect1); - + // Dequantize one value float dequant0 = quantized[0] * delta + min; v2_dequant = _mm_load_ss(&dequant0); } - + pVect1 += residual % 4; quantized += residual % 4; sum = _mm_mul_ps(v1, v2_dequant); @@ -100,7 +94,7 @@ float SQ8_InnerProductSIMD16_SSE_IMP(const void *pVect1v, const void *pVect2v, s while (pVect1 < pEnd1) { InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec); } - + // TmpRes must be 16 bytes aligned. 
float PORTABLE_ALIGN16 TmpRes[4]; _mm_store_ps(TmpRes, sum); @@ -120,7 +114,7 @@ float SQ8_CosineSIMD16_SSE(const void *pVect1v, const void *pVect2v, size_t dime const uint8_t *pVect2 = static_cast(pVect2v); // Get quantization parameters const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - + // Compute inner product with dequantization using the common function // We need to cast away const for the inner product function, but it doesn't modify the vectors const float res = SQ8_InnerProductSIMD16_SSE_IMP(pVect1v, pVect2v, dimension); diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index bbbe328d7..4fe6ad5bb 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -5,32 +5,32 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). - */ +*/ #include "VecSim/spaces/space_includes.h" #include #include #include static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, - svfloat32_t &sum, const svfloat32_t &min_val_vec, + svfloat32_t &sum, const svfloat32_t &min_val_vec, const svfloat32_t &delta_vec) { svbool_t pg = svptrue_b32(); - + // Load float elements from pVect1 svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - + // Convert uint8 to uint32 svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa - + // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32); - + // Dequantize: (val * delta) + min_val svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec); - + // Compute dot product and add to sum sum = svmla_f32_z(pg, sum, v1, v2_dequant); - + // Move to the next set of elements offset += svcntw(); } @@ -44,7 +44,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz // Get dequantization 
parameters from the end of quantized vector float min = *(float *)(pVect2 + dimension); float delta = *(float *)(pVect2 + dimension + sizeof(float)); - + // Create broadcast vectors for SIMD operations svbool_t pg = svptrue_b32(); svfloat32_t min_val_vec = svdup_f32(min); @@ -52,7 +52,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz // Get the number of 32-bit elements per vector at runtime uint64_t sve_word_count = svcntw(); - + // Multiple accumulators to increase instruction-level parallelism svfloat32_t sum0 = svdup_f32(0.0f); svfloat32_t sum1 = svdup_f32(0.0f); @@ -64,25 +64,27 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz size_t remaining = dimension % sve_word_count; if (remaining > 0) { // Create predicate for the remaining elements - svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); + svbool_t pg_partial = + svwhilelt_b32(static_cast(0), static_cast(remaining)); // Load float elements from pVect1 with predicate svfloat32_t v1 = svld1_f32(pg_partial, pVect1); - // load 8-bit bytes from pVect2+offset and zero-extend each into a 32-bit lane - svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit :contentReference[oaicite:0]{index=0} + svuint32_t v2_u32 = svld1ub_u32( + pg_partial, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit + // :contentReference[oaicite:0]{index=0} - // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); - + // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec); - + svfloat32_t v2_dequant = + svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec); + // Compute dot product and add to sum sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); - + // Move pointers past the partial chunk offset += remaining; } @@ -90,15 +92,16 @@ float 
SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz // Process 4 chunks at a time in the main loop auto chunk_size = 4 * sve_word_count; - const size_t number_of_chunks = (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size; - + const size_t number_of_chunks = + (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size; + for (size_t i = 0; i < number_of_chunks; i++) { InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec); } - + // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); @@ -109,33 +112,35 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz if constexpr (additional_steps > 2) { InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); } - + // Combine the accumulators svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); sum = svadd_f32_z(pg, sum, sum2); sum = svadd_f32_z(pg, sum, sum3); - + // Horizontal sum of all elements in the vector float result = svaddv_f32(pg, sum); - + return result; } template float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); + return 1.0f - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, + dimension); } template float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { const uint8_t *pVect2 = static_cast(pVect2v); - + // Get quantization parameters const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - + // Compute inner product with dequantization using the common function - const float res = SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, 
dimension); - + const float res = + SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); + // For cosine, we need to account for the vector norms // The inv_norm parameter is stored after min_val and delta in the quantized vector return 1.0f - res * inv_norm; diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 93609475d..9d49d072d 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -33,121 +33,122 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; namespace spaces { - dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { - unsigned char dummy_alignment; - if (alignment == nullptr) { - alignment = &dummy_alignment; - } - - dist_func_t ret_dist_func = SQ8_InnerProduct; - [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); - #ifdef CPU_FEATURES_ARCH_AARCH64 - - #ifdef OPT_SVE2 - if (features.sve2) { - return Choose_SQ8_IP_implementation_SVE2(dim); - } - #endif - #ifdef OPT_SVE - if (features.sve) { - return Choose_SQ8_IP_implementation_SVE(dim); - } - #endif - #ifdef OPT_NEON - if (features.asimd) { - return Choose_SQ8_IP_implementation_NEON(dim); - } - #endif - - #endif - - #ifdef CPU_FEATURES_ARCH_X86_64 - // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
- if (dim < 16) { - return ret_dist_func; - } - #ifdef OPT_AVX512_F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vnni) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats - return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); - } - #endif - #ifdef OPT_AVX - if (features.avx) { - if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - *alignment = 8 * sizeof(float); // handles 8 floats - return Choose_SQ8_IP_implementation_AVX(dim); - } - #endif - #ifdef OPT_SSE - if (features.sse) { - if (dim % 4 == 0) // no point in aligning if we have an offsetting residual - *alignment = 4 * sizeof(float); // handles 4 floats - return Choose_SQ8_IP_implementation_SSE(dim); - } - #endif - #endif // __x86_64__ +dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_InnerProduct; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); +#ifdef CPU_FEATURES_ARCH_AARCH64 + +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_IP_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_IP_implementation_NEON(dim); + } +#endif + +#endif + +#ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
+ if (dim < 16) { return ret_dist_func; } +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#ifdef OPT_AVX + if (features.avx) { + if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + *alignment = 8 * sizeof(float); // handles 8 floats + return Choose_SQ8_IP_implementation_AVX(dim); + } +#endif +#ifdef OPT_SSE + if (features.sse) { + if (dim % 4 == 0) // no point in aligning if we have an offsetting residual + *alignment = 4 * sizeof(float); // handles 4 floats + return Choose_SQ8_IP_implementation_SSE(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + +dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_Cosine; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); +#ifdef CPU_FEATURES_ARCH_AARCH64 + +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_Cosine_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_Cosine_implementation_NEON(dim); + } +#endif + +#endif -dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { - unsigned char dummy_alignment; - if (alignment == nullptr) { - alignment = &dummy_alignment; - } - - dist_func_t ret_dist_func = SQ8_Cosine; - [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); - #ifdef CPU_FEATURES_ARCH_AARCH64 - - #ifdef OPT_SVE2 - if (features.sve2) { - return Choose_SQ8_Cosine_implementation_SVE2(dim); - } - #endif - 
#ifdef OPT_SVE - if (features.sve) { - return Choose_SQ8_Cosine_implementation_SVE(dim); - } - #endif - #ifdef OPT_NEON - if (features.asimd) { - return Choose_SQ8_Cosine_implementation_NEON(dim); - } - #endif - - #endif - - #ifdef CPU_FEATURES_ARCH_X86_64 - // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. - if (dim < 16) { - return ret_dist_func; - } - #ifdef OPT_AVX512_F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vnni) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats - return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); - } - #endif - #ifdef OPT_AVX - if (features.avx) { - if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - *alignment = 8 * sizeof(float); // handles 8 floats - return Choose_SQ8_Cosine_implementation_AVX(dim); - } - #endif - #ifdef OPT_SSE - if (features.sse) { - if (dim % 4 == 0) // no point in aligning if we have an offsetting residual - *alignment = 4 * sizeof(float); // handles 4 floats - return Choose_SQ8_Cosine_implementation_SSE(dim); - } - #endif - #endif // __x86_64__ +#ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
+ if (dim < 16) { return ret_dist_func; } +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#ifdef OPT_AVX + if (features.avx) { + if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + *alignment = 8 * sizeof(float); // handles 8 floats + return Choose_SQ8_Cosine_implementation_AVX(dim); + } +#endif +#ifdef OPT_SSE + if (features.sse) { + if (dim % 4 == 0) // no point in aligning if we have an offsetting residual + *alignment = 4 * sizeof(float); // handles 4 floats + return Choose_SQ8_Cosine_implementation_SSE(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index e375e8e37..db2d0b2d9 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -30,5 +30,5 @@ dist_func_t IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = n dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 42f219409..1b40a587c 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -19,8 +19,8 @@ float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); // pvect2 is a vector of int8_t, so we need to dequantize it, normalize it and 
then multiply it. - // it structred as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)] - // The last two values are used to dequantize the vector. + // it structred as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm + // (float)] The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h index c3d06d1a3..c90aa35fd 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h @@ -9,14 +9,13 @@ #include "VecSim/spaces/space_includes.h" // Helper function to perform L2 squared distance calculation for a chunk of 16 elements -static inline void -SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum, - const __m512 &min_val_vec, const __m512 &delta_vec) { +static inline void SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec) { // Load 16 float elements from pVect1 __m512 v1 = _mm512_loadu_ps(pVect1); // Load 16 uint8 elements from pVect2 and convert to __m512i - __m128i v2_128 = _mm_loadu_si128((__m128i*)pVect2); + __m128i v2_128 = _mm_loadu_si128((__m128i *)pVect2); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); // Convert uint8 to float @@ -38,7 +37,7 @@ SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum, template // 0..15 float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, - size_t dimension) { + size_t dimension) { const float *pVect1 = static_cast(pVect1v); const uint8_t *pVect2 = static_cast(pVect2v); const float *pEnd1 = pVect1 + dimension; @@ -53,7 +52,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 // Initialize sum 
accumulator __m512 sum = _mm512_setzero_ps(); - + // Handle residual elements (0 to 15) if constexpr (residual > 0) { // Create mask for residual elements @@ -63,7 +62,7 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1); // Load masked uint8 elements from pVect2 - __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVect2)); + __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVect2)); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); __m512 v2_f = _mm512_cvtepi32_ps(v2_512); @@ -83,12 +82,12 @@ float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2 } // Process remaining full chunks of 16 elements - do { + do { SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); - }while (pVect1 < pEnd1); + } while (pVect1 < pEnd1); // Horizontal sum float result = _mm512_reduce_add_ps(sum); - + return result; } diff --git a/src/VecSim/spaces/L2/L2_AVX_SQ8.h b/src/VecSim/spaces/L2/L2_AVX_SQ8.h index be7e77fba..f6fceca0d 100644 --- a/src/VecSim/spaces/L2/L2_AVX_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX_SQ8.h @@ -9,29 +9,29 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum, - const __m256 &min_val_vec, const __m256 &delta_vec) { +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum, + const __m256 &min_val_vec, const __m256 &delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); - + // Load 8 uint8 elements from pVect2 - __m128i v2_128 = _mm_loadl_epi64((__m128i*)pVect2); - + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - + // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - + // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, 
delta_vec), min_val_vec); - + // Compute difference __m256 diff = _mm256_sub_ps(v1, v2_dequant); - + // Square difference and add to sum sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - + // Advance pointers pVect1 += 8; pVect2 += 8; @@ -57,33 +57,30 @@ float SQ8_L2SqrSIMD16_AVX(const void *pVect1v, const void *pVect2v, size_t dimen __mmask8 constexpr mask = (1 << (residual % 8)) - 1; __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - + uint8_t temp_buf[8] = {0}; // Manually copy elements for (size_t i = 0; i < residual % 8; i++) { temp_buf[i] = pVect2[i]; } // Load from buffer - __m128i v2_128 = _mm_loadl_epi64((__m128i*)temp_buf); + __m128i v2_128 = _mm_loadl_epi64((__m128i *)temp_buf); pVect2 += residual % 8; - + // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - + // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - + // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); - v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); __m256 diff = _mm256_sub_ps(v1, v2_dequant); - sum = _mm256_mul_ps(diff, diff); - } // If the reminder is >= 8, have another step of 8 floats diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h index 8f76ce56f..7e3db05d5 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h @@ -10,28 +10,28 @@ #include static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, - svfloat32_t &sum, const svfloat32_t &min_val_vec, - const svfloat32_t &delta_vec) { + svfloat32_t &sum, const svfloat32_t &min_val_vec, + const svfloat32_t &delta_vec) { svbool_t pg = svptrue_b32(); - + // Load float elements from pVect1 svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - + // Convert uint8 to uint32 svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); - + // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg, 
v2_u32); - + // Dequantize: (val * delta) + min_val svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec); - + // Compute difference svfloat32_t diff = svsub_f32_z(pg, v1, v2_dequant); - + // Square difference and add to sum sum = svmla_f32_z(pg, sum, diff, diff); - + // Move to the next set of elements offset += svcntw(); } @@ -45,7 +45,7 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - + // Create broadcast vectors for SIMD operations svbool_t pg = svptrue_b32(); svfloat32_t min_val_vec = svdup_f32(min_val); @@ -53,7 +53,7 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi // Get the number of 32-bit elements per vector at runtime uint64_t sve_word_count = svcntw(); - + // Multiple accumulators to increase instruction-level parallelism svfloat32_t sum0 = svdup_f32(0.0f); svfloat32_t sum1 = svdup_f32(0.0f); @@ -65,26 +65,28 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi size_t remaining = dimension % sve_word_count; if (remaining > 0) { // Create predicate for the remaining elements - svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); - + svbool_t pg_partial = + svwhilelt_b32(static_cast(0), static_cast(remaining)); + // Load float elements from pVect1 with predicate svfloat32_t v1 = svld1_f32(pg_partial, pVect1); - + // Load uint8 elements from pVect2 with predicate, convert to int32, then to float svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); - + // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); - + // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec); - + 
svfloat32_t v2_dequant = + svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec); + // Compute difference svfloat32_t diff = svsub_f32_z(pg_partial, v1, v2_dequant); - + // Square difference and add to sum sum0 = svmla_f32_z(pg_partial, sum0, diff, diff); - + // Move pointers past the partial chunk offset += remaining; } @@ -99,27 +101,25 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi if constexpr (additional_steps > 2) { L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); } - // Process 4 chunks at a time in the main loop auto chunk_size = 4 * sve_word_count; size_t number_of_chunks = dimension / chunk_size; - + for (size_t i = 0; i < number_of_chunks; i++) { L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec); } - // Combine the accumulators svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); sum = svadd_f32_z(pg, sum, sum2); sum = svadd_f32_z(pg, sum, sum3); - + // Horizontal sum of all elements in the vector float result = svaddv_f32(pg, sum); - + return result; } diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 6e4086f74..363330f29 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -33,63 +33,63 @@ using float16 = vecsim_types::float16; namespace spaces { - dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { - unsigned char dummy_alignment; - if (!alignment) { - alignment = &dummy_alignment; - } - - dist_func_t ret_dist_func = SQ8_L2Sqr; - - [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); - #ifdef CPU_FEATURES_ARCH_AARCH64 - #ifdef OPT_SVE2 - if (features.sve2) { - return Choose_SQ8_L2_implementation_SVE2(dim); - } - #endif - #ifdef OPT_SVE - if (features.sve) 
{ - return Choose_SQ8_L2_implementation_SVE(dim); - } - #endif - #ifdef OPT_NEON - if (features.asimd) { - return Choose_SQ8_L2_implementation_NEON(dim); - } - #endif - #endif - - #ifdef CPU_FEATURES_ARCH_X86_64 - // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. - - if (dim < 16) { - return ret_dist_func; - } - #ifdef OPT_AVX512_F_BW_VL_VNNI - if (features.avx512f && features.avx512bw && features.avx512vnni) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats - return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim); - } - #endif - #ifdef OPT_AVX - if (features.avx) { - if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - *alignment = 8 * sizeof(float); // handles 8 floats - return Choose_SQ8_L2_implementation_AVX(dim); - } - #endif - #ifdef OPT_SSE - if (features.sse) { - if (dim % 4 == 0) // no point in aligning if we have an offsetting residual - *alignment = 4 * sizeof(float); // handles 4 floats - return Choose_SQ8_L2_implementation_SSE(dim); - } - #endif - #endif // __x86_64__ +dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (!alignment) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_L2Sqr; + + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_L2_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_L2_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_SQ8_L2_implementation_NEON(dim); + } +#endif +#endif + +#ifdef CPU_FEATURES_ARCH_X86_64 + // Optimizations assume at least 16 floats. If we have less, we use the naive implementation. 
+ + if (dim < 16) { return ret_dist_func; } +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vnni) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#ifdef OPT_AVX + if (features.avx) { + if (dim % 8 == 0) // no point in aligning if we have an offsetting residual + *alignment = 8 * sizeof(float); // handles 8 floats + return Choose_SQ8_L2_implementation_AVX(dim); + } +#endif +#ifdef OPT_SSE + if (features.sse) { + if (dim % 4 == 0) // no point in aligning if we have an offsetting residual + *alignment = 4 * sizeof(float); // handles 4 floats + return Choose_SQ8_L2_implementation_SSE(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} dist_func_t L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; diff --git a/src/VecSim/spaces/functions/AVX512F.cpp b/src/VecSim/spaces/functions/AVX512F.cpp index c9124f3b4..bcddbea91 100644 --- a/src/VecSim/spaces/functions/AVX512F.cpp +++ b/src/VecSim/spaces/functions/AVX512F.cpp @@ -16,7 +16,6 @@ #include "VecSim/spaces/IP/IP_AVX512F_FP32.h" #include "VecSim/spaces/IP/IP_AVX512F_FP64.h" - namespace spaces { #include "implementation_chooser.h" diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 889725204..b383ab4e2 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -21,7 +21,6 @@ namespace spaces { #include "implementation_chooser.h" - dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI); diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h 
b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 77eff5d57..745a339fb 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -5,15 +5,13 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #pragma once #include "VecSim/spaces/spaces.h" namespace spaces { - - dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); diff --git a/tests/unit/test_bf16.cpp b/tests/unit/test_bf16.cpp index ebef947f0..458aeb80d 100644 --- a/tests/unit/test_bf16.cpp +++ b/tests/unit/test_bf16.cpp @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 0374a774b..fcd75b70b 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -322,7 +322,7 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); } - + // Create SQ8 compressed version of v2 // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data()); @@ -2062,7 +2062,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { v1_orig[i] = float(i + 1.5); v2_orig[i] = float(i * 0.75 + 1.0); } - + // Create SQ8 compressed version of v2 std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); @@ -2073,8 +2073,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { dist_func_t arch_opt_func; float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); - // Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI +// Test different optimizations based on CPU features +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2086,21 +2086,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset optimizations flag, so we'll choose the next optimization. 
optimization.avx512f = 0; } - #endif - #ifdef OPT_AVX +#endif +#ifdef OPT_AVX if (optimization.avx) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset avx flag as well, so we'll choose the next optimization (SSE). optimization.avx = 0; } - #endif - #ifdef OPT_SSE +#endif +#ifdef OPT_SSE if (optimization.sse) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2112,9 +2112,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sse flag as well, so we'll choose the next optimization (default). optimization.sse = 0; } - #endif +#endif - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2126,8 +2126,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sve2 flag as well, so we'll choose the next option (default). optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2139,21 +2139,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sve flag as well, so we'll choose the next option (default). 
optimization.sve = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "NEON with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. optimization.asimd = 0; } - #endif - +#endif // Test default implementation unsigned char alignment = 0; @@ -2190,8 +2189,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { dist_func_t arch_opt_func; float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); - // Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI +// Test different optimizations based on CPU features +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2202,8 +2201,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; optimization.avx512f = 0; } - #endif - #ifdef OPT_AVX +#endif +#ifdef OPT_AVX if (optimization.avx) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2214,8 +2213,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; optimization.avx = 0; } - #endif - #ifdef OPT_SSE +#endif +#ifdef OPT_SSE if (optimization.sse) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, 
&optimization); @@ -2226,21 +2225,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; optimization.sse = 0; } - #endif - #ifdef OPT_SVE2 +#endif +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE2 with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve2 flag as well, so we'll choose the next option (default). optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2252,8 +2251,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Unset sve flag as well, so we'll choose the next option (default). optimization.sve = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2265,8 +2264,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Unset optimizations flag, so we'll choose the next optimization. 
optimization.asimd = 0; } - #endif - +#endif // Test default implementation unsigned char alignment = 0; @@ -2309,7 +2307,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { dist_func_t arch_opt_func; float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim); - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2321,8 +2319,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim; optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2334,8 +2332,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim; optimization.sve = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2347,10 +2345,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim; optimization.asimd = 0; } - #endif +#endif - // Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI +// Test different optimizations based on CPU features +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2362,8 +2360,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim; optimization.avx512f = 0; } - #endif - #ifdef OPT_AVX +#endif +#ifdef OPT_AVX if (optimization.avx) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2375,9 +2373,9 @@ 
TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; optimization.avx = 0; } - #endif +#endif - #ifdef OPT_SSE +#ifdef OPT_SSE if (optimization.sse) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2389,13 +2387,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim; optimization.sse = 0; } - #endif +#endif // Test default implementation unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << - dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 1485d332f..7aa18dbbe 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -89,7 +89,6 @@ static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) { quantize_float_vec_to_uint8(vec.data(), dim, v, seed); } - template float integral_compute_norm(const datatype *vec, size_t dim) { return spaces::IntegralType_ComputeNorm(vec, dim); From 3c2ee113dd5911ca5fe244dda65f22e9abb9dac3 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 15:46:57 +0300 Subject: [PATCH 33/52] change to uint --- src/VecSim/spaces/L2/L2.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 1b40a587c..03ade3885 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -18,8 +18,8 @@ using float16 = 
vecsim_types::float16; float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - // pvect2 is a vector of int8_t, so we need to dequantize it, normalize it and then multiply it. - // it structred as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm + // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply it. + // it structred as [quantized values (uint8_t * dim)][min_val (float)][delta (float)][inv_norm // (float)] The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); From ad3985e994e80c41c5ae8eb8d049ac664adaf322 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Mon, 12 May 2025 18:25:46 +0300 Subject: [PATCH 34/52] format --- src/VecSim/spaces/IP/IP.cpp | 16 +++--- src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 2 +- .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 2 +- src/VecSim/spaces/IP/IP_NEON_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 2 +- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 2 +- src/VecSim/spaces/L2/L2.cpp | 6 +-- src/VecSim/spaces/L2/L2_AVX2_SQ8.h | 2 +- .../spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h | 2 +- src/VecSim/spaces/L2/L2_NEON_SQ8.h | 4 +- src/VecSim/spaces/L2/L2_SSE4_SQ8.h | 52 ++++++++----------- src/VecSim/spaces/L2/L2_SVE_SQ8.h | 2 +- src/VecSim/spaces/functions/AVX.cpp | 1 - src/VecSim/spaces/functions/SSE4.h | 1 - 14 files changed, 44 insertions(+), 52 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 6db1a6d77..5e2c4b4dc 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -14,9 +14,8 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; - -float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, float min_val, - float 
delta, float inv_norm) { +float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension, + float min_val, float delta, float inv_norm) { float res = 0; for (size_t i = 0; i < dimension; i++) { float dequantized_V2 = (pVect2v[i] * delta + min_val); @@ -28,9 +27,9 @@ float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, s float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply it. - // it is structured as [quantized values (int8_t * dim)][min_val (float)][delta (float)][inv_norm (float)] - // The last two values are used to dequantize the vector. + // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply + // it. it is structured as [quantized values (int8_t * dim)][min_val (float)][delta + // (float)][inv_norm (float)] The last two values are used to dequantize the vector. 
const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); // Compute inner product with dequantization @@ -41,13 +40,14 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - + // Get quantization parameters const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); // Compute inner product with dequantization - const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm); + const float res = + FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm); return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 8a6d745e3..6ea609f2d 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index 8bc0569da..f2f4efd52 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #pragma once #include "VecSim/spaces/space_includes.h" #include diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index b2529439c..3e632dcdb 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #include "VecSim/spaces/space_includes.h" #include diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index a0b0b02ff..0a6f3ee8c 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #include "VecSim/spaces/space_includes.h" #include #include diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 4fe6ad5bb..4beaf81ca 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #include "VecSim/spaces/space_includes.h" #include #include diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 6b1774316..a68ea5114 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -18,9 +18,9 @@ using float16 = vecsim_types::float16; float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { const auto *pVect1 = static_cast(pVect1v); const auto *pVect2 = static_cast(pVect2v); - // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply it. 
- // it structred as [quantized values (uint8_t * dim)][min_val (float)][delta (float)][inv_norm - // (float)] The last two values are used to dequantize the vector. + // pvect2 is a vector of uint8_t, so we need to dequantize it, normalize it and then multiply + // it. it structred as [quantized values (uint8_t * dim)][min_val (float)][delta + // (float)][inv_norm (float)] The last two values are used to dequantize the vector. const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h index 56346ddb9..2d2702763 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h index c90aa35fd..d2775f5be 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #include "VecSim/spaces/space_includes.h" // Helper function to perform L2 squared distance calculation for a chunk of 16 elements diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8.h b/src/VecSim/spaces/L2/L2_NEON_SQ8.h index 24f6047a7..e751d1c00 100644 --- a/src/VecSim/spaces/L2/L2_NEON_SQ8.h +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8.h @@ -5,12 +5,12 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). -*/ + */ #include "VecSim/spaces/space_includes.h" #include static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, - const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { + const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { // Load 4 float elements from pVect1 float32x4_t v1 = vld1q_f32(pVect1); pVect1 += 4; diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h index 12e7251be..3ee673d3d 100644 --- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h @@ -5,29 +5,29 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #include "VecSim/spaces/space_includes.h" #include static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum, - const __m128 &min_val_vec, const __m128 &delta_vec) { + const __m128 &min_val_vec, const __m128 &delta_vec) { // Load 4 float elements from pVect1 __m128 v1 = _mm_loadu_ps(pVect1); pVect1 += 4; - + // Load 4 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float*)pVect2))); + __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2))); pVect2 += 4; - + // Convert int32 to float __m128 v2_f = _mm_cvtepi32_ps(v2_i); - + // Dequantize: (val * delta) + min_val __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec); - + // Compute difference __m128 diff = _mm_sub_ps(v1, v2_dequant); - + // Square difference and add to sum sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } @@ -40,7 +40,7 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime // Get dequantization parameters from the end of quantized vector const float min_val = *reinterpret_cast(pVect2 + dimension); const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - + // Create broadcast vectors for SIMD operations __m128 min_val_vec = _mm_set1_ps(min_val); __m128 delta_vec = _mm_set1_ps(delta); @@ -55,49 +55,43 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime if constexpr (residual % 4) { __m128 v1; __m128 v2_dequant = _mm_setzero_ps(); - + if constexpr (residual % 4 == 3) { // Load 3 floats and set the last one to 0 - v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 + v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part - + // Dequantize first value float dequant0 = pVect2[0] * delta + min_val; v2_dequant = _mm_load_ss(&dequant0); - + // Dequantize next two values - float 
dequant_high[2] = { - pVect2[1] * delta + min_val, - pVect2[2] * delta + min_val - }; + float dequant_high[2] = {pVect2[1] * delta + min_val, pVect2[2] * delta + min_val}; v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high); - + } else if constexpr (residual % 4 == 2) { // Load 2 floats and set the last two to 0 v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1); - + // Dequantize two values - float dequant_high[2] = { - pVect2[0] * delta + min_val, - pVect2[1] * delta + min_val - }; + float dequant_high[2] = {pVect2[0] * delta + min_val, pVect2[1] * delta + min_val}; v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high); - + } else if constexpr (residual % 4 == 1) { // Load 1 float and set the last three to 0 v1 = _mm_load_ss(pVect1); - + // Dequantize one value float dequant0 = pVect2[0] * delta + min_val; v2_dequant = _mm_load_ss(&dequant0); } - + pVect1 += residual % 4; pVect2 += residual % 4; - + // Compute difference __m128 diff = _mm_sub_ps(v1, v2_dequant); - + // Square difference and initialize sum sum = _mm_mul_ps(diff, diff); } @@ -118,7 +112,7 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); } - + // TmpRes must be 16 bytes aligned float PORTABLE_ALIGN16 TmpRes[4]; _mm_store_ps(TmpRes, sum); diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h index 7e3db05d5..8bce46365 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h @@ -5,7 +5,7 @@ * Licensed under your choice of the Redis Source Available License 2.0 * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
-*/ + */ #include "VecSim/spaces/space_includes.h" #include diff --git a/src/VecSim/spaces/functions/AVX.cpp b/src/VecSim/spaces/functions/AVX.cpp index 253afce14..4b707a5b5 100644 --- a/src/VecSim/spaces/functions/AVX.cpp +++ b/src/VecSim/spaces/functions/AVX.cpp @@ -18,7 +18,6 @@ namespace spaces { #include "implementation_chooser.h" - dist_func_t Choose_FP32_IP_implementation_AVX(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, FP32_InnerProductSIMD16_AVX); diff --git a/src/VecSim/spaces/functions/SSE4.h b/src/VecSim/spaces/functions/SSE4.h index 654bf7f94..27bbae0e0 100644 --- a/src/VecSim/spaces/functions/SSE4.h +++ b/src/VecSim/spaces/functions/SSE4.h @@ -16,5 +16,4 @@ dist_func_t Choose_SQ8_IP_implementation_SSE4(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SSE4(size_t dim); dist_func_t Choose_SQ8_L2_implementation_SSE4(size_t dim); - } // namespace spaces From 76d2fdd424a5d4271d353fe250f98a23fe3540a7 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 18 May 2025 12:13:34 +0300 Subject: [PATCH 35/52] added fma avx2 --- cmake/x86_64InstructionFlags.cmake | 4 + src/VecSim/spaces/CMakeLists.txt | 6 + src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 112 +++++++++ src/VecSim/spaces/IP/IP_AVX2_SQ8.h | 1 - .../spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h | 7 +- src/VecSim/spaces/IP_space.cpp | 15 ++ src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 106 +++++++++ src/VecSim/spaces/L2_space.cpp | 8 + src/VecSim/spaces/functions/AVX2_FMA.cpp | 36 +++ src/VecSim/spaces/functions/AVX2_FMA.h | 20 ++ tests/benchmark/spaces_benchmarks/bm_spaces.h | 1 + .../spaces_benchmarks/bm_spaces_sq8.cpp | 6 + tests/unit/test_spaces.cpp | 217 +++++++++++++----- 13 files changed, 473 insertions(+), 66 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h create mode 100644 src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h create mode 100644 src/VecSim/spaces/functions/AVX2_FMA.cpp create mode 100644 src/VecSim/spaces/functions/AVX2_FMA.h diff --git 
a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake index 29281be37..dadd550a8 100644 --- a/cmake/x86_64InstructionFlags.cmake +++ b/cmake/x86_64InstructionFlags.cmake @@ -61,6 +61,10 @@ if(CXX_AVX2) add_compile_definitions(OPT_AVX2) endif() +if(CXX_AVX2 AND CXX_FMA) + add_compile_definitions(OPT_AVX2_FMA) +endif() + if(CXX_AVX) add_compile_definitions(OPT_AVX) endif() diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index cd6179999..d88750e91 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -56,6 +56,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") list(APPEND OPTIMIZATIONS functions/AVX2.cpp) endif() + if(CXX_AVX2 AND CXX_FMA) + message("Building with AVX2 and FMA") + set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma") + list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp) + endif() + if(CXX_F16C AND CXX_FMA AND CXX_AVX) message("Building with CXX_F16C") set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx") diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h new file mode 100644 index 000000000..822277c93 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, + const __m256 &min_val_vec, const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize and compute dot product in one step using FMA + // (val * delta) + min_val -> v2_dequant + // sum256 += v1 * v2_dequant + // Using FMA: sum256 = v1 * v2_dequant + sum256 + + // First, compute v2_dequant = v2_f * delta_vec + min_val_vec + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Then, compute sum256 += v1 * v2_dequant using FMA + sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256); +} + +template // 0..15 +float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + // pVect2 is a quantized uint8_t vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min_val); + __m256 delta_vec = _mm256_set1_ps(delta); + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one + // 16-float block, so mask loading is guaranteed to be safe. 
+ if constexpr (residual % 8) { + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load quantized values and dequantize + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += residual % 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize using FMA: (val * delta) + min_val + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product with masking + sum256 = _mm256_mul_ps(v1, v2_dequant); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. + do { + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } while (pVect1 < pEnd1); + + return my_mm256_reduce_add_ps(sum256); +} + +template // 0..15 +float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); +} + +template // 0..15 +float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + // Get dequantization parameters from the end of quantized vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); + + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); + + // For cosine, we need to account for the vector norms + // The inv_norm parameter is stored after min_val and delta in the quantized vector + 
return 1.0f - ip * inv_norm; +} diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 6ea609f2d..89b1c0b6b 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -67,7 +67,6 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); - v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); // Compute dot product with masking sum256 = _mm256_mul_ps(v1, v2_dequant); diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index f2f4efd52..3fd665111 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -61,8 +61,8 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi // Load masked float elements __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - // Load masked uint8 elements - __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast(pVec2)); + // Load full uint8 elements - we know that the first 16 elements are safe to load + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); __m512 v2_f = _mm512_cvtepi32_ps(v2_512); @@ -73,7 +73,7 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi __m512 product = _mm512_mul_ps(v1, dequantized); // Apply mask to product and add to sum - sum = _mm512_mask_add_ps(sum, mask, sum, product); + sum = _mm512_fmadd_ps(sum, sum, product); pVec1 += residual; pVec2 += residual; @@ -86,7 +86,6 @@ float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimensi // Return the raw inner product result return _mm512_reduce_add_ps(sum); - ; } template // 0..15 diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index b7eb828d0..1bcd3a304 100644 
--- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -20,6 +20,7 @@ #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_FMA.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/SSE4.h" #include "VecSim/spaces/functions/NEON.h" @@ -74,6 +75,13 @@ dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); } #endif +#ifdef OPT_AVX2_FMA + if (features.avx2 && features.fma3) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_IP_implementation_AVX2_FMA(dim); + } +#endif #ifdef OPT_AVX2 if (features.avx2) { if (dim % 8 == 0) // no point in aligning if we have an offsetting residual @@ -133,6 +141,13 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); } #endif +#ifdef OPT_AVX2_FMA + if (features.avx2 && features.fma3) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_Cosine_implementation_AVX2_FMA(dim); + } +#endif #ifdef OPT_AVX2 if (features.avx2) { if (dim % 8 == 0) // no point in aligning if we have an offsetting residual diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h new file mode 100644 index 000000000..fd5c38d5a --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, + const __m256 &min_val_vec, const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: v2_dequant = v2_f * delta_vec + min_val_vec + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Calculate squared difference using FMA + // (v1 - v2_dequant)^2 = v1^2 - 2*v1*v2_dequant + v2_dequant^2 + // Using FMA: v1^2 - 2*v1*v2_dequant + v2_dequant^2 + + // First, compute v2_dequant^2 + __m256 v2_dequant_squared = _mm256_mul_ps(v2_dequant, v2_dequant); + + // Then, compute v1^2 + __m256 v1_squared = _mm256_mul_ps(v1, v1); + + // Finally, compute -2*v1*v2_dequant + v2_dequant^2 + v1^2 using FMA + // -2*v1*v2_dequant + v2_dequant^2 = -2 * v1 * v2_dequant + v2_dequant^2 + __m256 neg_2_v1 = _mm256_mul_ps(v1, _mm256_set1_ps(-2.0f)); + __m256 diff_squared = _mm256_fmadd_ps(neg_2_v1, v2_dequant, v2_dequant_squared); + diff_squared = _mm256_add_ps(diff_squared, v1_squared); + + // Add to running sum + sum256 = _mm256_add_ps(sum256, diff_squared); +} + +template // 0..15 +float SQ8_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + // pVect2 is a quantized uint8_t vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = 
*reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min_val); + __m256 delta_vec = _mm256_set1_ps(delta); + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one + // 16-float block, so mask loading is guaranteed to be safe. + if constexpr (residual % 8) { + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load quantized values and dequantize + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += residual % 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize using FMA: (val * delta) + min_val + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); + + // Calculate squared difference + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + sum256 = _mm256_mul_ps(diff, diff); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. 
+ do { + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } while (pVect1 < pEnd1); + + return my_mm256_reduce_add_ps(sum256); +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 6e50a99bb..81f0df91d 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -19,6 +19,7 @@ #include "VecSim/spaces/functions/AVX512FP16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_FMA.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/SSE4.h" #include "VecSim/spaces/functions/NEON.h" @@ -74,6 +75,13 @@ dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim); } #endif +#ifdef OPT_AVX2_FMA + if (features.avx2 && features.fma3) { + if (dim % 16 == 0) // no point in aligning if we have an offsetting residual + *alignment = 16 * sizeof(float); // handles 16 floats + return Choose_SQ8_L2_implementation_AVX2_FMA(dim); + } +#endif #ifdef OPT_AVX2 if (features.avx2) { if (dim % 8 == 0) // no point in aligning if we have an offsetting residual diff --git a/src/VecSim/spaces/functions/AVX2_FMA.cpp b/src/VecSim/spaces/functions/AVX2_FMA.cpp new file mode 100644 index 000000000..4dc627c57 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX2_FMA.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ +#include "AVX2_FMA.h" +#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h" + +namespace spaces { + +#include "implementation_chooser.h" +// FMA optimized implementations +dist_func_t Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2_FMA); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2_FMA); + return ret_dist_func; +} +dist_func_t Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2_FMA); + return ret_dist_func; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX2_FMA.h b/src/VecSim/spaces/functions/AVX2_FMA.h new file mode 100644 index 000000000..80d5adb6d --- /dev/null +++ b/src/VecSim/spaces/functions/AVX2_FMA.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim); +dist_func_t Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim); +dist_func_t Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim); + + +} // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index d10e3ac76..d99bcc4ca 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -24,6 +24,7 @@ #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_FMA.h" #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/SSE4.h" #include "VecSim/spaces/functions/SSE3.h" diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index d780b8285..8e7140bba 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -69,6 +69,12 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 1 avx512_f_bw_vl_vnni_supported); #endif // AVX512_F_BW_VL_VNNI +#ifdef OPT_AVX2_FMA +bool avx2_fma3_supported = opt.avx2 && opt.fma3; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported); +#endif // AVX2_FMA + #ifdef AVX2 // AVX2 functions bool avx2_supported = opt.avx2; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index c9addd484..b660562d3 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -23,13 +23,14 @@ #include "VecSim/spaces/functions/AVX512F.h" #include "VecSim/spaces/functions/AVX.h" #include "VecSim/spaces/functions/SSE.h" -#include 
"VecSim/spaces/functions/SSE4.h" #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" +#include "VecSim/spaces/functions/AVX2_FMA.h" #include "VecSim/spaces/functions/SSE3.h" +#include "VecSim/spaces/functions/SSE4.h" #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/NEON.h" #include "VecSim/spaces/functions/NEON_DOTPROD.h" @@ -317,16 +318,43 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { v2_orig[i] = float(i + 1.5); } + // Create SQ8 compressed version of v2 + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); - std::vector v2_compressed(compressed_size); if (should_normalize) { spaces::GetNormalizeFunc()(v1_orig, dim); spaces::GetNormalizeFunc()(v2_orig, dim); } - // Create SQ8 compressed version of v2 - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) - test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data()); + // Find min and max for quantization + float min_val = v2_orig[0]; + float max_val = v2_orig[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v2_orig[i]); + max_val = std::max(max_val, v2_orig[i]); + } + + // Calculate delta and inverse norm + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero + + std::vector v2_compressed(compressed_size); + + // Quantize v2 + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + float *params = reinterpret_cast(quant_values + dim); + + // Store parameters + params[0] = min_val; + params[1] = delta; + + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (v2_orig[i] - min_val) / delta; + normalized = std::max(0.0f, 
std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + } float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim); @@ -354,20 +382,47 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { v2_orig[i] = float(i + 1.5); } + // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); - std::vector v2_compressed(compressed_size); - spaces::GetNormalizeFunc()(v1_orig, dim); - spaces::GetNormalizeFunc()(v2_orig, dim); + // Find min and max for quantization + float min_val = v2_orig[0]; + float max_val = v2_orig[0]; + for (size_t i = 1; i < dim; i++) { + min_val = std::min(min_val, v2_orig[i]); + max_val = std::max(max_val, v2_orig[i]); + } + // Calculate delta and inverse norm + float delta = (max_val - min_val) / 255.0f; + if (delta == 0) + delta = 1.0f; // Avoid division by zero - // Create SQ8 compressed version of v2 - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) - test_utils::quantize_float_vec_to_uint8(v2_orig, dim, v2_compressed.data()); + // Compress v2 + std::vector v2_compressed(compressed_size); + uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); + float *params = reinterpret_cast(quant_values + dim); + + // Quantize each value + for (size_t i = 0; i < dim; i++) { + float normalized = (v2_orig[i] - min_val) / delta; + normalized = std::max(0.0f, std::min(255.0f, normalized)); + quant_values[i] = static_cast(std::round(normalized)); + } + // Calculate inverse norm from decompressed values + float inv_norm = 0.0f; + for (size_t i = 0; i < dim; i++) { + float decompressed_value = min_val + quant_values[i] * delta; + inv_norm += decompressed_value * decompressed_value; + } + inv_norm = 1.0f / std::sqrt(inv_norm); + // Store parameters + params[0] = min_val; + params[1] = delta; + params[2] = inv_norm; float dist = SQ8_Cosine((const void *)v1_orig, (const void 
*)v2_compressed.data(), dim); ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance"; } - TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm size_t dim = 5; @@ -2063,7 +2118,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { v1_orig[i] = float(i + 1.5); v2_orig[i] = float(i * 0.75 + 1.0); } - + // Create SQ8 compressed version of v2 std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); @@ -2074,8 +2129,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { dist_func_t arch_opt_func; float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); -// Test different optimizations based on CPU features -#ifdef OPT_AVX512_F_BW_VL_VNNI + // Test different optimizations based on CPU features + #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2087,21 +2142,34 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset optimizations flag, so we'll choose the next optimization. optimization.avx512f = 0; } -#endif -#ifdef OPT_AVX2 + #endif + #ifdef OPT_AVX2_FMA + if (optimization.avx2 && optimization.fma3) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. 
+ optimization.avx2 = optimization.fma3 = 0; + } + #endif + #ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; - // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX2 with dim " << dim; - // Unset avx flag as well, so we'll choose the next optimization (SSE4). + // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; + // Unset avx flag as well, so we'll choose the next optimization (SSE). optimization.avx2 = 0; } -#endif -#ifdef OPT_SSE4 + #endif + #ifdef OPT_SSE4 if (optimization.sse4_1) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2109,13 +2177,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SSE with dim " << dim; - // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE4 with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; // Unset sse flag as well, so we'll choose the next optimization (default). optimization.sse4_1 = 0; } -#endif + #endif -#ifdef OPT_SVE2 + #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2127,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sve2 flag as well, so we'll choose the next option (default). 
optimization.sve2 = 0; } -#endif -#ifdef OPT_SVE + #endif + #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2140,20 +2208,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sve flag as well, so we'll choose the next option (default). optimization.sve = 0; } -#endif -#ifdef OPT_NEON + #endif + #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "NEON with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. optimization.asimd = 0; } -#endif + #endif + // Test default implementation unsigned char alignment = 0; @@ -2190,8 +2259,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { dist_func_t arch_opt_func; float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); -// Test different optimizations based on CPU features -#ifdef OPT_AVX512_F_BW_VL_VNNI + // Test different optimizations based on CPU features + #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2202,8 +2271,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; optimization.avx512f = 0; } -#endif -#ifdef OPT_AVX + #endif + #ifdef OPT_AVX2_FMA + if (optimization.avx2 && optimization.fma3) { + unsigned char alignment = 0; + arch_opt_func = 
IP_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; + optimization.avx2 = optimization.fma3 = 0; + } + #endif + #ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2214,8 +2295,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; optimization.avx2 = 0; } -#endif -#ifdef OPT_SSE4 + #endif + #ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2223,24 +2304,24 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SSE with dim " << dim; - // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE4 with dim " << dim; + // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; optimization.sse4_1 = 0; } -#endif -#ifdef OPT_SVE2 + #endif + #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE2 with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve2 flag as well, so we'll choose the 
next option (default). optimization.sve2 = 0; } -#endif -#ifdef OPT_SVE + #endif + #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2252,8 +2333,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Unset sve flag as well, so we'll choose the next option (default). optimization.sve = 0; } -#endif -#ifdef OPT_NEON + #endif + #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2265,7 +2346,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Unset optimizations flag, so we'll choose the next optimization. optimization.asimd = 0; } -#endif + #endif + // Test default implementation unsigned char alignment = 0; @@ -2308,7 +2390,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { dist_func_t arch_opt_func; float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim); -#ifdef OPT_SVE2 + #ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2320,8 +2402,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim; optimization.sve2 = 0; } -#endif -#ifdef OPT_SVE + #endif + #ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2333,8 +2415,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim; optimization.sve = 0; } -#endif -#ifdef OPT_NEON + #endif + #ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2346,10 +2428,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim; optimization.asimd = 0; } -#endif + #endif -// Test 
different optimizations based on CPU features -#ifdef OPT_AVX512_F_BW_VL_VNNI + // Test different optimizations based on CPU features + #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2361,8 +2443,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim; optimization.avx512f = 0; } -#endif -#ifdef OPT_AVX2 + #endif + #ifdef OPT_AVX2_FMA + if (optimization.avx2 && optimization.fma3) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + << "AVX with dim " << dim; + // We don't align SQ8 vectors with cosine distance + // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; + optimization.avx2 = optimization.fma3 = 0; + } + #endif + #ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2374,9 +2469,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; optimization.avx2 = 0; } -#endif + #endif -#ifdef OPT_SSE4 + #ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2388,13 +2483,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim; optimization.sse4_1 = 0; } -#endif + #endif // Test default implementation unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for 
dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << + dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } From b47cc5239c1a7a89c2d62fafdaebee66360cdfce Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 18 May 2025 12:36:01 +0300 Subject: [PATCH 36/52] format --- src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h | 5 +- src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 10 +-- src/VecSim/spaces/functions/AVX2_FMA.h | 1 - tests/unit/test_spaces.cpp | 104 ++++++++++++------------- 4 files changed, 59 insertions(+), 61 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h index 822277c93..007ee333e 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -9,8 +9,9 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" -static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, - const __m256 &min_val_vec, const __m256 &delta_vec) { +static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, + __m256 &sum256, const __m256 &min_val_vec, + const __m256 &delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); pVect1 += 8; diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h index fd5c38d5a..75ae892f9 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h @@ -10,7 +10,7 @@ #include "VecSim/spaces/AVX_utils.h" static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, - const __m256 &min_val_vec, const __m256 &delta_vec) { + const __m256 &min_val_vec, const __m256 
&delta_vec) { // Load 8 float elements from pVect1 __m256 v1 = _mm256_loadu_ps(pVect1); pVect1 += 8; @@ -31,19 +31,19 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _ // Calculate squared difference using FMA // (v1 - v2_dequant)^2 = v1^2 - 2*v1*v2_dequant + v2_dequant^2 // Using FMA: v1^2 - 2*v1*v2_dequant + v2_dequant^2 - + // First, compute v2_dequant^2 __m256 v2_dequant_squared = _mm256_mul_ps(v2_dequant, v2_dequant); - + // Then, compute v1^2 __m256 v1_squared = _mm256_mul_ps(v1, v1); - + // Finally, compute -2*v1*v2_dequant + v2_dequant^2 + v1^2 using FMA // -2*v1*v2_dequant + v2_dequant^2 = -2 * v1 * v2_dequant + v2_dequant^2 __m256 neg_2_v1 = _mm256_mul_ps(v1, _mm256_set1_ps(-2.0f)); __m256 diff_squared = _mm256_fmadd_ps(neg_2_v1, v2_dequant, v2_dequant_squared); diff_squared = _mm256_add_ps(diff_squared, v1_squared); - + // Add to running sum sum256 = _mm256_add_ps(sum256, diff_squared); } diff --git a/src/VecSim/spaces/functions/AVX2_FMA.h b/src/VecSim/spaces/functions/AVX2_FMA.h index 80d5adb6d..b81dfd5ab 100644 --- a/src/VecSim/spaces/functions/AVX2_FMA.h +++ b/src/VecSim/spaces/functions/AVX2_FMA.h @@ -16,5 +16,4 @@ dist_func_t Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim); dist_func_t Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim); - } // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index b660562d3..e1ba1a1bd 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2118,7 +2118,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { v1_orig[i] = float(i + 1.5); v2_orig[i] = float(i * 0.75 + 1.0); } - + // Create SQ8 compressed version of v2 std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); @@ -2129,8 +2129,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { dist_func_t arch_opt_func; float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim); - 
// Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI +// Test different optimizations based on CPU features +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2142,8 +2142,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset optimizations flag, so we'll choose the next optimization. optimization.avx512f = 0; } - #endif - #ifdef OPT_AVX2_FMA +#endif +#ifdef OPT_AVX2_FMA if (optimization.avx2 && optimization.fma3) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2155,21 +2155,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset optimizations flag, so we'll choose the next optimization. optimization.avx2 = optimization.fma3 = 0; } - #endif - #ifdef OPT_AVX2 +#endif +#ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset avx flag as well, so we'll choose the next optimization (SSE). optimization.avx2 = 0; } - #endif - #ifdef OPT_SSE4 +#endif +#ifdef OPT_SSE4 if (optimization.sse4_1) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2181,9 +2181,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sse flag as well, so we'll choose the next optimization (default). 
optimization.sse4_1 = 0; } - #endif +#endif - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2195,8 +2195,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sve2 flag as well, so we'll choose the next option (default). optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2208,21 +2208,20 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { // Unset sve flag as well, so we'll choose the next option (default). optimization.sve = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "NEON with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. 
optimization.asimd = 0; } - #endif - +#endif // Test default implementation unsigned char alignment = 0; @@ -2259,8 +2258,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { dist_func_t arch_opt_func; float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); - // Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI +// Test different optimizations based on CPU features +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2271,8 +2270,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; optimization.avx512f = 0; } - #endif - #ifdef OPT_AVX2_FMA +#endif +#ifdef OPT_AVX2_FMA if (optimization.avx2 && optimization.fma3) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2283,8 +2282,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; optimization.avx2 = optimization.fma3 = 0; } - #endif - #ifdef OPT_AVX2 +#endif +#ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2295,8 +2294,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; optimization.avx2 = 0; } - #endif - #ifdef OPT_SSE +#endif +#ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2307,21 +2306,21 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; optimization.sse4_1 = 0; } - #endif - #ifdef OPT_SVE2 +#endif 
+#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE2 with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; // Unset sve2 flag as well, so we'll choose the next option (default). optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2333,8 +2332,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Unset sve flag as well, so we'll choose the next option (default). optimization.sve = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2346,8 +2345,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { // Unset optimizations flag, so we'll choose the next optimization. 
optimization.asimd = 0; } - #endif - +#endif // Test default implementation unsigned char alignment = 0; @@ -2390,7 +2388,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { dist_func_t arch_opt_func; float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim); - #ifdef OPT_SVE2 +#ifdef OPT_SVE2 if (optimization.sve2) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2402,8 +2400,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim; optimization.sve2 = 0; } - #endif - #ifdef OPT_SVE +#endif +#ifdef OPT_SVE if (optimization.sve) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2415,8 +2413,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim; optimization.sve = 0; } - #endif - #ifdef OPT_NEON +#endif +#ifdef OPT_NEON if (optimization.asimd) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2428,10 +2426,10 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim; optimization.asimd = 0; } - #endif +#endif - // Test different optimizations based on CPU features - #ifdef OPT_AVX512_F_BW_VL_VNNI +// Test different optimizations based on CPU features +#ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2443,8 +2441,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim; optimization.avx512f = 0; } - #endif - #ifdef OPT_AVX2_FMA +#endif +#ifdef OPT_AVX2_FMA if (optimization.avx2 && optimization.fma3) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ 
-2456,8 +2454,8 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; optimization.avx2 = optimization.fma3 = 0; } - #endif - #ifdef OPT_AVX2 +#endif +#ifdef OPT_AVX2 if (optimization.avx2) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2469,9 +2467,9 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; optimization.avx2 = 0; } - #endif +#endif - #ifdef OPT_SSE +#ifdef OPT_SSE if (optimization.sse4_1) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); @@ -2483,13 +2481,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim; optimization.sse4_1 = 0; } - #endif +#endif // Test default implementation unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << - dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } From 6566a0b7a3cf60a55abf9cb2f5ad361368ebddf0 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 18 May 2025 14:14:25 +0300 Subject: [PATCH 37/52] remove opt.avx2 --- tests/unit/test_spaces.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index e1ba1a1bd..afdf7d01d 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2153,7 +2153,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) { << "AVX with dim " << dim; // 
ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. - optimization.avx2 = optimization.fma3 = 0; + optimization.fma3 = 0; } #endif #ifdef OPT_AVX2 @@ -2280,7 +2280,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; - optimization.avx2 = optimization.fma3 = 0; + optimization.fma3 = 0; } #endif #ifdef OPT_AVX2 @@ -2452,7 +2452,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "AVX with dim " << dim; // We don't align SQ8 vectors with cosine distance // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; - optimization.avx2 = optimization.fma3 = 0; + optimization.fma3 = 0; } #endif #ifdef OPT_AVX2 From d767ea92f6ce1408863165ed79c3ef249aa75b3c Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 18 May 2025 15:20:27 +0300 Subject: [PATCH 38/52] fix OPT_AVX2 bm-spaces --- tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 8e7140bba..1349a3512 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -75,7 +75,7 @@ INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported); #endif // AVX2_FMA -#ifdef AVX2 +#ifdef OPT_AVX2 // AVX2 functions bool avx2_supported = opt.avx2; INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported); From ea0ac003733097a9e5e054a3887ac84d73d12cd3 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Wed, 21 May 2025 17:19:39 +0300 Subject: [PATCH 39/52] pr chanes --- 
src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 51 +++++++++++++++++------------- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 8 ++--- src/VecSim/spaces/L2/L2_SVE_SQ8.h | 8 ++--- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 0a6f3ee8c..9822b03fb 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -10,24 +10,26 @@ #include #include -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod, - const __m128 &min_val_vec, const __m128 &delta_vec) { - // Load 4 float elements from pVect1 - __m128 v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - - // Load 4 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2))); - pVect2 += 4; - - // Convert int32 to float - __m128 v2_f = _mm_cvtepi32_ps(v2_i); - - // Dequantize: (val * delta) + min_val - __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec); - - // Compute dot product and add to sum - sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant)); +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, + __m128 &sum_prod1, __m128 &sum_prod2, + const __m128 &min_val_vec, const __m128 &delta_vec) { + // Load first 4 elements + __m128 v1a = _mm_loadu_ps(pVect1); + __m128i v2a_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2)); + + // Load next 4 elements + __m128 v1b = _mm_loadu_ps(pVect1 + 4); + __m128i v2b_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2 + 4)); + + pVect1 += 8; + pVect2 += 8; + + // Process both sets + __m128 v2a_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2a_i), delta_vec), min_val_vec); + __m128 v2b_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2b_i), delta_vec), min_val_vec); + + sum_prod1 = _mm_add_ps(sum_prod1, _mm_mul_ps(v1a, v2a_dequant)); + sum_prod2 = _mm_add_ps(sum_prod2, _mm_mul_ps(v1b, v2b_dequant)); } template // 
0..15 @@ -45,7 +47,9 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, const float *pEnd1 = pVect1 + dimension; - __m128 sum = _mm_setzero_ps(); + // Initialize two sum accumulators + __m128 sum1 = _mm_setzero_ps(); + __m128 sum2 = _mm_setzero_ps(); // Process residual elements if needed if constexpr (residual) { @@ -86,15 +90,18 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, pVect1 += residual % 4; quantized += residual % 4; - sum = _mm_mul_ps(v1, v2_dequant); + sum1 = _mm_mul_ps(v1, v2_dequant); // Use sum1 for residual } } - // Process 4 elements at a time + // Process 8 elements at a time while (pVect1 < pEnd1) { - InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec); + InnerProductStep(pVect1, quantized, sum1, sum2, min_val_vec, delta_vec); } + // Combine the two sums + __m128 sum = _mm_add_ps(sum1, sum2); + // TmpRes must be 16 bytes aligned. float PORTABLE_ALIGN16 TmpRes[4]; _mm_store_ps(TmpRes, sum); diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 4beaf81ca..116bd8325 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -23,13 +23,13 @@ static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2 svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa // Convert uint32 to float32 - svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32); + svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec); + svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec); // Compute dot product and add to sum - sum = svmla_f32_z(pg, sum, v1, v2_dequant); + sum = svmla_f32_x(pg, sum, v1, v2_dequant); // Move to the next set of elements offset += svcntw(); @@ -80,7 +80,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz // 
Dequantize: (val * delta) + min_val svfloat32_t v2_dequant = - svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec); + svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec); // Compute dot product and add to sum sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h index 8bce46365..4ab24b297 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h @@ -21,16 +21,16 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_ svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // Convert uint32 to float32 - svfloat32_t v2_f = svcvt_f32_u32_z(pg, v2_u32); + svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svadd_f32_z(pg, svmul_f32_z(pg, v2_f, delta_vec), min_val_vec); + svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec); // Compute difference - svfloat32_t diff = svsub_f32_z(pg, v1, v2_dequant); + svfloat32_t diff = svsub_f32_x(pg, v1, v2_dequant); // Square difference and add to sum - sum = svmla_f32_z(pg, sum, diff, diff); + sum = svmla_f32_x(pg, sum, diff, diff); // Move to the next set of elements offset += svcntw(); From ef09ead6d9fb830944c71303c6a26b2b4fcf457d Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 22 May 2025 09:35:19 +0300 Subject: [PATCH 40/52] format --- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 12 ++++++------ src/VecSim/spaces/IP/IP_SVE_SQ8.h | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 9822b03fb..b32989838 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -10,24 +10,24 @@ #include #include -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, - __m128 &sum_prod1, __m128 &sum_prod2, - const __m128 &min_val_vec, const __m128 
&delta_vec) { +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod1, + __m128 &sum_prod2, const __m128 &min_val_vec, + const __m128 &delta_vec) { // Load first 4 elements __m128 v1a = _mm_loadu_ps(pVect1); __m128i v2a_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2)); - + // Load next 4 elements __m128 v1b = _mm_loadu_ps(pVect1 + 4); __m128i v2b_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2 + 4)); - + pVect1 += 8; pVect2 += 8; // Process both sets __m128 v2a_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2a_i), delta_vec), min_val_vec); __m128 v2b_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2b_i), delta_vec), min_val_vec); - + sum_prod1 = _mm_add_ps(sum_prod1, _mm_mul_ps(v1a, v2a_dequant)); sum_prod2 = _mm_add_ps(sum_prod2, _mm_mul_ps(v1b, v2b_dequant)); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 116bd8325..863ef3652 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -79,8 +79,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = - svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec); + svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec); // Compute dot product and add to sum sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); From 7567730949ca029e2f5bafe0f17fce6d5651dc71 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 22 May 2025 09:56:14 +0300 Subject: [PATCH 41/52] change to _mm_cvtsi32_si128 --- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 51 +++++++++++++----------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index b32989838..67ebc4547 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -10,26 +10,24 @@ #include 
#include -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod1, - __m128 &sum_prod2, const __m128 &min_val_vec, - const __m128 &delta_vec) { - // Load first 4 elements - __m128 v1a = _mm_loadu_ps(pVect1); - __m128i v2a_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2)); - - // Load next 4 elements - __m128 v1b = _mm_loadu_ps(pVect1 + 4); - __m128i v2b_i = _mm_cvtepu8_epi32(_mm_loadu_si32(pVect2 + 4)); - - pVect1 += 8; - pVect2 += 8; - - // Process both sets - __m128 v2a_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2a_i), delta_vec), min_val_vec); - __m128 v2b_dequant = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v2b_i), delta_vec), min_val_vec); - - sum_prod1 = _mm_add_ps(sum_prod1, _mm_mul_ps(v1a, v2a_dequant)); - sum_prod2 = _mm_add_ps(sum_prod2, _mm_mul_ps(v1b, v2b_dequant)); +static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod, + const __m128 &min_val_vec, const __m128 &delta_vec) { + // Load 4 float elements from pVect1 + __m128 v1 = _mm_loadu_ps(pVect1); + pVect1 += 4; + + // Load 4 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2)); + pVect2 += 4; + + // Convert int32 to float + __m128 v2_f = _mm_cvtepi32_ps(v2_i); + + // Dequantize: (val * delta) + min_val + __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec); + + // Compute dot product and add to sum + sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant)); } template // 0..15 @@ -47,9 +45,7 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, const float *pEnd1 = pVect1 + dimension; - // Initialize two sum accumulators - __m128 sum1 = _mm_setzero_ps(); - __m128 sum2 = _mm_setzero_ps(); + __m128 sum = _mm_setzero_ps(); // Process residual elements if needed if constexpr (residual) { @@ -90,18 +86,15 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, 
pVect1 += residual % 4; quantized += residual % 4; - sum1 = _mm_mul_ps(v1, v2_dequant); // Use sum1 for residual + sum = _mm_mul_ps(v1, v2_dequant); } } - // Process 8 elements at a time + // Process 4 elements at a time while (pVect1 < pEnd1) { - InnerProductStep(pVect1, quantized, sum1, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec); } - // Combine the two sums - __m128 sum = _mm_add_ps(sum1, sum2); - // TmpRes must be 16 bytes aligned. float PORTABLE_ALIGN16 TmpRes[4]; _mm_store_ps(TmpRes, sum); From a767547476c75edf1d3f4ec24b0e1231c15a1c76 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 22 May 2025 10:07:38 +0300 Subject: [PATCH 42/52] Change in the l2 --- src/VecSim/spaces/L2/L2_SSE4_SQ8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h index 3ee673d3d..16b60286b 100644 --- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h @@ -16,7 +16,7 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m12 pVect1 += 4; // Load 4 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_i = _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((float *)pVect2))); + __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2)); pVect2 += 4; // Convert int32 to float From e6422dc40bd122b68f931ab61b3e585f4b3387a2 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 27 May 2025 09:17:06 +0300 Subject: [PATCH 43/52] PR changes --- src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 19 ++++--------------- src/VecSim/spaces/L2/L2_AVX2_SQ8.h | 11 +++-------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h index 75ae892f9..2cff76a31 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h @@ -1,3 +1,4 @@ + /* * Copyright (c) 2006-Present, Redis 
Ltd. * All rights reserved. @@ -28,21 +29,9 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _ // Dequantize: v2_dequant = v2_f * delta_vec + min_val_vec __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); - // Calculate squared difference using FMA - // (v1 - v2_dequant)^2 = v1^2 - 2*v1*v2_dequant + v2_dequant^2 - // Using FMA: v1^2 - 2*v1*v2_dequant + v2_dequant^2 - - // First, compute v2_dequant^2 - __m256 v2_dequant_squared = _mm256_mul_ps(v2_dequant, v2_dequant); - - // Then, compute v1^2 - __m256 v1_squared = _mm256_mul_ps(v1, v1); - - // Finally, compute -2*v1*v2_dequant + v2_dequant^2 + v1^2 using FMA - // -2*v1*v2_dequant + v2_dequant^2 = -2 * v1 * v2_dequant + v2_dequant^2 - __m256 neg_2_v1 = _mm256_mul_ps(v1, _mm256_set1_ps(-2.0f)); - __m256 diff_squared = _mm256_fmadd_ps(neg_2_v1, v2_dequant, v2_dequant_squared); - diff_squared = _mm256_add_ps(diff_squared, v1_squared); + // Calculate squared difference - simple and efficient approach + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + __m256 diff_squared = _mm256_mul_ps(diff, diff); // Add to running sum sum256 = _mm256_add_ps(sum256, diff_squared); diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h index 2d2702763..bdde99e62 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h @@ -58,13 +58,8 @@ float SQ8_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dime __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - uint8_t temp_buf[8] = {0}; - // Manually copy elements - for (size_t i = 0; i < residual % 8; i++) { - temp_buf[i] = pVect2[i]; - } - // Load from buffer - __m128i v2_128 = _mm_loadl_epi64((__m128i *)temp_buf); + // Direct load - safe because we only process the masked elements + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); pVect2 += residual % 8; // Zero-extend uint8 to int32 @@ -76,10 +71,10 @@ float SQ8_L2SqrSIMD16_AVX2(const void 
*pVect1v, const void *pVect2v, size_t dime // Dequantize: (val * delta) + min_val __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); + // Apply mask to zero out unused elements v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); __m256 diff = _mm256_sub_ps(v1, v2_dequant); - sum = _mm256_mul_ps(diff, diff); } From 10a609865276f39c5ee4879eae6ec2a46f2c4839 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 27 May 2025 11:51:24 +0300 Subject: [PATCH 44/52] added chunk to functions --- src/VecSim/spaces/IP/IP_SVE_FP32.h | 22 +++++++++++----------- src/VecSim/spaces/IP/IP_SVE_SQ8.h | 26 +++++++++++++------------- src/VecSim/spaces/L2/L2_SVE_FP32.h | 23 ++++++++++++----------- src/VecSim/spaces/L2/L2_SVE_SQ8.h | 24 ++++++++++++------------ 4 files changed, 48 insertions(+), 47 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SVE_FP32.h b/src/VecSim/spaces/IP/IP_SVE_FP32.h index c60acb16a..c1cc79ccd 100644 --- a/src/VecSim/spaces/IP/IP_SVE_FP32.h +++ b/src/VecSim/spaces/IP/IP_SVE_FP32.h @@ -11,13 +11,13 @@ #include static inline void InnerProductStep(float *&pVect1, float *&pVect2, size_t &offset, - svfloat32_t &sum) { + svfloat32_t &sum, const size_t chunk) { svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset); svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset); sum = svmla_f32_x(svptrue_b32(), sum, v1, v2); - offset += svcntw(); + offset += chunk; } template @@ -26,33 +26,33 @@ float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t float *pVect2 = (float *)pVect2v; size_t offset = 0; - uint64_t sve_word_count = svcntw(); + uint64_t chunk = svcntw(); svfloat32_t sum0 = svdup_f32(0.0f); svfloat32_t sum1 = svdup_f32(0.0f); svfloat32_t sum2 = svdup_f32(0.0f); svfloat32_t sum3 = svdup_f32(0.0f); - auto chunk_size = 4 * sve_word_count; + auto chunk_size = 4 * chunk; const size_t number_of_chunks = dimension / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - 
InnerProductStep(pVect1, pVect2, offset, sum0); - InnerProductStep(pVect1, pVect2, offset, sum1); - InnerProductStep(pVect1, pVect2, offset, sum2); - InnerProductStep(pVect1, pVect2, offset, sum3); + InnerProductStep(pVect1, pVect2, offset, sum0, chunk); + InnerProductStep(pVect1, pVect2, offset, sum1, chunk); + InnerProductStep(pVect1, pVect2, offset, sum2, chunk); + InnerProductStep(pVect1, pVect2, offset, sum3, chunk); } // Process remaining complete SVE vectors that didn't fit into the main loop // These are full vector operations (0-3 elements) if constexpr (additional_steps > 0) { if constexpr (additional_steps >= 1) { - InnerProductStep(pVect1, pVect2, offset, sum0); + InnerProductStep(pVect1, pVect2, offset, sum0, chunk); } if constexpr (additional_steps >= 2) { - InnerProductStep(pVect1, pVect2, offset, sum1); + InnerProductStep(pVect1, pVect2, offset, sum1, chunk); } if constexpr (additional_steps >= 3) { - InnerProductStep(pVect1, pVect2, offset, sum3); + InnerProductStep(pVect1, pVect2, offset, sum3, chunk); } } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 863ef3652..7b9bd86bc 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -13,7 +13,7 @@ static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, svfloat32_t &sum, const svfloat32_t &min_val_vec, - const svfloat32_t &delta_vec) { + const svfloat32_t &delta_vec, const size_t chunk) { svbool_t pg = svptrue_b32(); // Load float elements from pVect1 @@ -32,7 +32,7 @@ static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2 sum = svmla_f32_x(pg, sum, v1, v2_dequant); // Move to the next set of elements - offset += svcntw(); + offset += chunk; } template @@ -51,7 +51,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz svfloat32_t delta_vec = svdup_f32(delta); // Get the number of 32-bit elements per vector at runtime - uint64_t 
sve_word_count = svcntw(); + uint64_t chunk = svcntw(); // Multiple accumulators to increase instruction-level parallelism svfloat32_t sum0 = svdup_f32(0.0f); @@ -61,7 +61,7 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz // Handle partial chunk if needed if constexpr (partial_chunk) { - size_t remaining = dimension % sve_word_count; + size_t remaining = dimension % chunk; if (remaining > 0) { // Create predicate for the remaining elements svbool_t pg_partial = @@ -90,26 +90,26 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz } // Process 4 chunks at a time in the main loop - auto chunk_size = 4 * sve_word_count; + auto chunk_size = 4 * chunk; const size_t number_of_chunks = - (dimension - (partial_chunk ? dimension % sve_word_count : 0)) / chunk_size; + (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); + InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk); } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); } if constexpr (additional_steps > 1) { - InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); } if 
constexpr (additional_steps > 2) { - InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); + InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); } // Combine the accumulators diff --git a/src/VecSim/spaces/L2/L2_SVE_FP32.h b/src/VecSim/spaces/L2/L2_SVE_FP32.h index a3e96c7a8..8367baa97 100644 --- a/src/VecSim/spaces/L2/L2_SVE_FP32.h +++ b/src/VecSim/spaces/L2/L2_SVE_FP32.h @@ -9,7 +9,8 @@ #include "VecSim/spaces/space_includes.h" #include -static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum) { +static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum, + const size_t chunk) { // Load vectors svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset); svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset); @@ -21,7 +22,7 @@ static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, sum = svmla_f32_z(svptrue_b32(), sum, diff, diff); // Advance pointers by the vector length - offset += svcntw(); + offset += chunk; } template @@ -31,7 +32,7 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens size_t offset = 0; // Get the number of 32-bit elements per vector at runtime - uint64_t sve_word_count = svcntw(); + uint64_t chunk = svcntw(); // Multiple accumulators to increase instruction-level parallelism svfloat32_t sum0 = svdup_f32(0.0f); @@ -40,27 +41,27 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens svfloat32_t sum3 = svdup_f32(0.0f); // Process vectors in chunks, with unrolling for better pipelining - auto chunk_size = 4 * sve_word_count; + auto chunk_size = 4 * chunk; size_t number_of_chunks = dimension / chunk_size; for (size_t i = 0; i < number_of_chunks; ++i) { // Process 4 chunks with separate accumulators - L2SquareStep(pVect1, pVect2, offset, sum0); - L2SquareStep(pVect1, pVect2, offset, sum1); - L2SquareStep(pVect1, pVect2, offset, sum2); - 
L2SquareStep(pVect1, pVect2, offset, sum3); + L2SquareStep(pVect1, pVect2, offset, sum0, chunk); + L2SquareStep(pVect1, pVect2, offset, sum1, chunk); + L2SquareStep(pVect1, pVect2, offset, sum2, chunk); + L2SquareStep(pVect1, pVect2, offset, sum3, chunk); } // Process remaining complete SVE vectors that didn't fit into the main loop // These are full vector operations (0-3 elements) if constexpr (additional_steps > 0) { if constexpr (additional_steps >= 1) { - L2SquareStep(pVect1, pVect2, offset, sum0); + L2SquareStep(pVect1, pVect2, offset, sum0, chunk); } if constexpr (additional_steps >= 2) { - L2SquareStep(pVect1, pVect2, offset, sum1); + L2SquareStep(pVect1, pVect2, offset, sum1, chunk); } if constexpr (additional_steps >= 3) { - L2SquareStep(pVect1, pVect2, offset, sum2); + L2SquareStep(pVect1, pVect2, offset, sum2, chunk); } } diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h index 4ab24b297..756f82522 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h @@ -11,7 +11,7 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, svfloat32_t &sum, const svfloat32_t &min_val_vec, - const svfloat32_t &delta_vec) { + const svfloat32_t &delta_vec, const size_t chunk) { svbool_t pg = svptrue_b32(); // Load float elements from pVect1 @@ -33,7 +33,7 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_ sum = svmla_f32_x(pg, sum, diff, diff); // Move to the next set of elements - offset += svcntw(); + offset += chunk; } template @@ -52,7 +52,7 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi svfloat32_t delta_vec = svdup_f32(delta); // Get the number of 32-bit elements per vector at runtime - uint64_t sve_word_count = svcntw(); + uint64_t chunk = svcntw(); // Multiple accumulators to increase instruction-level parallelism svfloat32_t sum0 = svdup_f32(0.0f); @@ -62,7 +62,7 @@ float SQ8_L2SqrSIMD_SVE(const 
void *pVect1v, const void *pVect2v, size_t dimensi // Handle partial chunk if needed if constexpr (partial_chunk) { - size_t remaining = dimension % sve_word_count; + size_t remaining = dimension % chunk; if (remaining > 0) { // Create predicate for the remaining elements svbool_t pg_partial = @@ -93,24 +93,24 @@ float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimensi } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); } if constexpr (additional_steps > 1) { - L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); } if constexpr (additional_steps > 2) { - L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); } // Process 4 chunks at a time in the main loop - auto chunk_size = 4 * sve_word_count; + auto chunk_size = 4 * chunk; size_t number_of_chunks = dimension / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec); - L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec); - L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec); - L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); + L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); + L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); + L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk); } // Combine the accumulators From 767e1904daae2d07bde95dad9cef780cb90f5809 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 27 May 2025 13:10:30 +0300 Subject: [PATCH 45/52] diff squared --- src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 7 +++---- 
1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h index 2cff76a31..708807f98 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h @@ -31,10 +31,9 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _ // Calculate squared difference - simple and efficient approach __m256 diff = _mm256_sub_ps(v1, v2_dequant); - __m256 diff_squared = _mm256_mul_ps(diff, diff); - - // Add to running sum - sum256 = _mm256_add_ps(sum256, diff_squared); + + // Use FMA for diff² + sum in one instruction + sum256 = _mm256_fmadd_ps(diff, diff, sum256); } template // 0..15 From 44be2751efee926828c421cce5947f8fdda30e01 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 27 May 2025 13:19:41 +0300 Subject: [PATCH 46/52] format --- src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h index 708807f98..dfbbaa9e9 100644 --- a/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h @@ -31,7 +31,7 @@ static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, _ // Calculate squared difference - simple and efficient approach __m256 diff = _mm256_sub_ps(v1, v2_dequant); - + // Use FMA for diff² + sum in one instruction sum256 = _mm256_fmadd_ps(diff, diff, sum256); } From 3a956bfe49f228b8bad2de2f0196c24f9d0dd0dd Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Tue, 27 May 2025 13:55:59 +0300 Subject: [PATCH 47/52] chnage diff --- tests/unit/test_spaces.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index afdf7d01d..cdc5eb33a 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -421,7 +421,7 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { params[2] = 
inv_norm; float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance"; + ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance"; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm @@ -474,7 +474,7 @@ TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { params[2] = inv_norm; float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.01f) << "SQ8_Cosine failed to match expected distance"; + ASSERT_NEAR(dist, 0.0f, 0.00001f) << "SQ8_Cosine failed to match expected distance"; } /* ======================== Test Getters ======================== */ From 5840e3fe6cd7805bb865f56531fa105cf0bd4bee Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 5 Jun 2025 17:31:27 +0300 Subject: [PATCH 48/52] Remove align from tests improve sse4 --- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 39 ++++++++++++++---------------- src/VecSim/spaces/IP_space.cpp | 16 ------------ tests/unit/test_spaces.cpp | 24 ------------------ 3 files changed, 18 insertions(+), 61 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 67ebc4547..1bad27610 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -52,36 +52,33 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, // Handle residual elements (1-3) if constexpr (residual % 4) { __m128 v1; - __m128 v2_dequant = _mm_setzero_ps(); + __m128 v2_dequant; if constexpr (residual % 4 == 3) { - // Load 3 floats and set the last one to 0 - v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 - v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part + // Set 3 floats and the last one to 0 + v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]); - // Dequantize 
first value - float dequant0 = quantized[0] * delta + min; - v2_dequant = _mm_load_ss(&dequant0); - - // Dequantize next two values - float dequant_high[2] = {quantized[1] * delta + min, quantized[2] * delta + min}; - v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high); + // Dequantize and set 3 values + v2_dequant = _mm_set_ps(0.0f, + quantized[2] * delta + min, + quantized[1] * delta + min, + quantized[0] * delta + min); } else if constexpr (residual % 4 == 2) { - // Load 2 floats and set the last two to 0 - v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1); + // Set 2 floats and the last two to 0 + v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]); - // Dequantize two values - float dequant_high[2] = {quantized[0] * delta + min, quantized[1] * delta + min}; - v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high); + // Dequantize and set 2 values + v2_dequant = _mm_set_ps(0.0f, 0.0f, + quantized[1] * delta + min, + quantized[0] * delta + min); } else if constexpr (residual % 4 == 1) { - // Load 1 float and set the last three to 0 - v1 = _mm_load_ss(pVect1); + // Set 1 float and the last three to 0 + v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]); - // Dequantize one value - float dequant0 = quantized[0] * delta + min; - v2_dequant = _mm_load_ss(&dequant0); + // Dequantize and set 1 value + v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min); } pVect1 += residual % 4; diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 1bcd3a304..d24c1d142 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -70,29 +70,21 @@ dist_func_t IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons } #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats return 
Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #ifdef OPT_AVX2_FMA if (features.avx2 && features.fma3) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats return Choose_SQ8_IP_implementation_AVX2_FMA(dim); } #endif #ifdef OPT_AVX2 if (features.avx2) { - if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - *alignment = 8 * sizeof(float); // handles 8 floats return Choose_SQ8_IP_implementation_AVX2(dim); } #endif #ifdef OPT_SSE4 if (features.sse4_1) { - if (dim % 4 == 0) // no point in aligning if we have an offsetting residual - *alignment = 4 * sizeof(float); // handles 4 floats return Choose_SQ8_IP_implementation_SSE4(dim); } #endif @@ -136,29 +128,21 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, } #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #ifdef OPT_AVX2_FMA if (features.avx2 && features.fma3) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats return Choose_SQ8_Cosine_implementation_AVX2_FMA(dim); } #endif #ifdef OPT_AVX2 if (features.avx2) { - if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - *alignment = 8 * sizeof(float); // handles 8 floats return Choose_SQ8_Cosine_implementation_AVX2(dim); } #endif #ifdef OPT_SSE4 if (features.sse4_1) { - if (dim % 4 == 0) // no point in aligning if we have an offsetting residual - *alignment = 4 * sizeof(float); // handles 4 floats return Choose_SQ8_Cosine_implementation_SSE4(dim); } #endif diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index cdc5eb33a..dabe9c794 100644 --- 
a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -2267,7 +2267,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; - // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; optimization.avx512f = 0; } #endif @@ -2279,7 +2278,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; - // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; optimization.fma3 = 0; } #endif @@ -2291,7 +2289,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; - // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX with dim " << dim; optimization.avx2 = 0; } #endif @@ -2303,7 +2300,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SSE with dim " << dim; - // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim; optimization.sse4_1 = 0; } #endif @@ -2315,8 +2311,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE2 with dim " << dim; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; - // Unset sve2 flag as well, so we'll choose the next option (default). 
optimization.sve2 = 0; } #endif @@ -2328,8 +2322,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE with dim " << dim; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; - // Unset sve flag as well, so we'll choose the next option (default). optimization.sve = 0; } #endif @@ -2341,8 +2333,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "NEON with dim " << dim; - ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; - // Unset optimizations flag, so we'll choose the next optimization. optimization.asimd = 0; } #endif @@ -2396,8 +2386,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE2 with dim " << dim; - // We don't align SQ8 vectors with cosine distance - // ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim; optimization.sve2 = 0; } #endif @@ -2409,8 +2397,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SVE with dim " << dim; - // We don't align SQ8 vectors with cosine distance - // ASSERT_EQ(alignment, 0) << "SVE with dim " << dim; optimization.sve = 0; } #endif @@ -2422,8 +2408,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "NEON with dim " << dim; - // We don't align SQ8 vectors with cosine distance - // ASSERT_EQ(alignment, 0) << "NEON with dim " << dim; 
optimization.asimd = 0; } #endif @@ -2437,8 +2421,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX512 with dim " << dim; - // We don't align SQ8 vectors with cosine distance - // ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim; optimization.avx512f = 0; } #endif @@ -2450,8 +2432,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; - // We don't align SQ8 vectors with cosine distance - // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; optimization.fma3 = 0; } #endif @@ -2463,8 +2443,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "AVX with dim " << dim; - // We don't align SQ8 vectors with cosine distance - // ASSERT_EQ(alignment, 0) << "AVX with dim " << dim; optimization.avx2 = 0; } #endif @@ -2477,8 +2455,6 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) << "SSE with dim " << dim; - // We don't align SQ8 vectors with cosine distance - // ASSERT_EQ(alignment, 0) << "SSE with dim " << dim; optimization.sse4_1 = 0; } #endif From 2a89dd8d7d9a2d696fc22198b1cec48d655c4096 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 5 Jun 2025 17:33:12 +0300 Subject: [PATCH 49/52] format --- src/VecSim/spaces/IP/IP_SSE4_SQ8.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 1bad27610..5e47af2b6 100644 --- 
a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -59,19 +59,16 @@ float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]); // Dequantize and set 3 values - v2_dequant = _mm_set_ps(0.0f, - quantized[2] * delta + min, - quantized[1] * delta + min, - quantized[0] * delta + min); + v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min, + quantized[1] * delta + min, quantized[0] * delta + min); } else if constexpr (residual % 4 == 2) { // Set 2 floats and the last two to 0 v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]); // Dequantize and set 2 values - v2_dequant = _mm_set_ps(0.0f, 0.0f, - quantized[1] * delta + min, - quantized[0] * delta + min); + v2_dequant = + _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min, quantized[0] * delta + min); } else if constexpr (residual % 4 == 1) { // Set 1 float and the last three to 0 From e562a864aab30416817f7f04b683164320839f24 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 8 Jun 2025 18:09:21 +0300 Subject: [PATCH 50/52] applied to l2 --- src/VecSim/spaces/L2/L2_SSE4_SQ8.h | 57 ++++++++++++++---------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h index 16b60286b..4b20ef351 100644 --- a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h @@ -35,11 +35,11 @@ static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m12 template // 0..15 float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { const float *pVect1 = static_cast(pVect1v); - const uint8_t *pVect2 = static_cast(pVect2v); + const uint8_t *quantized = static_cast(pVect2v); // Get dequantization parameters from the end of quantized vector - const float min_val = *reinterpret_cast(pVect2 + dimension); - const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + const 
float min_val = *reinterpret_cast(quantized + dimension); + const float delta = *reinterpret_cast(quantized + dimension + sizeof(float)); // Create broadcast vectors for SIMD operations __m128 min_val_vec = _mm_set1_ps(min_val); @@ -54,40 +54,35 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime // Handle residual elements (1-3) if constexpr (residual % 4) { __m128 v1; - __m128 v2_dequant = _mm_setzero_ps(); + __m128 v2_dequant; if constexpr (residual % 4 == 3) { // Load 3 floats and set the last one to 0 - v1 = _mm_load_ss(pVect1); // load 1 float, set the rest to 0 - v1 = _mm_loadh_pi(v1, (__m64 *)(pVect1 + 1)); // load 2 more floats into high part + v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]); - // Dequantize first value - float dequant0 = pVect2[0] * delta + min_val; - v2_dequant = _mm_load_ss(&dequant0); - - // Dequantize next two values - float dequant_high[2] = {pVect2[1] * delta + min_val, pVect2[2] * delta + min_val}; - v2_dequant = _mm_loadh_pi(v2_dequant, (__m64 *)dequant_high); + // Dequantize and set 3 values + v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min_val, + quantized[1] * delta + min_val, quantized[0] * delta + min_val); } else if constexpr (residual % 4 == 2) { - // Load 2 floats and set the last two to 0 - v1 = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)pVect1); + // Set 2 floats and the last two to 0 + v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]); - // Dequantize two values - float dequant_high[2] = {pVect2[0] * delta + min_val, pVect2[1] * delta + min_val}; - v2_dequant = _mm_loadh_pi(_mm_setzero_ps(), (__m64 *)dequant_high); + // Dequantize and set 2 valuesAdd commentMore actions + v2_dequant = _mm_set_ps(0.0f, 0.0f, + quantized[1] * delta + min_val, + quantized[0] * delta + min_val); } else if constexpr (residual % 4 == 1) { - // Load 1 float and set the last three to 0 - v1 = _mm_load_ss(pVect1); + // Set 1 float and the last three to 0Add commentMore actions + v1 = _mm_set_ps(0.0f, 
0.0f, 0.0f, pVect1[0]); - // Dequantize one value - float dequant0 = pVect2[0] * delta + min_val; - v2_dequant = _mm_load_ss(&dequant0); + // Dequantize and set 1 value + v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min_val); } pVect1 += residual % 4; - pVect2 += residual % 4; + quantized += residual % 4; // Compute difference __m128 diff = _mm_sub_ps(v1, v2_dequant); @@ -98,19 +93,19 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime // Process remaining blocks of 4 elements based on residual if constexpr (residual >= 12) - L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec); if constexpr (residual >= 8) - L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec); if constexpr (residual >= 4) - L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec); } // Process 16 elements at a time (4 elements per step, 4 steps) while (pVect1 < pEnd1) { - L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); - L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); - L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); - L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec); + L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec); } // TmpRes must be 16 bytes aligned From 2a0b4e642424612d0ab74af0192e05209a398570 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 8 Jun 2025 18:28:38 +0300 Subject: [PATCH 51/52] format --- src/VecSim/spaces/L2/L2_SSE4_SQ8.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h index 4b20ef351..cd36a4e91 100644 --- 
a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h +++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h @@ -61,17 +61,17 @@ float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dime v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]); // Dequantize and set 3 values - v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min_val, - quantized[1] * delta + min_val, quantized[0] * delta + min_val); + v2_dequant = + _mm_set_ps(0.0f, quantized[2] * delta + min_val, quantized[1] * delta + min_val, + quantized[0] * delta + min_val); } else if constexpr (residual % 4 == 2) { // Set 2 floats and the last two to 0 v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]); // Dequantize and set 2 valuesAdd commentMore actions - v2_dequant = _mm_set_ps(0.0f, 0.0f, - quantized[1] * delta + min_val, - quantized[0] * delta + min_val); + v2_dequant = _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min_val, + quantized[0] * delta + min_val); } else if constexpr (residual % 4 == 1) { // Set 1 float and the last three to 0Add commentMore actions From ab18690bc1426bf77fade3f0148af4831308fc75 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Sun, 8 Jun 2025 19:18:40 +0300 Subject: [PATCH 52/52] Remove alignment l2 --- src/VecSim/spaces/L2_space.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 81f0df91d..ed920927d 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -70,29 +70,21 @@ dist_func_t L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, cons } #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vnni) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual - *alignment = 16 * sizeof(float); // handles 16 floats return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #ifdef OPT_AVX2_FMA if (features.avx2 && features.fma3) { - if (dim % 16 == 0) // no point in aligning if we have an offsetting residual 
- *alignment = 16 * sizeof(float); // handles 16 floats return Choose_SQ8_L2_implementation_AVX2_FMA(dim); } #endif #ifdef OPT_AVX2 if (features.avx2) { - if (dim % 8 == 0) // no point in aligning if we have an offsetting residual - *alignment = 8 * sizeof(float); // handles 8 floats return Choose_SQ8_L2_implementation_AVX2(dim); } #endif #ifdef OPT_SSE4 if (features.sse4_1) { - if (dim % 4 == 0) // no point in aligning if we have an offsetting residual - *alignment = 4 * sizeof(float); // handles 4 floats return Choose_SQ8_L2_implementation_SSE4(dim); } #endif