diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake
index 4976d60e6..dadd550a8 100644
--- a/cmake/x86_64InstructionFlags.cmake
+++ b/cmake/x86_64InstructionFlags.cmake
@@ -17,6 +17,7 @@ CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
 CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
 CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C)
 CHECK_CXX_COMPILER_FLAG(-mfma CXX_FMA)
+CHECK_CXX_COMPILER_FLAG(-msse4.1 CXX_SSE4)
 CHECK_CXX_COMPILER_FLAG(-msse3 CXX_SSE3)
 CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)
@@ -60,10 +61,18 @@ if(CXX_AVX2)
   add_compile_definitions(OPT_AVX2)
 endif()
 
+if(CXX_AVX2 AND CXX_FMA)
+  add_compile_definitions(OPT_AVX2_FMA)
+endif()
+
 if(CXX_AVX)
   add_compile_definitions(OPT_AVX)
 endif()
 
+if(CXX_SSE4)
+  add_compile_definitions(OPT_SSE4)
+endif()
+
 if(CXX_SSE3)
   add_compile_definitions(OPT_SSE3)
 endif()
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index 7ee0a5844..d88750e91 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -56,6 +56,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
         list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
     endif()
 
+    if(CXX_AVX2 AND CXX_FMA)
+        message("Building with AVX2 and FMA")
+        set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
+        list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
+    endif()
+
     if(CXX_F16C AND CXX_FMA AND CXX_AVX)
         message("Building with CXX_F16C")
         set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
@@ -74,6 +80,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
         list(APPEND OPTIMIZATIONS functions/SSE3.cpp)
     endif()
 
+    if(CXX_SSE4)
+        message("Building with SSE4")
+        set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
+        list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
+    endif()
+
     if(CXX_SSE)
         message("Building with SSE")
         set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index 3b9253a21..5e2c4b4dc 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -14,6 +14,43 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
+float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
+                                 float min_val, float delta, float inv_norm) {
+    float res = 0;
+    for (size_t i = 0; i < dimension; i++) {
+        float dequantized_V2 = (pVect2v[i] * delta + min_val);
+        res += pVect1v[i] * dequantized_V2;
+    }
+    return res * inv_norm;
+}
+
+float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // pVect2 is a quantized uint8_t vector that must be dequantized before multiplying.
+    // It is structured as [quantized values (uint8_t * dim)][min_val (float)][delta (float)]
+    // [inv_norm (float)]. min_val and delta are used to dequantize the values; inv_norm is
+    // only needed for cosine.
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Compute inner product with dequantization
+    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, 1.0f);
+    return 1.0f - res;
+}
+
+float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+    // Compute inner product with dequantization
+    const float res =
+        FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
+    return 1.0f - res;
+}
+
 float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) {
     auto *vec1 = (float *)pVect1;
     auto *vec2 = (float *)pVect2;
diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h
index e38a1c22f..d4796cbd6 100644
--- a/src/VecSim/spaces/IP/IP.h
+++ b/src/VecSim/spaces/IP/IP.h
@@ -10,6 +10,12 @@
 #include
 
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
+float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);
+
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
+float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);
+
 float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
 
 double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
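For reviewers: every kernel in this PR assumes the same SQ8 blob layout: `dim` quantized bytes followed by `min_val`, `delta`, and `inv_norm` packed as floats. A minimal sketch of how such a blob could be produced (the helper name `quantize_sq8` and the round-to-nearest policy are illustrative assumptions, not code from this diff):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical helper: quantize a float vector into the SQ8 layout described above.
std::vector<uint8_t> quantize_sq8(const float *v, size_t dim) {
    const float min_val = *std::min_element(v, v + dim);
    const float max_val = *std::max_element(v, v + dim);
    float delta = (max_val - min_val) / 255.0f;
    if (delta == 0.0f)
        delta = 1.0f; // constant vector; any delta reconstructs min_val exactly
    float sq_norm = 0.0f;
    std::vector<uint8_t> blob(dim + 3 * sizeof(float));
    for (size_t i = 0; i < dim; i++) {
        blob[i] = static_cast<uint8_t>(std::lround((v[i] - min_val) / delta));
        sq_norm += v[i] * v[i];
    }
    const float inv_norm = 1.0f / std::sqrt(sq_norm);
    // Append min_val, delta and inv_norm after the quantized bytes.
    std::memcpy(blob.data() + dim, &min_val, sizeof(float));
    std::memcpy(blob.data() + dim + sizeof(float), &delta, sizeof(float));
    std::memcpy(blob.data() + dim + 2 * sizeof(float), &inv_norm, sizeof(float));
    return blob;
}
```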
+ */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, + __m256 &sum256, const __m256 &min_val_vec, + const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize and compute dot product in one step using FMA + // (val * delta) + min_val -> v2_dequant + // sum256 += v1 * v2_dequant + // Using FMA: sum256 = v1 * v2_dequant + sum256 + + // First, compute v2_dequant = v2_f * delta_vec + min_val_vec + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Then, compute sum256 += v1 * v2_dequant using FMA + sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256); +} + +template // 0..15 +float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + // pVect2 is a quantized uint8_t vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min_val); + __m256 delta_vec = _mm256_set1_ps(delta); + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one + // 16-float block, so mask loading is guaranteed to be safe. + if constexpr (residual % 8) { + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load quantized values and dequantize + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += residual % 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize using FMA: (val * delta) + min_val + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product with masking + sum256 = _mm256_mul_ps(v1, v2_dequant); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. 
+    do {
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum256);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Get dequantization parameters from the end of quantized vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Calculate inner product using the common implementation
+    float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - ip * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
new file mode 100644
index 000000000..89b1c0b6b
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
+                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // Load 8 float elements from pVect1
+    __m256 v1 = _mm256_loadu_ps(pVect1);
+    pVect1 += 8;
+
+    // Load 8 uint8 elements from pVect2, convert to int32, then to float
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    pVect2 += 8;
+
+    // Zero-extend uint8 to int32
+    __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+    // Dequantize: (val * delta) + min_val
+    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute dot product and add to sum
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    // pVect2 is a quantized uint8_t vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Create broadcast vectors for SIMD operations
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
+    __m256 delta_vec = _mm256_set1_ps(delta);
+
+    __m256 sum256 = _mm256_setzero_ps();
+
+    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
+    // 16-float block, so mask loading is guaranteed to be safe.
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += residual % 8;
+
+        // Load quantized values and dequantize
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        pVect2 += residual % 8;
+
+        // Zero-extend uint8 to int32
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+        // Dequantize: (val * delta) + min_val
+        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+        // Compute dot product; the masked load already zeroed the unused lanes of v1
+        sum256 = _mm256_mul_ps(v1, v2_dequant);
+    }
+
+    // If the remainder is >=8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 16 floats.
+    // In each iteration we calculate 16 floats = 512 bits.
+    do {
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum256);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Get dequantization parameters from the end of quantized vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Calculate inner product using the common implementation
+    float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - ip * inv_norm;
+}
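The AVX2 kernels above hinge on one conversion chain: load 8 bytes, zero-extend to 32-bit lanes, convert to float, dequantize. A standalone sketch of that chain checked against scalar math (compile with -mavx2; values are arbitrary and only for illustration):

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t q[8] = {0, 1, 2, 3, 252, 253, 254, 255};
    const float min_val = -1.0f, delta = 2.0f / 255.0f;
    // Same chain as InnerProductStepSQ8: 8 bytes -> epi32 -> ps -> dequantize.
    __m128i v2_128 = _mm_loadl_epi64((const __m128i *)q);
    __m256 v2_f = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(v2_128));
    __m256 deq =
        _mm256_add_ps(_mm256_mul_ps(v2_f, _mm256_set1_ps(delta)), _mm256_set1_ps(min_val));
    float out[8];
    _mm256_storeu_ps(out, deq);
    for (int i = 0; i < 8; i++)
        printf("simd=%f scalar=%f\n", out[i], q[i] * delta + min_val);
    return 0;
}
```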
+ */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include +#include + +static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec) { + // Load 16 float elements from pVec1 + __m512 v1 = _mm512_loadu_ps(pVec1); + + // Load 16 uint8 elements from pVec2 and convert to __m512i + __m128i v2_128 = _mm_loadu_si128((__m128i *)pVec2); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + + // Convert uint8 to float + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize: (val * delta) + min_val + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product and add to sum + sum = _mm512_fmadd_ps(v1, dequantized, sum); + + // Advance pointers + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for both inner product and cosine similarity +template // 0..15 +float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, + float inv_norm = 1.0f) { + const float *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const float *pEnd1 = pVec1 + dimension; + + // Get dequantization parameters from the end of pVec2 + const float min_val = *reinterpret_cast(pVec2 + dimension); + const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m512 min_val_vec = _mm512_set1_ps(min_val); + __m512 delta_vec = _mm512_set1_ps(delta); + + // Initialize sum accumulator + __m512 sum = _mm512_setzero_ps(); + + // Deal with remainder first + if constexpr (residual > 0) { + // Handle less than 16 elements + __mmask16 mask = (1U << residual) - 1; + + // Load masked float elements + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); + + // Load full uint8 elements - we know that the first 16 elements are safe to load + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product + __m512 product = _mm512_mul_ps(v1, dequantized); + + // Apply mask to product and add to sum + sum = _mm512_fmadd_ps(sum, sum, product); + + pVec1 += residual; + pVec2 += residual; + } + + // Process remaining full chunks of 16 elements + do { + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } while (pVec1 < pEnd1); + + // Return the raw inner product result + return _mm512_reduce_add_ps(sum); +} + +template // 0..15 +float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Calculate inner product using common implementation + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + + // The inner product similarity is 1 - ip + return 1.0f - ip; +} + +template // 0..15 +float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Get the inverse norm factor stored after min_val and delta + const uint8_t *pVec2 = static_cast(pVec2v); + const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); + + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension, inv_norm); + + // The cosine similarity is 1 - ip + return 1.0f - ip; +} diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h new file mode 100644 index 000000000..3e632dcdb --- 
diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
new file mode 100644
index 000000000..3e632dcdb
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum,
+                                    const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
+    // Load 4 float elements from pVect1
+    float32x4_t v1 = vld1q_f32(pVect1);
+    pVect1 += 4;
+
+    // Load 8 uint8 elements from pVect2 (only the low 4 are consumed below)
+    uint8x8_t v2_u8 = vld1_u8(pVect2);
+    pVect2 += 4;
+
+    // Convert uint8 to uint32
+    uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8)));
+
+    // Convert uint32 to float32
+    float32x4_t v2_f = vcvtq_f32_u32(v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec);
+
+    // Compute dot product and add to sum
+    sum = vmlaq_f32(sum, v1, v2_dequant);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    float32x4_t min_val_vec = vdupq_n_f32(min_val);
+    float32x4_t delta_vec = vdupq_n_f32(delta);
+
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    const size_t num_of_chunks = dimension / 16;
+
+    // Process 16 elements at a time in the main loop
+    for (size_t i = 0; i < num_of_chunks; i++) {
+        InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum3, min_val_vec, delta_vec);
+    }
+
+    // Handle remaining complete 4-float blocks within residual
+    if constexpr (residual >= 4) {
+        InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 8) {
+        InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 12) {
+        InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+    }
+
+    // Handle final residual elements (0-3 elements)
+    constexpr size_t final_residual = residual % 4;
+    if constexpr (final_residual > 0) {
+        float32x4_t v1 = vdupq_n_f32(0.0f);
+        float32x4_t v2_dequant = vdupq_n_f32(0.0f);
+
+        if constexpr (final_residual >= 1) {
+            v1 = vld1q_lane_f32(pVect1, v1, 0);
+            float dequant0 = pVect2[0] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0);
+        }
+        if constexpr (final_residual >= 2) {
+            v1 = vld1q_lane_f32(pVect1 + 1, v1, 1);
+            float dequant1 = pVect2[1] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1);
+        }
+        if constexpr (final_residual >= 3) {
+            v1 = vld1q_lane_f32(pVect1 + 2, v1, 2);
+            float dequant2 = pVect2[2] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2);
+        }
+
+        sum3 = vmlaq_f32(sum3, v1, v2_dequant);
+    }
+
+    // Combine all four sum accumulators
+    float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3));
+
+    // Horizontal sum of the 4 elements in the combined NEON register
+    float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined));
+    float32x2_t summed = vpadd_f32(sum_halves, sum_halves);
+    float sum = vget_lane_f32(summed, 0);
+
+    return sum;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD16_NEON_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Compute inner product with dequantization using the common function
+    const float res = SQ8_InnerProductSIMD16_NEON_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - res * inv_norm;
+}
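Same idea on the NEON path: `vld1_u8` pulls 8 bytes even though each step only consumes 4 lanes, which is why the kernel may over-read by up to 4 bytes (safe here because the parameter floats follow the quantized data). A standalone sketch of the widening chain (AArch64 only; values are arbitrary):

```cpp
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t q[8] = {10, 20, 30, 40, 0, 0, 0, 0};
    const float min_val = -1.0f, delta = 0.5f;
    // 8 bytes loaded, low 4 widened: u8 -> u16 -> u32 -> f32, then dequantize.
    uint8x8_t v_u8 = vld1_u8(q);
    uint32x4_t v_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v_u8)));
    float32x4_t v_f = vcvtq_f32_u32(v_u32);
    float32x4_t deq = vmlaq_f32(vdupq_n_f32(min_val), v_f, vdupq_n_f32(delta));
    float out[4];
    vst1q_f32(out, deq);
    for (int i = 0; i < 4; i++)
        printf("simd=%f scalar=%f\n", out[i], q[i] * delta + min_val);
    return 0;
}
```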
diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
new file mode 100644
index 000000000..5e47af2b6
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+#include
+
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod,
+                                    const __m128 &min_val_vec, const __m128 &delta_vec) {
+    // Load 4 float elements from pVect1
+    __m128 v1 = _mm_loadu_ps(pVect1);
+    pVect1 += 4;
+
+    // Load 4 uint8 elements from pVect2, convert to int32, then to float
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2));
+    pVect2 += 4;
+
+    // Convert int32 to float
+    __m128 v2_f = _mm_cvtepi32_ps(v2_i);
+
+    // Dequantize: (val * delta) + min_val
+    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute dot product and add to sum
+    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant));
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *quantized = static_cast<const uint8_t *>(pVect2v);
+
+    // Get dequantization parameters from the end of quantized vector
+    float min = *(float *)(quantized + dimension);
+    float delta = *(float *)(quantized + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m128 min_val_vec = _mm_set1_ps(min);
+    __m128 delta_vec = _mm_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m128 sum = _mm_setzero_ps();
+
+    // Process residual elements if needed
+    if constexpr (residual) {
+        // Handle residual elements (1-3)
+        if constexpr (residual % 4) {
+            __m128 v1;
+            __m128 v2_dequant;
+
+            if constexpr (residual % 4 == 3) {
+                // Set 3 floats and the last one to 0
+                v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
+
+                // Dequantize and set 3 values
+                v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min,
+                                        quantized[1] * delta + min, quantized[0] * delta + min);
+
+            } else if constexpr (residual % 4 == 2) {
+                // Set 2 floats and the last two to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
+
+                // Dequantize and set 2 values
+                v2_dequant =
+                    _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min, quantized[0] * delta + min);
+
+            } else if constexpr (residual % 4 == 1) {
+                // Set 1 float and the last three to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]);
+
+                // Dequantize and set 1 value
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min);
+            }
+
+            pVect1 += residual % 4;
+            quantized += residual % 4;
+            sum = _mm_mul_ps(v1, v2_dequant);
+        }
+    }
+
+    // Process 4 elements at a time
+    while (pVect1 < pEnd1) {
+        InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+    }
+
+    // TmpRes must be 16 bytes aligned.
+    float PORTABLE_ALIGN16 TmpRes[4];
+    _mm_store_ps(TmpRes, sum);
+    float result = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+
+    return result;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD16_SSE4_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // Get quantization parameters
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Compute inner product with dequantization using the common function
+    const float res = SQ8_InnerProductSIMD16_SSE4_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - res * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE_FP32.h b/src/VecSim/spaces/IP/IP_SVE_FP32.h
index c60acb16a..c1cc79ccd 100644
--- a/src/VecSim/spaces/IP/IP_SVE_FP32.h
+++ b/src/VecSim/spaces/IP/IP_SVE_FP32.h
@@ -11,13 +11,13 @@
 #include
 
 static inline void InnerProductStep(float *&pVect1, float *&pVect2, size_t &offset,
-                                    svfloat32_t &sum) {
+                                    svfloat32_t &sum, const size_t chunk) {
     svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset);
     svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset);
 
     sum = svmla_f32_x(svptrue_b32(), sum, v1, v2);
 
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -26,33 +26,33 @@ float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t
     float *pVect2 = (float *)pVect2v;
     size_t offset = 0;
 
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     svfloat32_t sum0 = svdup_f32(0.0f);
     svfloat32_t sum1 = svdup_f32(0.0f);
     svfloat32_t sum2 = svdup_f32(0.0f);
     svfloat32_t sum3 = svdup_f32(0.0f);
 
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     const size_t number_of_chunks = dimension / chunk_size;
 
     for (size_t i = 0; i < number_of_chunks; i++) {
-        InnerProductStep(pVect1, pVect2, offset, sum0);
-        InnerProductStep(pVect1, pVect2, offset, sum1);
-        InnerProductStep(pVect1, pVect2, offset, sum2);
-        InnerProductStep(pVect1, pVect2, offset, sum3);
+        InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum2, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
     }
 
     // Process remaining complete SVE vectors that didn't fit into the main loop
     // These are full vector operations (0-3 elements)
     if constexpr (additional_steps > 0) {
         if constexpr (additional_steps >= 1) {
-            InnerProductStep(pVect1, pVect2, offset, sum0);
+            InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
         }
         if constexpr (additional_steps >= 2) {
-            InnerProductStep(pVect1, pVect2, offset, sum1);
+            InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
         }
         if constexpr (additional_steps >= 3) {
-            InnerProductStep(pVect1, pVect2, offset, sum3);
+            InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
         }
     }
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
new file mode 100644
index 000000000..7b9bd86bc
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+#include
+#include
+
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
+                                    svfloat32_t &sum, const svfloat32_t &min_val_vec,
+                                    const svfloat32_t &delta_vec, const size_t chunk) {
+    svbool_t pg = svptrue_b32();
+
+    // Load float elements from pVect1
+    svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
+
+    // LD1UB: load 8-bit elements and zero-extend each into a 32-bit lane
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
+
+    // Convert uint32 to float32
+    svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec);
+
+    // Compute dot product and add to sum
+    sum = svmla_f32_x(pg, sum, v1, v2_dequant);
+
+    // Move to the next set of elements
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    size_t offset = 0;
+
+    // Get dequantization parameters from the end of quantized vector
+    float min = *(float *)(pVect2 + dimension);
+    float delta = *(float *)(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    svbool_t pg = svptrue_b32();
+    svfloat32_t min_val_vec = svdup_f32(min);
+    svfloat32_t delta_vec = svdup_f32(delta);
+
+    // Get the number of 32-bit elements per vector at runtime
+    uint64_t chunk = svcntw();
+
+    // Multiple accumulators to increase instruction-level parallelism
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Handle partial chunk if needed
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            // Create predicate for the remaining elements
+            svbool_t pg_partial =
+                svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
+
+            // Load float elements from pVect1 with predicate
+            svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
+
+            // Load 8-bit bytes from pVect2 + offset and zero-extend each into a 32-bit lane
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);
+
+            // Convert uint32 to float32
+            svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
+
+            // Dequantize: (val * delta) + min_val
+            svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec);
+
+            // Compute dot product and add to sum
+            sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant);
+
+            // Move pointers past the partial chunk
+            offset += remaining;
+        }
+    }
+
+    // Process 4 chunks at a time in the main loop
+    auto chunk_size = 4 * chunk;
+    const size_t number_of_chunks =
+        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
+
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk);
+    }
+
+    // Handle remaining steps (0-3)
+    if constexpr (additional_steps > 0) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 1) {
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 2) {
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+    }
+
+    // Combine the accumulators
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
+
+    // Horizontal sum of all elements in the vector
+    float result = svaddv_f32(pg, sum);
+
+    return result;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v,
+                                                                                dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Compute inner product with dequantization using the common function
+    const float res =
+        SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - res * inv_norm;
+}
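Usage-wise, the new getters below mirror the existing FP32 ones: resolve a `dist_func_t<float>` once for a given dimension, then call it with the raw query and the stored SQ8 blob. A caller-side sketch (not part of this diff; assumes `dist_func_t` is visible via `spaces.h` as for the other getters):

```cpp
#include "VecSim/spaces/IP_space.h"
#include <cstdint>

float sq8_ip_distance(const float *query, const uint8_t *sq8_blob, size_t dim) {
    // Picks the best implementation for the current CPU (SVE2/SVE/NEON on ARM,
    // AVX512/AVX2+FMA/AVX2/SSE4 on x86), falling back to the scalar kernel.
    spaces::dist_func_t<float> dist = spaces::IP_SQ8_GetDistFunc(dim);
    return dist(query, sq8_blob, dim);
}
```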
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index ef8b6a7c5..d24c1d142 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -20,7 +20,9 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
 #include "VecSim/spaces/functions/NEON_HP.h"
@@ -33,6 +35,121 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
 namespace spaces {
+dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_InnerProduct;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_IP_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_IP_implementation_NEON(dim);
+    }
+#endif
+
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        return Choose_SQ8_IP_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2) {
+        return Choose_SQ8_IP_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1) {
+        return Choose_SQ8_IP_implementation_SSE4(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
+dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
+                                          const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_Cosine;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_Cosine_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_Cosine_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_Cosine_implementation_NEON(dim);
+    }
+#endif
+
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        return Choose_SQ8_Cosine_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2) {
+        return Choose_SQ8_Cosine_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1) {
+        return Choose_SQ8_Cosine_implementation_SSE4(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
     if (alignment == nullptr) {
diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h
index b976e9580..db2d0b2d9 100644
--- a/src/VecSim/spaces/IP_space.h
+++ b/src/VecSim/spaces/IP_space.h
@@ -10,6 +10,9 @@
 #include "VecSim/spaces/spaces.h"
 
 namespace spaces {
+dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                      const void *arch_opt = nullptr);
+
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                        const void *arch_opt = nullptr);
 dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                         const void *arch_opt = nullptr);
@@ -26,4 +29,6 @@ dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                         const void *arch_opt = nullptr);
 dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                             const void *arch_opt = nullptr);
+dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                          const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 47ad551df..a68ea5114 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -10,10 +10,29 @@
 #include "VecSim/types/bfloat16.h"
 #include "VecSim/types/float16.h"
 #include
+#include
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
+float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // pVect2 is a quantized uint8_t vector that must be dequantized before computing the
+    // squared differences. It is structured as [quantized values (uint8_t * dim)][min_val
+    // (float)][delta (float)][inv_norm (float)]; min_val and delta are used to dequantize
+    // the values (inv_norm is not needed for L2).
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    float res = 0;
+    for (size_t i = 0; i < dimension; i++) {
+        auto dequantized_V2 = (pVect2[i] * delta + min_val);
+        float t = pVect1[i] - dequantized_V2;
+        res += t * t;
+    }
+    return res;
+}
+
 float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *vec1 = (float *)pVect1v;
     float *vec2 = (float *)pVect2v;
diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h
index 9b796f943..ba3e64207 100644
--- a/src/VecSim/spaces/L2/L2.h
+++ b/src/VecSim/spaces/L2/L2.h
@@ -10,6 +10,9 @@
 #include
 
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
+float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
+
 float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
 
 double FP64_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
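A quick way to sanity-check the scalar kernel: build a tiny SQ8 blob by hand and compare `SQ8_L2Sqr` against the exact FP32 distance; the two should agree to within quantization error (roughly `delta` per component). Illustrative harness, not part of this diff:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar reference from L2.cpp above, declared here for the sketch.
float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);

int main() {
    const size_t dim = 4;
    float query[dim] = {0.1f, 0.2f, 0.3f, 0.4f};
    float stored[dim] = {0.4f, 0.3f, 0.2f, 0.1f};
    // Quantize `stored` into the SQ8 layout: bytes, then min_val, delta, inv_norm.
    float min_val = 0.1f, max_val = 0.4f, delta = (max_val - min_val) / 255.0f;
    uint8_t blob[dim + 3 * sizeof(float)];
    float sq_norm = 0;
    for (size_t i = 0; i < dim; i++) {
        blob[i] = (uint8_t)std::lround((stored[i] - min_val) / delta);
        sq_norm += stored[i] * stored[i];
    }
    float inv_norm = 1.0f / std::sqrt(sq_norm);
    std::memcpy(blob + dim, &min_val, sizeof(float));
    std::memcpy(blob + dim + sizeof(float), &delta, sizeof(float));
    std::memcpy(blob + dim + 2 * sizeof(float), &inv_norm, sizeof(float));

    float exact = 0;
    for (size_t i = 0; i < dim; i++) {
        float t = query[i] - stored[i];
        exact += t * t;
    }
    printf("sq8=%f exact=%f\n", SQ8_L2Sqr(query, blob, dim), exact);
    return 0;
}
```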
+ */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, + const __m256 &min_val_vec, const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: v2_dequant = v2_f * delta_vec + min_val_vec + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Calculate squared difference - simple and efficient approach + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + + // Use FMA for diff² + sum in one instruction + sum256 = _mm256_fmadd_ps(diff, diff, sum256); +} + +template // 0..15 +float SQ8_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + // pVect2 is a quantized uint8_t vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min_val); + __m256 delta_vec = _mm256_set1_ps(delta); + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one + // 16-float block, so mask loading is guaranteed to be safe. + if constexpr (residual % 8) { + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load quantized values and dequantize + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += residual % 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize using FMA: (val * delta) + min_val + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); + + // Calculate squared difference + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + sum256 = _mm256_mul_ps(diff, diff); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. + do { + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } while (pVect1 < pEnd1); + + return my_mm256_reduce_add_ps(sum256); +} diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h new file mode 100644 index 000000000..bdde99e62 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. 
diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
new file mode 100644
index 000000000..bdde99e62
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum,
+                             const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // Load 8 float elements from pVect1
+    __m256 v1 = _mm256_loadu_ps(pVect1);
+
+    // Load 8 uint8 elements from pVect2
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+
+    // Zero-extend uint8 to int32
+    __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+    // Dequantize: (val * delta) + min_val
+    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute difference
+    __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+
+    // Square difference and add to sum
+    sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
+
+    // Advance pointers
+    pVect1 += 8;
+    pVect2 += 8;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Create broadcast vectors for SIMD operations
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
+    __m256 delta_vec = _mm256_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m256 sum = _mm256_setzero_ps();
+
+    // Deal with 1-7 floats with mask loading, if needed
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += residual % 8;
+
+        // Direct load - safe because we only process the masked elements
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        pVect2 += residual % 8;
+
+        // Zero-extend uint8 to int32
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+        // Dequantize: (val * delta) + min_val
+        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+        // Apply mask to zero out unused elements
+        v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
+
+        __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+        sum = _mm256_mul_ps(diff, diff);
+    }
+
+    // If the remainder is >= 8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 16 floats.
+    // In each iteration we calculate 16 floats = 512 bits.
+    do {
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum);
+}
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
new file mode 100644
index 000000000..d2775f5be
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+
+// Helper function to perform L2 squared distance calculation for a chunk of 16 elements
+static inline void SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
+                                 const __m512 &min_val_vec, const __m512 &delta_vec) {
+    // Load 16 float elements from pVect1
+    __m512 v1 = _mm512_loadu_ps(pVect1);
+
+    // Load 16 uint8 elements from pVect2 and convert to __m512i
+    __m128i v2_128 = _mm_loadu_si128((__m128i *)pVect2);
+    __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+    // Dequantize: (val * delta) + min_val
+    __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+    // Compute difference
+    __m512 diff = _mm512_sub_ps(v1, dequantized);
+
+    // Square difference and add to sum
+    sum = _mm512_fmadd_ps(diff, diff, sum);
+
+    // Advance pointers
+    pVect1 += 16;
+    pVect2 += 16;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                         size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
+
+    // Get dequantization parameters from the end of pVect2
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m512 min_val_vec = _mm512_set1_ps(min_val);
+    __m512 delta_vec = _mm512_set1_ps(delta);
+
+    // Initialize sum accumulator
+    __m512 sum = _mm512_setzero_ps();
+
+    // Handle residual elements (0 to 15)
+    if constexpr (residual > 0) {
+        // Create mask for residual elements
+        __mmask16 mask = (1U << residual) - 1;
+
+        // Load masked float elements from pVect1
+        __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1);
+
+        // Load masked uint8 elements from pVect2
+        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i *>(pVect2));
+        __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+        __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+        // Dequantize: (val * delta) + min_val
+        __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+        // Compute difference
+        __m512 diff = _mm512_sub_ps(v1, dequantized);
+
+        // Square difference and add to sum (with mask)
+        __m512 squared = _mm512_mul_ps(diff, diff);
+        sum = _mm512_mask_add_ps(sum, mask, sum, squared);
+
+        // Advance pointers
+        pVect1 += residual;
+        pVect2 += residual;
+    }
+
+    // Process remaining full chunks of 16 elements
+    do {
+        SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    // Horizontal sum
+    float result = _mm512_reduce_add_ps(sum);
+
+    return result;
+}
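The AVX512 variants lean on k-masks instead of the set_ps/blend games the narrower ISAs need: a single mask limits both the float load and the accumulation to the residual lanes. A minimal standalone sketch of that pattern (compile with -mavx512f -mavx512vl -mavx512bw; values are arbitrary):

```cpp
#include <immintrin.h>
#include <cstdio>

int main() {
    float v[16] = {1, 2, 3, 4, 5}; // only 5 "residual" elements are meaningful
    const unsigned residual = 5;
    __mmask16 mask = (1U << residual) - 1;
    __m512 x = _mm512_maskz_loadu_ps(mask, v);   // inactive lanes are zeroed
    __m512 sq = _mm512_maskz_mul_ps(mask, x, x); // masked squares
    printf("sum of squares = %f\n", _mm512_reduce_add_ps(sq)); // 1+4+9+16+25 = 55
    return 0;
}
```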
+ */ +#include "VecSim/spaces/space_includes.h" +#include + +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, + const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { + // Load 4 float elements from pVect1 + float32x4_t v1 = vld1q_f32(pVect1); + pVect1 += 4; + + // Load 4 uint8 elements from pVect2 + uint8x8_t v2_u8 = vld1_u8(pVect2); + pVect2 += 4; + + // Convert uint8 to uint32 + uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); + + // Convert uint32 to float32 + float32x4_t v2_f = vcvtq_f32_u32(v2_u32); + + // Dequantize: (val * delta) + min_val + float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec); + + // Compute difference + float32x4_t diff = vsubq_f32(v1, v2_dequant); + + // Square difference and add to sum + sum = vmlaq_f32(sum, diff, diff); +} + +template // 0..15 +float SQ8_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + float32x4_t min_val_vec = vdupq_n_f32(min_val); + float32x4_t delta_vec = vdupq_n_f32(delta); + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + + // Process 16 elements at a time in the main loop + for (size_t i = 0; i < num_of_chunks; i++) { + L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum3, min_val_vec, delta_vec); + } + + // Handle remaining complete 4-float blocks within residual + if constexpr (residual >= 4) { + L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + } + if constexpr (residual >= 8) { + L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + } + if constexpr (residual >= 12) { + L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + } + + // Handle final residual elements (0-3 elements) + constexpr size_t final_residual = residual % 4; + if constexpr (final_residual > 0) { + float32x4_t v1 = vdupq_n_f32(0.0f); + float32x4_t v2_dequant = vdupq_n_f32(0.0f); + + if constexpr (final_residual >= 1) { + v1 = vld1q_lane_f32(pVect1, v1, 0); + float dequant0 = pVect2[0] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0); + } + if constexpr (final_residual >= 2) { + v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); + float dequant1 = pVect2[1] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1); + } + if constexpr (final_residual >= 3) { + v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); + float dequant2 = pVect2[2] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2); + } + + float32x4_t diff = vsubq_f32(v1, v2_dequant); + sum3 = vmlaq_f32(sum3, diff, diff); + } + + // Combine all four sum accumulators + float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); + + // Horizontal sum of the 4 elements in the combined NEON register + float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); + float32x2_t summed = vpadd_f32(sum_halves, sum_halves); + float sum 
diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
new file mode 100644
index 000000000..cd36a4e91
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum,
+                             const __m128 &min_val_vec, const __m128 &delta_vec) {
+    // Load 4 float elements from pVect1
+    __m128 v1 = _mm_loadu_ps(pVect1);
+    pVect1 += 4;
+
+    // Load 4 uint8 elements from pVect2, convert to int32, then to float
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2));
+    pVect2 += 4;
+
+    // Convert int32 to float
+    __m128 v2_f = _mm_cvtepi32_ps(v2_i);
+
+    // Dequantize: (val * delta) + min_val
+    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute difference
+    __m128 diff = _mm_sub_ps(v1, v2_dequant);
+
+    // Square difference and add to sum
+    sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *quantized = static_cast<const uint8_t *>(pVect2v);
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(quantized + dimension);
+    const float delta = *reinterpret_cast<const float *>(quantized + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m128 min_val_vec = _mm_set1_ps(min_val);
+    __m128 delta_vec = _mm_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m128 sum = _mm_setzero_ps();
+
+    // Process residual elements if needed
+    if constexpr (residual) {
+        // Handle residual elements (1-3)
+        if constexpr (residual % 4) {
+            __m128 v1;
+            __m128 v2_dequant;
+
+            if constexpr (residual % 4 == 3) {
+                // Set 3 floats and the last one to 0
+                v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
+
+                // Dequantize and set 3 values
+                v2_dequant =
+                    _mm_set_ps(0.0f, quantized[2] * delta + min_val, quantized[1] * delta + min_val,
+                               quantized[0] * delta + min_val);
+
+            } else if constexpr (residual % 4 == 2) {
+                // Set 2 floats and the last two to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
+
+                // Dequantize and set 2 values
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min_val,
+                                        quantized[0] * delta + min_val);
+
+            } else if constexpr (residual % 4 == 1) {
+                // Set 1 float and the last three to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]);
+
+                // Dequantize and set 1 value
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min_val);
+            }
+
+            pVect1 += residual % 4;
+            quantized += residual % 4;
+
+            // Compute difference
+            __m128 diff = _mm_sub_ps(v1, v2_dequant);
+
+            // Square difference and initialize sum
+            sum = _mm_mul_ps(diff, diff);
+        }
+
+        // Process remaining blocks of 4 elements based on residual
+        if constexpr (residual >= 12)
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        if constexpr (residual >= 8)
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        if constexpr (residual >= 4)
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+    }
+    }
+
+    // Process 16 elements at a time (4 elements per step, 4 steps)
+    while (pVect1 < pEnd1) {
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+    }
+
+    // TmpRes must be 16 bytes aligned
+    float PORTABLE_ALIGN16 TmpRes[4];
+    _mm_store_ps(TmpRes, sum);
+    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+}
diff --git a/src/VecSim/spaces/L2/L2_SVE_FP32.h b/src/VecSim/spaces/L2/L2_SVE_FP32.h
index a3e96c7a8..8367baa97 100644
--- a/src/VecSim/spaces/L2/L2_SVE_FP32.h
+++ b/src/VecSim/spaces/L2/L2_SVE_FP32.h
@@ -9,7 +9,8 @@
 #include "VecSim/spaces/space_includes.h"
 #include <arm_sve.h>
 
-static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum) {
+static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum,
+                                const size_t chunk) {
     // Load vectors
     svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset);
     svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset);
@@ -21,7 +22,7 @@ static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset,
     sum = svmla_f32_z(svptrue_b32(), sum, diff, diff);
 
     // Advance pointers by the vector length
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -31,7 +32,7 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens
     size_t offset = 0;
 
     // Get the number of 32-bit elements per vector at runtime
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     // Multiple accumulators to increase instruction-level parallelism
     svfloat32_t sum0 = svdup_f32(0.0f);
@@ -40,27 +41,27 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens
     svfloat32_t sum3 = svdup_f32(0.0f);
 
     // Process vectors in chunks, with unrolling for better pipelining
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     size_t number_of_chunks = dimension / chunk_size;
 
     for (size_t i = 0; i < number_of_chunks; ++i) {
         // Process 4 chunks with separate accumulators
-        L2SquareStep(pVect1, pVect2, offset, sum0);
-        L2SquareStep(pVect1, pVect2, offset, sum1);
-        L2SquareStep(pVect1, pVect2, offset, sum2);
-        L2SquareStep(pVect1, pVect2, offset, sum3);
+        L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum3, chunk);
     }
 
     // Process remaining complete SVE vectors that didn't fit into the main loop
     // These are full vector operations (0-3 elements)
     if constexpr (additional_steps > 0) {
         if constexpr (additional_steps >= 1) {
-            L2SquareStep(pVect1, pVect2, offset, sum0);
+            L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
         }
         if constexpr (additional_steps >= 2) {
-            L2SquareStep(pVect1, pVect2, offset, sum1);
+            L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
         }
         if constexpr (additional_steps >= 3) {
-            L2SquareStep(pVect1, pVect2, offset, sum2);
+            L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
         }
     }
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
new file mode 100644
index 000000000..756f82522
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
+                             svfloat32_t &sum, const svfloat32_t &min_val_vec,
+                             const svfloat32_t &delta_vec, const size_t chunk) {
+    svbool_t pg = svptrue_b32();
+
+    // Load float elements from pVect1
+    svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
+
+    // Load uint8 elements and widen them to uint32
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
+
+    // Convert uint32 to float32
+    svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec);
+
+    // Compute difference
+    svfloat32_t diff = svsub_f32_x(pg, v1, v2_dequant);
+
+    // Square difference and add to sum
+    sum = svmla_f32_x(pg, sum, diff, diff);
+
+    // Move to the next set of elements
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    size_t offset = 0;
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    svbool_t pg = svptrue_b32();
+    svfloat32_t min_val_vec = svdup_f32(min_val);
+    svfloat32_t delta_vec = svdup_f32(delta);
+
+    // Get the number of 32-bit elements per vector at runtime
+    uint64_t chunk = svcntw();
+
+    // Multiple accumulators to increase instruction-level parallelism
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Handle partial chunk if needed
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            // Create predicate for the remaining elements
+            svbool_t pg_partial =
+                svwhilelt_b32(static_cast<uint64_t>(0), static_cast<uint64_t>(remaining));
+
+            // Load float elements from pVect1 with predicate
+            svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
+
+            // Load uint8 elements from pVect2 with predicate and widen them to uint32
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);
+
+            // Convert uint32 to float32
+            svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
+
+            // Dequantize: (val * delta) + min_val
+            svfloat32_t v2_dequant =
+                svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
+
+            // Compute difference
+            svfloat32_t diff = svsub_f32_z(pg_partial, v1, v2_dequant);
+
+            // Square difference and add to sum
+            sum0 = svmla_f32_z(pg_partial, sum0, diff, diff);
+
+            // Move pointers past the partial chunk
+            offset += remaining;
+        }
+    }
+    // Handle remaining steps (0-3)
+    if constexpr (additional_steps > 0) {
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 1) {
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 2) {
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+    }
+
+    // Process 4 chunks at a time in the main loop
+    auto chunk_size = 4 * chunk;
+    size_t number_of_chunks = dimension / chunk_size;
+
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk);
+    }
+
+    // Combine the accumulators
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
+
+    // Horizontal sum of all elements in the vector
+    float result = svaddv_f32(pg, sum);
+
+    return result;
+}
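A note on the SVE variant above: unlike the fixed-width NEON and SSE4 kernels, it advances by the hardware's runtime vector length (`svcntw()` 32-bit lanes), so the same binary adapts to any SVE register width. A standalone sketch of that predication pattern, assuming an SVE-enabled toolchain; not part of the patch:

#include <arm_sve.h>
#include <cstddef>
#include <cstdint>

// Vector-length-agnostic sum of squares: whilelt masks off the tail lanes.
float sum_squares_sve(const float *x, size_t dim) {
    svfloat32_t acc = svdup_f32(0.0f);
    for (size_t i = 0; i < dim; i += svcntw()) {
        svbool_t pg = svwhilelt_b32((uint64_t)i, (uint64_t)dim); // active lanes only
        svfloat32_t v = svld1_f32(pg, x + i);
        acc = svmla_f32_m(pg, acc, v, v); // acc += v * v on active lanes
    }
    return svaddv_f32(svptrue_b32(), acc); // horizontal reduce
}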
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 7cd277ca7..ed920927d 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -19,7 +19,9 @@
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
 #include "VecSim/spaces/functions/NEON_HP.h"
@@ -33,6 +35,63 @@
 using float16 = vecsim_types::float16;
 
 namespace spaces {
 
+dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (!alignment) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_L2Sqr;
+
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_L2_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_L2_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_L2_implementation_NEON(dim);
+    }
+#endif
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        return Choose_SQ8_L2_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2) {
+        return Choose_SQ8_L2_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1) {
+        return Choose_SQ8_L2_implementation_SSE4(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
 dist_func_t<float> L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
     if (!alignment) {
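The dispatcher above follows the library's existing pattern: implementations are probed from widest to narrowest ISA, and the first available feature wins. Callers are expected to resolve the function pointer once and reuse it; a minimal usage sketch (the compressed-buffer layout is the one documented in IP.cpp; `sq8_l2_example` is illustrative only):

#include "VecSim/spaces/L2_space.h"
#include <cstdint>
#include <vector>

// Resolve the best SQ8 L2 implementation once, then call it like any dist_func_t<float>.
float sq8_l2_example(const std::vector<float> &query,
                     const std::vector<uint8_t> &compressed, // dim codes + 3 trailing floats
                     size_t dim) {
    unsigned char alignment = 0;
    auto dist_f = spaces::L2_SQ8_GetDistFunc(dim, &alignment); // arch detected at runtime
    return dist_f(query.data(), compressed.data(), dim);
}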
diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h
index 9643aaa71..a58fcd7e4 100644
--- a/src/VecSim/spaces/L2_space.h
+++ b/src/VecSim/spaces/L2_space.h
@@ -22,4 +22,6 @@ dist_func_t<float> L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nu
                                        const void *arch_opt = nullptr);
 dist_func_t<float> L2_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                         const void *arch_opt = nullptr);
+dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                      const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2.cpp b/src/VecSim/spaces/functions/AVX2.cpp
index e63cf7ee8..3b24060c1 100644
--- a/src/VecSim/spaces/functions/AVX2.cpp
+++ b/src/VecSim/spaces/functions/AVX2.cpp
@@ -10,6 +10,8 @@
 
 #include "VecSim/spaces/IP/IP_AVX2_BF16.h"
 #include "VecSim/spaces/L2/L2_AVX2_BF16.h"
+#include "VecSim/spaces/IP/IP_AVX2_SQ8.h"
+#include "VecSim/spaces/L2/L2_AVX2_SQ8.h"
 
 namespace spaces {
 
@@ -27,6 +29,24 @@ dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2.h b/src/VecSim/spaces/functions/AVX2.h
index d1377765f..ecc28f01f 100644
--- a/src/VecSim/spaces/functions/AVX2.h
+++ b/src/VecSim/spaces/functions/AVX2.h
@@ -12,6 +12,10 @@
 
 namespace spaces {
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2(size_t dim);
+
 dist_func_t<float> Choose_BF16_IP_implementation_AVX2(size_t dim);
 dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim);
 
diff --git a/src/VecSim/spaces/functions/AVX2_FMA.cpp b/src/VecSim/spaces/functions/AVX2_FMA.cpp
new file mode 100644
index 000000000..4dc627c57
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "AVX2_FMA.h"
+#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h"
+#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+// FMA optimized implementations
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2_FMA.h b/src/VecSim/spaces/functions/AVX2_FMA.h
new file mode 100644
index 000000000..b81dfd5ab
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F.h b/src/VecSim/spaces/functions/AVX512F.h
index fd36f312f..450c3d6bc 100644
--- a/src/VecSim/spaces/functions/AVX512F.h
+++ b/src/VecSim/spaces/functions/AVX512F.h
@@ -20,4 +20,7 @@ dist_func_t<float> Choose_FP16_L2_implementation_AVX512F(size_t dim);
 dist_func_t<float> Choose_FP32_L2_implementation_AVX512F(size_t dim);
 dist_func_t<float> Choose_FP64_L2_implementation_AVX512F(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index 338726adc..fef34dd22 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -14,6 +14,9 @@
 #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h"
 #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h"
 
+#include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h"
+#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h"
+
 namespace spaces {
 
 #include "implementation_chooser.h"
@@ -54,6 +57,22 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
index 4ac0ac2f3..745a339fb 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -20,4 +20,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim)
 dist_func_t<float> Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp
index 6e5662f07..d0b5c9160 100644
--- a/src/VecSim/spaces/functions/NEON.cpp
+++ b/src/VecSim/spaces/functions/NEON.cpp
@@ -15,6 +15,8 @@
 #include "VecSim/spaces/IP/IP_NEON_UINT8.h"
 #include "VecSim/spaces/L2/L2_NEON_FP64.h"
 #include "VecSim/spaces/IP/IP_NEON_FP64.h"
+#include "VecSim/spaces/L2/L2_NEON_SQ8.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8.h"
 
 namespace spaces {
 
@@ -79,6 +81,24 @@ dist_func_t<float> Choose_FP64_L2_implementation_NEON(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_L2_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_NEON);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_IP_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_NEON);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_NEON);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h
index 85fb78510..1449c6ac5 100644
--- a/src/VecSim/spaces/functions/NEON.h
+++ b/src/VecSim/spaces/functions/NEON.h
@@ -26,4 +26,8 @@ dist_func_t<float> Choose_FP32_L2_implementation_NEON(size_t dim);
 dist_func_t<float> Choose_FP64_IP_implementation_NEON(size_t dim);
 dist_func_t<float> Choose_FP64_L2_implementation_NEON(size_t dim);
 
+dist_func_t<float> Choose_SQ8_L2_implementation_NEON(size_t dim);
+dist_func_t<float> Choose_SQ8_IP_implementation_NEON(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_NEON(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SSE.cpp b/src/VecSim/spaces/functions/SSE.cpp
index bec82169b..d25463531 100644
--- a/src/VecSim/spaces/functions/SSE.cpp
+++ b/src/VecSim/spaces/functions/SSE.cpp
@@ -10,9 +10,11 @@
 
 #include "VecSim/spaces/L2/L2_SSE_FP32.h"
 #include "VecSim/spaces/L2/L2_SSE_FP64.h"
+#include "VecSim/spaces/L2/L2_SSE4_SQ8.h"
 
 #include "VecSim/spaces/IP/IP_SSE_FP32.h"
 #include "VecSim/spaces/IP/IP_SSE_FP64.h"
+#include "VecSim/spaces/IP/IP_SSE4_SQ8.h"
 
 namespace spaces {
 
diff --git a/src/VecSim/spaces/functions/SSE4.cpp b/src/VecSim/spaces/functions/SSE4.cpp
new file mode 100644
index 000000000..d8dd51448
--- /dev/null
+++ b/src/VecSim/spaces/functions/SSE4.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "SSE4.h"
+#include "VecSim/spaces/IP/IP_SSE4_SQ8.h"
+#include "VecSim/spaces/L2/L2_SSE4_SQ8.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_SQ8_IP_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_SSE4);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_SSE4);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_SSE4);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/SSE4.h b/src/VecSim/spaces/functions/SSE4.h
new file mode 100644
index 000000000..27bbae0e0
--- /dev/null
+++ b/src/VecSim/spaces/functions/SSE4.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_IP_implementation_SSE4(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE4(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SSE4(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp
index 8d527a2d3..208763779 100644
--- a/src/VecSim/spaces/functions/SVE.cpp
+++ b/src/VecSim/spaces/functions/SVE.cpp
@@ -22,6 +22,8 @@
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h"
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h"
+#include "VecSim/spaces/IP/IP_SVE_SQ8.h"
+#include "VecSim/spaces/L2/L2_SVE_SQ8.h"
 
 namespace spaces {
 
@@ -96,6 +98,24 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h
index 5f74b8d46..680e906e6 100644
--- a/src/VecSim/spaces/functions/SVE.h
+++ b/src/VecSim/spaces/functions/SVE.h
@@ -29,4 +29,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_UINT8_IP_implementation_SVE(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp
index 690bb0cd1..9df4b3b08 100644
--- a/src/VecSim/spaces/functions/SVE2.cpp
+++ b/src/VecSim/spaces/functions/SVE2.cpp
@@ -20,6 +20,8 @@
 #include "VecSim/spaces/IP/IP_SVE_INT8.h"  // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE_SQ8.h"   // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8.h"   // SVE2 implementation is identical to SVE
 
 namespace spaces {
 
@@ -94,6 +96,24 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h
index b67d56dd9..059f38b1b 100644
--- a/src/VecSim/spaces/functions/SVE2.h
+++ b/src/VecSim/spaces/functions/SVE2.h
@@ -29,4 +29,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_UINT8_IP_implementation_SVE2(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE2(size_t dim);
+
 } // namespace spaces
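All of the Choose_SQ8_* functions above delegate to the existing CHOOSE_IMPLEMENTATION / CHOOSE_SVE_IMPLEMENTATION machinery, whose definitions are not part of this diff. Under the assumption that it selects the template instantiation whose residual matches dim % 16 (or dim % svcntw() for SVE), the fixed-width case behaves roughly like this sketch (an illustration, not the real macro):

#include <array>
#include <cstddef>
#include <utility>

// Rough sketch (assumption): build a table with one instantiation per possible
// residual, then index it with dim % 16 at dispatch time.
template <std::size_t... R>
constexpr auto make_sq8_l2_sse4_table(std::index_sequence<R...>) {
    using fn = float (*)(const void *, const void *, std::size_t);
    return std::array<fn, sizeof...(R)>{&SQ8_L2SqrSIMD16_SSE4<R>...};
}
// Dispatch: auto f = make_sq8_l2_sse4_table(std::make_index_sequence<16>{})[dim % 16];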
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
index a5c9e7257..8a207228a 100644
--- a/tests/benchmark/CMakeLists.txt
+++ b/tests/benchmark/CMakeLists.txt
@@ -38,7 +38,7 @@ endif()
 # Spaces benchmarks
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
-set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8)
+set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8)
 foreach(data_type IN LISTS DATA_TYPE)
 	add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp)
 	target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark)
diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh
index 78584130e..76389ad89 100755
--- a/tests/benchmark/benchmarks.sh
+++ b/tests/benchmark/benchmarks.sh
@@ -15,6 +15,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then
     echo spaces_fp16
     echo spaces_int8
     echo spaces_uint8
+    echo spaces_sq8
 
 elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo basics_single_fp32
@@ -25,6 +26,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo spaces_fp16
     echo spaces_int8
     echo spaces_uint8
+    echo spaces_sq8
 
 # Basic benchmarks
@@ -91,6 +93,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then
     echo spaces_bf16
     echo spaces_int8
    echo spaces_uint8
+    echo spaces_sq8
 
 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then
     echo spaces_fp32
@@ -104,4 +107,6 @@ elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then
     echo spaces_int8
 elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then
     echo spaces_uint8
+elif [ "$BM_TYPE" = "bm-spaces-sq8" ] ; then
+    echo spaces_sq8
 fi
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index ded60d4dd..d99bcc4ca 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -24,7 +24,9 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/F16C.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
 #include "VecSim/spaces/functions/NEON.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
new file mode 100644
index 000000000..1349a3512
--- /dev/null
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "bm_spaces.h"
+#include "utils/tests_utils.h"
+
+class BM_VecSimSpaces_SQ8 : public benchmark::Fixture {
+protected:
+    std::mt19937 rng;
+    size_t dim;
+    float *v1;
+    uint8_t *v2;
+
+public:
+    BM_VecSimSpaces_SQ8() { rng.seed(47); }
+    ~BM_VecSimSpaces_SQ8() = default;
+
+    void SetUp(const ::benchmark::State &state) {
+        dim = state.range(0);
+        v1 = new float[dim];
+        test_utils::populate_float_vec(v1, dim, 123);
+        // Allocate vector with extra space for min, delta and cosine calculations
+        v2 = new uint8_t[dim + sizeof(float) * 3];
+        test_utils::populate_float_vec_to_sq8(v2, dim, 1234);
+    }
+    void TearDown(const ::benchmark::State &state) {
+        delete[] v1;
+        delete[] v2;
+    }
+};
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+
+// NEON implementation for ARMv8-a
+#ifdef OPT_NEON
+bool neon_supported = opt.asimd; // ARMv8-a always supports NEON
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported);
+#endif
+// SVE implementation
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported);
+#endif
+// SVE2 implementation
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported);
+#endif
+#endif // AARCH64
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+cpu_features::X86Features opt = cpu_features::GetX86Info().features;
+
+// AVX512_F_BW_VL_VNNI functions
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16,
+                                avx512_f_bw_vl_vnni_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16,
+                                 avx512_f_bw_vl_vnni_supported);
+#endif // AVX512_F_BW_VL_VNNI
+
+#ifdef OPT_AVX2_FMA
+bool avx2_fma3_supported = opt.avx2 && opt.fma3;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported);
+#endif // AVX2_FMA
+
+#ifdef OPT_AVX2
+// AVX2 functions
+bool avx2_supported = opt.avx2;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported);
+#endif // AVX2
+
+// SSE4 functions
+#ifdef OPT_SSE4
+bool sse4_supported = opt.sse4_1;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SSE4, 16, sse4_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE4, 16, sse4_supported);
+#endif // SSE4
+#endif // x86_64
+
+// Naive algorithms
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16);
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16);
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16);
+
+BENCHMARK_MAIN();
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 1c5633f87..dabe9c794 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -28,7 +28,9 @@
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
@@ -305,6 +307,176 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) {
     ASSERT_NEAR(dist, 0.0, 0.000001);
 }
 
+void common_ip_sq8(bool should_normalize, float expected_dist) {
+
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Create SQ8 compressed version of v2
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    if (should_normalize) {
+        spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+        spaces::GetNormalizeFunc<float>()(v2_orig, dim);
+    }
+
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    std::vector<uint8_t> v2_compressed(compressed_size);
+
+    // Quantize v2
+    uint8_t *quant_values = v2_compressed.data();
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+
+    float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+
+    // v1 and v2 start out identical, so the inner product distance should be close to the
+    // expected value
+    ASSERT_NEAR(dist, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance";
+}
+
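+// Note on the expected value used below: with v1 = v2 = (1.5, 2.5, 3.5, 4.5, 5.5),
+// sum(v1[i] * v2[i]) = 2.25 + 6.25 + 12.25 + 20.25 + 30.25 = 71.25, so the exact
+// result would be 1 - 71.25 = -70.25; the checked-in -70.2147 differs only by the
+// quantization error, which the 0.01 tolerance absorbs.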
+/* ======================== Tests SQ8 ========================= */
+TEST_F(SpacesTest, SQ8_ip_no_optimization_func_test) {
+    float expected_dist = -70.2147f; // Expected distance for these identical, unnormalized vectors
+    common_ip_sq8(false, expected_dist);
+}
+
+TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, 0.0f); }
+
+TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) {
+    // create a vector with extra space for the norm
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Compress v2
+    std::vector<uint8_t> v2_compressed(compressed_size);
+    uint8_t *quant_values = v2_compressed.data();
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+    // Calculate inverse norm from decompressed values
+    float inv_norm = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        float decompressed_value = min_val + quant_values[i] * delta;
+        inv_norm += decompressed_value * decompressed_value;
+    }
+    inv_norm = 1.0f / std::sqrt(inv_norm);
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+    ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance";
+}
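+
+// The tests above and below compute inv_norm from the *dequantized* values rather
+// than the originals, so the normalization matches exactly what the distance
+// kernels reconstruct. A standalone restatement of that step (sketch):
+//
+//   float norm = 0.0f;
+//   for (size_t i = 0; i < dim; i++) {
+//       float d = quant_values[i] * delta + min_val;
+//       norm += d * d;
+//   }
+//   float inv_norm = 1.0f / std::sqrt(norm);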
+TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
+    // create a vector with extra space for the norm
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    spaces::GetNormalizeFunc<float>()(v2_orig, dim);
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Compress v2
+    std::vector<uint8_t> v2_compressed(compressed_size);
+    uint8_t *quant_values = v2_compressed.data();
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+    // Calculate inverse norm from decompressed values
+    float inv_norm = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        float decompressed_value = min_val + quant_values[i] * delta;
+        inv_norm += decompressed_value * decompressed_value;
+    }
+    inv_norm = 1.0f / std::sqrt(inv_norm);
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+    ASSERT_NEAR(dist, 0.0f, 0.00001f) << "SQ8_L2Sqr failed to match expected distance";
+}
+
 /* ======================== Test Getters ======================== */
 
 TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) {
@@ -1889,3 +2061,409 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) {
 
 INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest,
                          testing::Range(32UL, 64 * 2UL + 1));
+
+// Helper function to create SQ8 compressed vector
+std::vector<uint8_t> CreateSQ8CompressedVector(const float *original, size_t dim) {
+    // Create a copy of the original vector that we can modify
+    std::vector<float> vec_copy(original, original + dim);
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    std::vector<uint8_t> compressed(compressed_size);
+
+    // Find min and max for quantization
+    float min_val = vec_copy[0];
+    float max_val = vec_copy[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, vec_copy[i]);
+        max_val = std::max(max_val, vec_copy[i]);
+    }
+
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Quantize vector
+    uint8_t *quant_values = compressed.data();
+    float norm = 0.0f;
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (vec_copy[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+        norm += (quant_values[i] * delta + min_val) * (quant_values[i] * delta + min_val);
+    }
+
+    float inv_norm = 1.0f / std::sqrt(norm);
+    // Store parameters
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    return compressed;
+}
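+
+// Usage sketch for the helper above (hypothetical check, not part of the suite):
+// each dequantized value can differ from the original by at most delta / 2,
+// where delta = (max - min) / 255, e.g.:
+//
+//   std::vector<uint8_t> blob = CreateSQ8CompressedVector(v, dim);
+//   const float *params = reinterpret_cast<const float *>(blob.data() + dim);
+//   for (size_t i = 0; i < dim; i++)
+//       assert(std::fabs((blob[i] * params[1] + params[0]) - v[i]) <=
+//              params[1] / 2 + 1e-6f);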
+
+class SQ8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
+
+TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+
+    // Create SQ8 compressed version of v2
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2_FMA with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX2_FMA with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.fma3 = 0;
+    }
+#endif
+#ifdef OPT_AVX2
+    if (optimization.avx2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX2 with dim " << dim;
+        // Unset avx2 flag as well, so we'll choose the next optimization (SSE4).
+        optimization.avx2 = 0;
+    }
+#endif
+#ifdef OPT_SSE4
+    if (optimization.sse4_1) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE4 with dim " << dim;
+        // Unset sse4_1 flag as well, so we'll choose the next optimization (default).
+        optimization.sse4_1 = 0;
+    }
+#endif
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.asimd = 0;
+    }
+#endif
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+    spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
+
+    // Create SQ8 compressed version of v2
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+    // Quantization parameters (min, delta, inv_norm) are stored after the quantized values
+    float *params = reinterpret_cast<float *>(v2_compressed.data() + dim);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
+
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2_FMA with dim " << dim;
+        optimization.fma3 = 0;
+    }
+#endif
+#ifdef OPT_AVX2
+    if (optimization.avx2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        optimization.avx2 = 0;
+    }
+#endif
+#ifdef OPT_SSE4
+    if (optimization.sse4_1) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        optimization.sse4_1 = 0;
+    }
+#endif
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        optimization.asimd = 0;
+    }
+#endif
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_InnerProduct)
+        << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+// Instantiate the test suite with dimensions to test
+INSTANTIATE_TEST_SUITE_P(SQ8InnerProductTest, SQ8SpacesOptimizationTest,
+                         testing::Range(16UL, 16 * 2UL + 1));
+
+TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+
+    // Normalize both vectors
+    spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
+    spaces::GetNormalizeFunc<float>()(v2_orig.data(), dim);
+
+    // Create SQ8 compressed version of v2 (with normalization)
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim);
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        optimization.asimd = 0;
+    }
+#endif
+
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2_FMA with dim " << dim;
+        optimization.fma3 = 0;
+    }
+#endif
+#ifdef OPT_AVX2
+    if (optimization.avx2) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        optimization.avx2 = 0;
+    }
+#endif
+
+#ifdef OPT_SSE4
+    if (optimization.sse4_1) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        optimization.sse4_1 = 0;
+    }
+#endif
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index 93a3eaf0b..cbab07653 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -53,6 +53,45 @@ static void populate_float_vec(float *v, size_t dim, int seed = 1234) {
     }
 }
 
+static void quantize_float_vec_to_uint8(float *v, size_t dim, uint8_t *qv, int seed = 1234) {
+
+    float min_val = v[0];
+    float max_val = v[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v[i]);
+        max_val = std::max(max_val, v[i]);
+    }
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+    float norm = 0.0f;
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        qv[i] = static_cast<uint8_t>(std::round(normalized));
+        norm += (qv[i] * delta + min_val) * (qv[i] * delta + min_val);
+    }
+    float inv_norm = 1.0f / std::sqrt(norm);
+    // Store parameters
+    float *params = reinterpret_cast<float *>(qv + dim);
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+}
+
+static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) {
+
+    std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed
+    std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
+    std::vector<float> vec(dim);
+    for (size_t i = 0; i < dim; i++) {
+        vec[i] = dis(gen);
+    }
+    quantize_float_vec_to_uint8(vec.data(), dim, v, seed);
+}
+
 template <typename datatype>
 float integral_compute_norm(const datatype *vec, size_t dim) {
     return spaces::IntegralType_ComputeNorm(vec, dim);