Dorer SQ8 dist functions [MOD-9626] #673

Merged
merged 54 commits on Jun 9, 2025
Changes from 49 commits
Commits (54)
69d63ac
add sq8
dor-forer May 11, 2025
af85432
Change to IP_AVX512F
dor-forer May 11, 2025
b215799
Change
dor-forer May 11, 2025
8b4188b
vec1
dor-forer May 11, 2025
a1d1a16
float
dor-forer May 11, 2025
b5860bb
finish
dor-forer May 11, 2025
0d07d71
now
dor-forer May 11, 2025
66c49e8
remove Choose_SQ8_Cosine_implementation_AVX512F
dor-forer May 11, 2025
aa26c71
in test
dor-forer May 11, 2025
43b58a8
alignemnt
dor-forer May 11, 2025
1e12fa3
back to bw
dor-forer May 11, 2025
984a030
back again
dor-forer May 11, 2025
c3670a8
again
dor-forer May 11, 2025
11303b7
optimization
dor-forer May 11, 2025
7474c05
more BW
dor-forer May 11, 2025
2cfd9b6
fix avx
dor-forer May 11, 2025
3cdf05e
add avx cosine test
dor-forer May 11, 2025
fc8bc7d
avx
dor-forer May 11, 2025
513839b
add impl
dor-forer May 11, 2025
f676c1b
add l2
dor-forer May 11, 2025
9a899cc
replace OPT_AVX512_F_BW_VL_VNNI
dor-forer May 11, 2025
4fa5327
align
dor-forer May 11, 2025
1379d6d
Fix avx
dor-forer May 11, 2025
f7fdb2b
add l2 sse
dor-forer May 11, 2025
4fa88b2
Remove prints
dor-forer May 11, 2025
4476833
sve2 l2
dor-forer May 11, 2025
2a7477c
add neon
dor-forer May 12, 2025
b1f502c
fix sve
dor-forer May 12, 2025
dc154b5
add sq8 cosine test
dor-forer May 12, 2025
25a9400
test utils
dor-forer May 12, 2025
9ced0be
static const
dor-forer May 12, 2025
6028dd7
format
dor-forer May 12, 2025
3c2ee11
change to uint
dor-forer May 12, 2025
5c2952c
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer May 12, 2025
ad3985e
format
dor-forer May 12, 2025
41216e6
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer May 14, 2025
76d2fdd
added fma avx2
dor-forer May 18, 2025
b47cc52
format
dor-forer May 18, 2025
6566a0b
remove opt.avx2
dor-forer May 18, 2025
d767ea9
fix OPT_AVX2 bm-spaces
dor-forer May 18, 2025
ea0ac00
pr chanes
dor-forer May 21, 2025
ef09ead
format
dor-forer May 22, 2025
7567730
change to _mm_cvtsi32_si128
dor-forer May 22, 2025
a767547
Change in the l2
dor-forer May 22, 2025
e6422dc
PR changes
dor-forer May 27, 2025
10a6098
added chunk to functions
dor-forer May 27, 2025
767e190
diff squared
dor-forer May 27, 2025
44be275
format
dor-forer May 27, 2025
3a956bf
chnage diff
dor-forer May 27, 2025
5840e3f
Remove align from tests improve sse4
dor-forer Jun 5, 2025
2a89dd8
format
dor-forer Jun 5, 2025
e562a86
applied to l2
dor-forer Jun 8, 2025
2a0b4e6
format
dor-forer Jun 8, 2025
ab18690
Remove alignment l2
dor-forer Jun 8, 2025
9 changes: 9 additions & 0 deletions cmake/x86_64InstructionFlags.cmake
@@ -17,6 +17,7 @@ CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C)
CHECK_CXX_COMPILER_FLAG(-mfma CXX_FMA)
CHECK_CXX_COMPILER_FLAG(-msse4.1 CXX_SSE4)
CHECK_CXX_COMPILER_FLAG(-msse3 CXX_SSE3)
CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)

@@ -60,10 +61,18 @@ if(CXX_AVX2)
add_compile_definitions(OPT_AVX2)
endif()

if(CXX_AVX2 AND CXX_FMA)
add_compile_definitions(OPT_AVX2_FMA)
endif()

if(CXX_AVX)
add_compile_definitions(OPT_AVX)
endif()

if(CXX_SSE4)
add_compile_definitions(OPT_SSE4)
endif()

if(CXX_SSE3)
add_compile_definitions(OPT_SSE3)
endif()
12 changes: 12 additions & 0 deletions src/VecSim/spaces/CMakeLists.txt
@@ -56,6 +56,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
endif()

if(CXX_AVX2 AND CXX_FMA)
message("Building with AVX2 and FMA")
set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
endif()

if(CXX_F16C AND CXX_FMA AND CXX_AVX)
message("Building with CXX_F16C")
set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
@@ -74,6 +80,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
list(APPEND OPTIMIZATIONS functions/SSE3.cpp)
endif()

if(CXX_SSE4)
message("Building with SSE4")
set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
endif()

if(CXX_SSE)
message("Building with SSE")
set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
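
The OPT_* compile definitions added above gate the optimized translation units at compile time. A sketch of the consuming guard pattern, inferred from the OPT_AVX2_FMA naming (the actual include site is not part of this diff):

#ifdef OPT_AVX2_FMA
// Only reachable when the toolchain accepted both -mavx2 and -mfma.
#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h"
#endif
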
37 changes: 37 additions & 0 deletions src/VecSim/spaces/IP/IP.cpp
@@ -14,6 +14,43 @@
using bfloat16 = vecsim_types::bfloat16;
using float16 = vecsim_types::float16;

float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
float min_val, float delta, float inv_norm) {
float res = 0;
for (size_t i = 0; i < dimension; i++) {
float dequantized_V2 = (pVect2v[i] * delta + min_val);
res += pVect1v[i] * dequantized_V2;
}
return res * inv_norm;
}

float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
const auto *pVect1 = static_cast<const float *>(pVect1v);
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
// pVect2 is a quantized uint8_t vector, structured as
// [quantized values (uint8_t * dim)][min_val (float)][delta (float)][inv_norm (float)].
// min_val and delta are used to dequantize the values; inv_norm is used only by the
// cosine distance.
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
Review comment (Collaborator), on lines +33 to +34: consider remodelling so the metadata is at the start of the vector.

// Compute inner product with dequantization
const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, 1.0f);
return 1.0f - res;
}

float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
const auto *pVect1 = static_cast<const float *>(pVect1v);
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);

// Get quantization parameters
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
// Compute inner product with dequantization
const float res =
FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
return 1.0f - res;
}

float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) {
auto *vec1 = (float *)pVect1;
auto *vec2 = (float *)pVect2;
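
For reference, a minimal sketch of a quantizer producing the layout the comment above describes. The helper name quantizeSQ8 and the choice to take the norm over the original floats are assumptions for illustration; this PR adds only the distance functions, not the quantizer.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical helper (not in this PR): quantize a float vector into the SQ8
// layout [uint8_t values (dim)][min_val (float)][delta (float)][inv_norm (float)].
std::vector<uint8_t> quantizeSQ8(const float *v, size_t dim) {
    float min_val = v[0], max_val = v[0], norm_sq = 0.0f;
    for (size_t i = 0; i < dim; i++) {
        min_val = std::min(min_val, v[i]);
        max_val = std::max(max_val, v[i]);
        norm_sq += v[i] * v[i];
    }
    const float delta = (max_val - min_val) / 255.0f;
    const float inv_norm = 1.0f / std::sqrt(norm_sq); // assumed: norm of the original vector
    std::vector<uint8_t> buf(dim + 3 * sizeof(float));
    for (size_t i = 0; i < dim; i++) {
        // Map each value to its nearest 8-bit level; delta == 0 means a constant vector.
        buf[i] = delta != 0.0f ? static_cast<uint8_t>(std::lround((v[i] - min_val) / delta)) : 0;
    }
    std::memcpy(buf.data() + dim, &min_val, sizeof(float));
    std::memcpy(buf.data() + dim + sizeof(float), &delta, sizeof(float));
    std::memcpy(buf.data() + dim + 2 * sizeof(float), &inv_norm, sizeof(float));
    return buf;
}
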
6 changes: 6 additions & 0 deletions src/VecSim/spaces/IP/IP.h
@@ -10,6 +10,12 @@

#include <cstdlib>

// pVect1v is an fp32 vector; pVect2v is an SQ8-quantized uint8 vector with trailing metadata
float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);

// pVect1v is an fp32 vector; pVect2v is an SQ8-quantized uint8 vector with trailing metadata
float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);

float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);

double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
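
A hypothetical call site for these declarations. It assumes a buffer already in the SQ8 layout (e.g. built by a quantizer like the quantizeSQ8 sketch above) and, for SQ8_Cosine, a pre-normalized query:

#include <cstdint>
#include <vector>

float exampleDistances(const std::vector<float> &query, const std::vector<uint8_t> &stored) {
    const size_t dim = query.size(); // `stored` holds dim bytes plus 3 trailing floats
    float ip_dist = SQ8_InnerProduct(query.data(), stored.data(), dim);
    float cos_dist = SQ8_Cosine(query.data(), stored.data(), dim);
    return ip_dist + cos_dist; // both are distances, i.e. 1 - similarity
}
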
113 changes: 113 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"

static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2,
__m256 &sum256, const __m256 &min_val_vec,
const __m256 &delta_vec) {
// Load 8 float elements from pVect1
__m256 v1 = _mm256_loadu_ps(pVect1);
pVect1 += 8;

// Load 8 uint8 elements from pVect2, convert to int32, then to float
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize and compute dot product in one step using FMA
// (val * delta) + min_val -> v2_dequant
// sum256 += v1 * v2_dequant
// Using FMA: sum256 = v1 * v2_dequant + sum256

// First, compute v2_dequant = v2_f * delta_vec + min_val_vec
__m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);

// Then, compute sum256 += v1 * v2_dequant using FMA
sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256);
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
// pVect2 is a quantized uint8_t vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float *pEnd1 = pVect1 + dimension;

// Get dequantization parameters from the end of quantized vector
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
// Create broadcast vectors for SIMD operations
__m256 min_val_vec = _mm256_set1_ps(min_val);
__m256 delta_vec = _mm256_set1_ps(delta);

__m256 sum256 = _mm256_setzero_ps();

// Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
// 16-float block, so mask loading is guaranteed to be safe.
if constexpr (residual % 8) {
__mmask8 constexpr mask = (1 << (residual % 8)) - 1;
__m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
pVect1 += residual % 8;

// Load quantized values and dequantize
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += residual % 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize using FMA: (val * delta) + min_val
__m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);

// Compute dot product with masking
sum256 = _mm256_mul_ps(v1, v2_dequant);
}

// If the remainder is >= 8, perform another step of 8 floats
if constexpr (residual >= 8) {
InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
}

// We dealt with the residual part. We are left with some multiple of 16 floats.
// In each iteration we calculate 16 floats = 512 bits.
do {
InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
} while (pVect1 < pEnd1);

return my_mm256_reduce_add_ps(sum256);
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Get dequantization parameters from the end of quantized vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
return 1.0f - ip * inv_norm;
}
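
To make the two fused operations concrete, here is a scalar equivalent of one lane of InnerProductStepSQ8_FMA (illustrative only; step_scalar is not part of the PR):

#include <cmath>
#include <cstdint>

float step_scalar(float v1, uint8_t q, float min_val, float delta, float acc) {
    // First FMA: dequantize -> q * delta + min_val (mirrors the first _mm256_fmadd_ps)
    float v2_dequant = std::fma(static_cast<float>(q), delta, min_val);
    // Second FMA: accumulate -> v1 * v2_dequant + acc (mirrors the second _mm256_fmadd_ps)
    return std::fma(v1, v2_dequant, acc);
}
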
107 changes: 107 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -0,0 +1,107 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/AVX_utils.h"

static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
const __m256 &min_val_vec, const __m256 &delta_vec) {
// Load 8 float elements from pVect1
__m256 v1 = _mm256_loadu_ps(pVect1);
pVect1 += 8;

// Load 8 uint8 elements from pVect2, convert to int32, then to float
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize: (val * delta) + min_val
__m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);

// Compute dot product and add to sum
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
// pVect2 is a quantized uint8_t vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float *pEnd1 = pVect1 + dimension;

// Get dequantization parameters from the end of quantized vector
const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
// Create broadcast vectors for SIMD operations
__m256 min_val_vec = _mm256_set1_ps(min_val);
__m256 delta_vec = _mm256_set1_ps(delta);

__m256 sum256 = _mm256_setzero_ps();

// Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
// 16-float block, so mask loading is guaranteed to be safe.
if constexpr (residual % 8) {
__mmask8 constexpr mask = (1 << (residual % 8)) - 1;
__m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
pVect1 += residual % 8;

// Load quantized values and dequantize
__m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
pVect2 += residual % 8;

// Zero-extend uint8 to int32
__m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);

// Convert int32 to float
__m256 v2_f = _mm256_cvtepi32_ps(v2_256);

// Dequantize: (val * delta) + min_val
__m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);

// Compute dot product with masking
sum256 = _mm256_mul_ps(v1, v2_dequant);
}

// If the remainder is >= 8, perform another step of 8 floats
if constexpr (residual >= 8) {
InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
}

// We dealt with the residual part. We are left with some multiple of 16 floats.
// In each iteration we calculate 16 floats = 512 bits.
do {
InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
} while (pVect1 < pEnd1);

return my_mm256_reduce_add_ps(sum256);
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
// Get dequantization parameters from the end of quantized vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
return 1.0f - ip * inv_norm;
}
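
Both headers instantiate their kernels for every residual in 0..15 (dimension mod 16). A minimal sketch of how such residual-templated kernels are typically selected at runtime; the table-building helper below is illustrative, not the project's actual selection macro:

#include <array>
#include <cstddef>
#include <utility>

using DistFn = float (*)(const void *, const void *, size_t);

template <unsigned char... Rs>
static constexpr std::array<DistFn, sizeof...(Rs)>
makeSQ8Table(std::integer_sequence<unsigned char, Rs...>) {
    return {SQ8_InnerProductSIMD16_AVX2<Rs>...};
}

// Pick the instantiation whose residual matches dim % 16.
DistFn chooseSQ8_IP_AVX2(size_t dim) {
    static constexpr auto table = makeSQ8Table(std::make_integer_sequence<unsigned char, 16>{});
    return table[dim % 16];
}
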