diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake
index 4976d60e6..dadd550a8 100644
--- a/cmake/x86_64InstructionFlags.cmake
+++ b/cmake/x86_64InstructionFlags.cmake
@@ -17,6 +17,7 @@ CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
 CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
 CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C)
 CHECK_CXX_COMPILER_FLAG(-mfma CXX_FMA)
+CHECK_CXX_COMPILER_FLAG(-msse4.1 CXX_SSE4)
 CHECK_CXX_COMPILER_FLAG(-msse3 CXX_SSE3)
 CHECK_CXX_COMPILER_FLAG(-msse CXX_SSE)
@@ -60,10 +61,18 @@ if(CXX_AVX2)
   add_compile_definitions(OPT_AVX2)
 endif()
 
+if(CXX_AVX2 AND CXX_FMA)
+  add_compile_definitions(OPT_AVX2_FMA)
+endif()
+
 if(CXX_AVX)
   add_compile_definitions(OPT_AVX)
 endif()
 
+if(CXX_SSE4)
+  add_compile_definitions(OPT_SSE4)
+endif()
+
 if(CXX_SSE3)
   add_compile_definitions(OPT_SSE3)
 endif()
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index 7ee0a5844..d88750e91 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -56,6 +56,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
         list(APPEND OPTIMIZATIONS functions/AVX2.cpp)
     endif()
 
+    if(CXX_AVX2 AND CXX_FMA)
+        message("Building with AVX2 and FMA")
+        set_source_files_properties(functions/AVX2_FMA.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
+        list(APPEND OPTIMIZATIONS functions/AVX2_FMA.cpp)
+    endif()
+
     if(CXX_F16C AND CXX_FMA AND CXX_AVX)
         message("Building with CXX_F16C")
         set_source_files_properties(functions/F16C.cpp PROPERTIES COMPILE_FLAGS "-mf16c -mfma -mavx")
@@ -74,6 +80,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
         list(APPEND OPTIMIZATIONS functions/SSE3.cpp)
     endif()
 
+    if(CXX_SSE4)
+        message("Building with SSE4")
+        set_source_files_properties(functions/SSE4.cpp PROPERTIES COMPILE_FLAGS -msse4.1)
+        list(APPEND OPTIMIZATIONS functions/SSE4.cpp)
+    endif()
+
     if(CXX_SSE)
         message("Building with SSE")
         set_source_files_properties(functions/SSE.cpp PROPERTIES COMPILE_FLAGS -msse)
diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index 3b9253a21..5e2c4b4dc 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -14,6 +14,43 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
+float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
+                                 float min_val, float delta, float inv_norm) {
+    float res = 0;
+    for (size_t i = 0; i < dimension; i++) {
+        float dequantized_V2 = (pVect2v[i] * delta + min_val);
+        res += pVect1v[i] * dequantized_V2;
+    }
+    return res * inv_norm;
+}
+
+float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // pVect2 is a quantized uint8_t vector that must be dequantized before multiplying.
+    // It is structured as [quantized values (uint8_t * dim)][min_val (float)][delta (float)]
+    // [inv_norm (float)]. min_val and delta are used to dequantize the values; inv_norm is
+    // only needed for cosine.
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Compute inner product with dequantization
+    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, 1.0f);
+    return 1.0f - res;
+}
+
+float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+    // Compute inner product with dequantization
+    const float res =
+        FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta, inv_norm);
+    return 1.0f - res;
+}
+
 float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) {
     auto *vec1 = (float *)pVect1;
     auto *vec2 = (float *)pVect2;
diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h
index e38a1c22f..d4796cbd6 100644
--- a/src/VecSim/spaces/IP/IP.h
+++ b/src/VecSim/spaces/IP/IP.h
@@ -10,6 +10,12 @@
 #include
 
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
+float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);
+
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
+float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);
+
 float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
 
 double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
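For reviewers: every kernel in this PR assumes the same SQ8 blob layout: `dim` quantized bytes followed by `min_val`, `delta`, and `inv_norm` packed as floats. A minimal sketch of how such a blob could be produced (the helper name `quantize_sq8` and the round-to-nearest policy are illustrative assumptions, not code from this diff):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical helper: quantize a float vector into the SQ8 layout described above.
std::vector<uint8_t> quantize_sq8(const float *v, size_t dim) {
    const float min_val = *std::min_element(v, v + dim);
    const float max_val = *std::max_element(v, v + dim);
    float delta = (max_val - min_val) / 255.0f;
    if (delta == 0.0f)
        delta = 1.0f; // constant vector; any delta reconstructs min_val exactly
    float sq_norm = 0.0f;
    std::vector<uint8_t> blob(dim + 3 * sizeof(float));
    for (size_t i = 0; i < dim; i++) {
        blob[i] = static_cast<uint8_t>(std::lround((v[i] - min_val) / delta));
        sq_norm += v[i] * v[i];
    }
    const float inv_norm = 1.0f / std::sqrt(sq_norm);
    // Append min_val, delta and inv_norm after the quantized bytes.
    std::memcpy(blob.data() + dim, &min_val, sizeof(float));
    std::memcpy(blob.data() + dim + sizeof(float), &delta, sizeof(float));
    std::memcpy(blob.data() + dim + 2 * sizeof(float), &inv_norm, sizeof(float));
    return blob;
}
```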
+ */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, + __m256 &sum256, const __m256 &min_val_vec, + const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize and compute dot product in one step using FMA + // (val * delta) + min_val -> v2_dequant + // sum256 += v1 * v2_dequant + // Using FMA: sum256 = v1 * v2_dequant + sum256 + + // First, compute v2_dequant = v2_f * delta_vec + min_val_vec + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Then, compute sum256 += v1 * v2_dequant using FMA + sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256); +} + +template // 0..15 +float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + // pVect2 is a quantized uint8_t vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min_val); + __m256 delta_vec = _mm256_set1_ps(delta); + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one + // 16-float block, so mask loading is guaranteed to be safe. + if constexpr (residual % 8) { + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load quantized values and dequantize + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += residual % 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize using FMA: (val * delta) + min_val + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product with masking + sum256 = _mm256_mul_ps(v1, v2_dequant); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. 
+    do {
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum256);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Get dequantization parameters from the end of quantized vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Calculate inner product using the common implementation
+    float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - ip * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
new file mode 100644
index 000000000..89b1c0b6b
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
+                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // Load 8 float elements from pVect1
+    __m256 v1 = _mm256_loadu_ps(pVect1);
+    pVect1 += 8;
+
+    // Load 8 uint8 elements from pVect2, convert to int32, then to float
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    pVect2 += 8;
+
+    // Zero-extend uint8 to int32
+    __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+    // Dequantize: (val * delta) + min_val
+    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute dot product and add to sum
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    // pVect2 is a quantized uint8_t vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Create broadcast vectors for SIMD operations
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
+    __m256 delta_vec = _mm256_set1_ps(delta);
+
+    __m256 sum256 = _mm256_setzero_ps();
+
+    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
+    // 16-float block, so mask loading is guaranteed to be safe.
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += residual % 8;
+
+        // Load quantized values and dequantize
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        pVect2 += residual % 8;
+
+        // Zero-extend uint8 to int32
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+        // Dequantize: (val * delta) + min_val
+        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+        // Compute dot product; the masked load already zeroed the unused lanes of v1
+        sum256 = _mm256_mul_ps(v1, v2_dequant);
+    }
+
+    // If the remainder is >=8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 16 floats.
+    // In each iteration we calculate 16 floats = 512 bits.
+    do {
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum256);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Get dequantization parameters from the end of quantized vector
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Calculate inner product using the common implementation
+    float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - ip * inv_norm;
+}
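The AVX2 kernels above hinge on one conversion chain: load 8 bytes, zero-extend to 32-bit lanes, convert to float, dequantize. A standalone sketch of that chain checked against scalar math (compile with -mavx2; values are arbitrary and only for illustration):

```cpp
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t q[8] = {0, 1, 2, 3, 252, 253, 254, 255};
    const float min_val = -1.0f, delta = 2.0f / 255.0f;
    // Same chain as InnerProductStepSQ8: 8 bytes -> epi32 -> ps -> dequantize.
    __m128i v2_128 = _mm_loadl_epi64((const __m128i *)q);
    __m256 v2_f = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(v2_128));
    __m256 deq =
        _mm256_add_ps(_mm256_mul_ps(v2_f, _mm256_set1_ps(delta)), _mm256_set1_ps(min_val));
    float out[8];
    _mm256_storeu_ps(out, deq);
    for (int i = 0; i < 8; i++)
        printf("simd=%f scalar=%f\n", out[i], q[i] * delta + min_val);
    return 0;
}
```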
+ */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include +#include + +static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, + const __m512 &min_val_vec, const __m512 &delta_vec) { + // Load 16 float elements from pVec1 + __m512 v1 = _mm512_loadu_ps(pVec1); + + // Load 16 uint8 elements from pVec2 and convert to __m512i + __m128i v2_128 = _mm_loadu_si128((__m128i *)pVec2); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + + // Convert uint8 to float + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize: (val * delta) + min_val + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product and add to sum + sum = _mm512_fmadd_ps(v1, dequantized, sum); + + // Advance pointers + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for both inner product and cosine similarity +template // 0..15 +float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, + float inv_norm = 1.0f) { + const float *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const float *pEnd1 = pVec1 + dimension; + + // Get dequantization parameters from the end of pVec2 + const float min_val = *reinterpret_cast(pVec2 + dimension); + const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + __m512 min_val_vec = _mm512_set1_ps(min_val); + __m512 delta_vec = _mm512_set1_ps(delta); + + // Initialize sum accumulator + __m512 sum = _mm512_setzero_ps(); + + // Deal with remainder first + if constexpr (residual > 0) { + // Handle less than 16 elements + __mmask16 mask = (1U << residual) - 1; + + // Load masked float elements + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); + + // Load full uint8 elements - we know that the first 16 elements are safe to load + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Dequantize + __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Compute dot product + __m512 product = _mm512_mul_ps(v1, dequantized); + + // Apply mask to product and add to sum + sum = _mm512_fmadd_ps(sum, sum, product); + + pVec1 += residual; + pVec2 += residual; + } + + // Process remaining full chunks of 16 elements + do { + SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); + } while (pVec1 < pEnd1); + + // Return the raw inner product result + return _mm512_reduce_add_ps(sum); +} + +template // 0..15 +float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Calculate inner product using common implementation + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + + // The inner product similarity is 1 - ip + return 1.0f - ip; +} + +template // 0..15 +float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Get the inverse norm factor stored after min_val and delta + const uint8_t *pVec2 = static_cast(pVec2v); + const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); + + // Calculate inner product using common implementation with normalization + float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension, inv_norm); + + // The cosine similarity is 1 - ip + return 1.0f - ip; +} diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h new file mode 100644 index 000000000..3e632dcdb --- 
diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
new file mode 100644
index 000000000..3e632dcdb
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum,
+                                    const float32x4_t &min_val_vec, const float32x4_t &delta_vec) {
+    // Load 4 float elements from pVect1
+    float32x4_t v1 = vld1q_f32(pVect1);
+    pVect1 += 4;
+
+    // Load 8 uint8 elements from pVect2 (only the low 4 are consumed below)
+    uint8x8_t v2_u8 = vld1_u8(pVect2);
+    pVect2 += 4;
+
+    // Convert uint8 to uint32
+    uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8)));
+
+    // Convert uint32 to float32
+    float32x4_t v2_f = vcvtq_f32_u32(v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec);
+
+    // Compute dot product and add to sum
+    sum = vmlaq_f32(sum, v1, v2_dequant);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    float32x4_t min_val_vec = vdupq_n_f32(min_val);
+    float32x4_t delta_vec = vdupq_n_f32(delta);
+
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    const size_t num_of_chunks = dimension / 16;
+
+    // Process 16 elements at a time in the main loop
+    for (size_t i = 0; i < num_of_chunks; i++) {
+        InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+        InnerProductStep(pVect1, pVect2, sum3, min_val_vec, delta_vec);
+    }
+
+    // Handle remaining complete 4-float blocks within residual
+    if constexpr (residual >= 4) {
+        InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 8) {
+        InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec);
+    }
+    if constexpr (residual >= 12) {
+        InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec);
+    }
+
+    // Handle final residual elements (0-3 elements)
+    constexpr size_t final_residual = residual % 4;
+    if constexpr (final_residual > 0) {
+        float32x4_t v1 = vdupq_n_f32(0.0f);
+        float32x4_t v2_dequant = vdupq_n_f32(0.0f);
+
+        if constexpr (final_residual >= 1) {
+            v1 = vld1q_lane_f32(pVect1, v1, 0);
+            float dequant0 = pVect2[0] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0);
+        }
+        if constexpr (final_residual >= 2) {
+            v1 = vld1q_lane_f32(pVect1 + 1, v1, 1);
+            float dequant1 = pVect2[1] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1);
+        }
+        if constexpr (final_residual >= 3) {
+            v1 = vld1q_lane_f32(pVect1 + 2, v1, 2);
+            float dequant2 = pVect2[2] * delta + min_val;
+            v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2);
+        }
+
+        sum3 = vmlaq_f32(sum3, v1, v2_dequant);
+    }
+
+    // Combine all four sum accumulators
+    float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3));
+
+    // Horizontal sum of the 4 elements in the combined NEON register
+    float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined));
+    float32x2_t summed = vpadd_f32(sum_halves, sum_halves);
+    float sum = vget_lane_f32(summed, 0);
+
+    return sum;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD16_NEON_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Compute inner product with dequantization using the common function
+    const float res = SQ8_InnerProductSIMD16_NEON_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - res * inv_norm;
+}
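Same idea on the NEON path: `vld1_u8` pulls 8 bytes even though each step only consumes 4 lanes, which is why the kernel may over-read by up to 4 bytes (safe here because the parameter floats follow the quantized data). A standalone sketch of the widening chain (AArch64 only; values are arbitrary):

```cpp
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t q[8] = {10, 20, 30, 40, 0, 0, 0, 0};
    const float min_val = -1.0f, delta = 0.5f;
    // 8 bytes loaded, low 4 widened: u8 -> u16 -> u32 -> f32, then dequantize.
    uint8x8_t v_u8 = vld1_u8(q);
    uint32x4_t v_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v_u8)));
    float32x4_t v_f = vcvtq_f32_u32(v_u32);
    float32x4_t deq = vmlaq_f32(vdupq_n_f32(min_val), v_f, vdupq_n_f32(delta));
    float out[4];
    vst1q_f32(out, deq);
    for (int i = 0; i < 4; i++)
        printf("simd=%f scalar=%f\n", out[i], q[i] * delta + min_val);
    return 0;
}
```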
diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
new file mode 100644
index 000000000..5e47af2b6
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+#include
+
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod,
+                                    const __m128 &min_val_vec, const __m128 &delta_vec) {
+    // Load 4 float elements from pVect1
+    __m128 v1 = _mm_loadu_ps(pVect1);
+    pVect1 += 4;
+
+    // Load 4 uint8 elements from pVect2, convert to int32, then to float
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2));
+    pVect2 += 4;
+
+    // Convert int32 to float
+    __m128 v2_f = _mm_cvtepi32_ps(v2_i);
+
+    // Dequantize: (val * delta) + min_val
+    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute dot product and add to sum
+    sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant));
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *quantized = static_cast<const uint8_t *>(pVect2v);
+
+    // Get dequantization parameters from the end of quantized vector
+    float min = *(float *)(quantized + dimension);
+    float delta = *(float *)(quantized + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m128 min_val_vec = _mm_set1_ps(min);
+    __m128 delta_vec = _mm_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m128 sum = _mm_setzero_ps();
+
+    // Process residual elements if needed
+    if constexpr (residual) {
+        // Handle residual elements (1-3)
+        if constexpr (residual % 4) {
+            __m128 v1;
+            __m128 v2_dequant;
+
+            if constexpr (residual % 4 == 3) {
+                // Set 3 floats and the last one to 0
+                v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
+
+                // Dequantize and set 3 values
+                v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min,
+                                        quantized[1] * delta + min, quantized[0] * delta + min);
+
+            } else if constexpr (residual % 4 == 2) {
+                // Set 2 floats and the last two to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
+
+                // Dequantize and set 2 values
+                v2_dequant =
+                    _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min, quantized[0] * delta + min);
+
+            } else if constexpr (residual % 4 == 1) {
+                // Set 1 float and the last three to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]);
+
+                // Dequantize and set 1 value
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min);
+            }
+
+            pVect1 += residual % 4;
+            quantized += residual % 4;
+            sum = _mm_mul_ps(v1, v2_dequant);
+        }
+    }
+
+    // Process 4 elements at a time
+    while (pVect1 < pEnd1) {
+        InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+    }
+
+    // TmpRes must be 16 bytes aligned.
+    float PORTABLE_ALIGN16 TmpRes[4];
+    _mm_store_ps(TmpRes, sum);
+    float result = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+
+    return result;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD16_SSE4_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // Get quantization parameters
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Compute inner product with dequantization using the common function
+    const float res = SQ8_InnerProductSIMD16_SSE4_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - res * inv_norm;
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE_FP32.h b/src/VecSim/spaces/IP/IP_SVE_FP32.h
index c60acb16a..c1cc79ccd 100644
--- a/src/VecSim/spaces/IP/IP_SVE_FP32.h
+++ b/src/VecSim/spaces/IP/IP_SVE_FP32.h
@@ -11,13 +11,13 @@
 #include
 
 static inline void InnerProductStep(float *&pVect1, float *&pVect2, size_t &offset,
-                                    svfloat32_t &sum) {
+                                    svfloat32_t &sum, const size_t chunk) {
     svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset);
     svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset);
 
     sum = svmla_f32_x(svptrue_b32(), sum, v1, v2);
 
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -26,33 +26,33 @@ float FP32_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t
     float *pVect2 = (float *)pVect2v;
     size_t offset = 0;
 
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     svfloat32_t sum0 = svdup_f32(0.0f);
     svfloat32_t sum1 = svdup_f32(0.0f);
     svfloat32_t sum2 = svdup_f32(0.0f);
     svfloat32_t sum3 = svdup_f32(0.0f);
 
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     const size_t number_of_chunks = dimension / chunk_size;
 
     for (size_t i = 0; i < number_of_chunks; i++) {
-        InnerProductStep(pVect1, pVect2, offset, sum0);
-        InnerProductStep(pVect1, pVect2, offset, sum1);
-        InnerProductStep(pVect1, pVect2, offset, sum2);
-        InnerProductStep(pVect1, pVect2, offset, sum3);
+        InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum2, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
     }
 
     // Process remaining complete SVE vectors that didn't fit into the main loop
     // These are full vector operations (0-3 elements)
     if constexpr (additional_steps > 0) {
         if constexpr (additional_steps >= 1) {
-            InnerProductStep(pVect1, pVect2, offset, sum0);
+            InnerProductStep(pVect1, pVect2, offset, sum0, chunk);
         }
         if constexpr (additional_steps >= 2) {
-            InnerProductStep(pVect1, pVect2, offset, sum1);
+            InnerProductStep(pVect1, pVect2, offset, sum1, chunk);
         }
         if constexpr (additional_steps >= 3) {
-            InnerProductStep(pVect1, pVect2, offset, sum3);
+            InnerProductStep(pVect1, pVect2, offset, sum3, chunk);
         }
     }
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
new file mode 100644
index 000000000..7b9bd86bc
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+#include
+#include
+
+static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
+                                    svfloat32_t &sum, const svfloat32_t &min_val_vec,
+                                    const svfloat32_t &delta_vec, const size_t chunk) {
+    svbool_t pg = svptrue_b32();
+
+    // Load float elements from pVect1
+    svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
+
+    // LD1UB: load 8-bit elements and zero-extend each into a 32-bit lane
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
+
+    // Convert uint32 to float32
+    svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec);
+
+    // Compute dot product and add to sum
+    sum = svmla_f32_x(pg, sum, v1, v2_dequant);
+
+    // Move to the next set of elements
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    size_t offset = 0;
+
+    // Get dequantization parameters from the end of quantized vector
+    float min = *(float *)(pVect2 + dimension);
+    float delta = *(float *)(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    svbool_t pg = svptrue_b32();
+    svfloat32_t min_val_vec = svdup_f32(min);
+    svfloat32_t delta_vec = svdup_f32(delta);
+
+    // Get the number of 32-bit elements per vector at runtime
+    uint64_t chunk = svcntw();
+
+    // Multiple accumulators to increase instruction-level parallelism
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Handle partial chunk if needed
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            // Create predicate for the remaining elements
+            svbool_t pg_partial =
+                svwhilelt_b32(static_cast<uint32_t>(0), static_cast<uint32_t>(remaining));
+
+            // Load float elements from pVect1 with predicate
+            svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
+
+            // Load 8-bit bytes from pVect2 + offset and zero-extend each into a 32-bit lane
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);
+
+            // Convert uint32 to float32
+            svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
+
+            // Dequantize: (val * delta) + min_val
+            svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec);
+
+            // Compute dot product and add to sum
+            sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant);
+
+            // Move pointers past the partial chunk
+            offset += remaining;
+        }
+    }
+
+    // Process 4 chunks at a time in the main loop
+    auto chunk_size = 4 * chunk;
+    const size_t number_of_chunks =
+        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
+
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+        InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk);
+    }
+
+    // Handle remaining steps (0-3)
+    if constexpr (additional_steps > 0) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 1) {
+        InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 2) {
+        InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+    }
+
+    // Combine the accumulators
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
+
+    // Horizontal sum of all elements in the vector
+    float result = svaddv_f32(pg, sum);
+
+    return result;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v,
+                                                                                dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters
+    const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));
+
+    // Compute inner product with dequantization using the common function
+    const float res =
+        SQ8_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+
+    // For cosine, we need to account for the vector norms
+    // The inv_norm parameter is stored after min_val and delta in the quantized vector
+    return 1.0f - res * inv_norm;
+}
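Usage-wise, the new getters below mirror the existing FP32 ones: resolve a `dist_func_t<float>` once for a given dimension, then call it with the raw query and the stored SQ8 blob. A caller-side sketch (not part of this diff; assumes `dist_func_t` is visible via `spaces.h` as for the other getters):

```cpp
#include "VecSim/spaces/IP_space.h"
#include <cstdint>

float sq8_ip_distance(const float *query, const uint8_t *sq8_blob, size_t dim) {
    // Picks the best implementation for the current CPU (SVE2/SVE/NEON on ARM,
    // AVX512/AVX2+FMA/AVX2/SSE4 on x86), falling back to the scalar kernel.
    spaces::dist_func_t<float> dist = spaces::IP_SQ8_GetDistFunc(dim);
    return dist(query, sq8_blob, dim);
}
```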
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index ef8b6a7c5..d24c1d142 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -20,7 +20,9 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
 #include "VecSim/spaces/functions/NEON_HP.h"
@@ -33,6 +35,121 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
 namespace spaces {
+dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_InnerProduct;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_IP_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_IP_implementation_NEON(dim);
+    }
+#endif
+
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        return Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        return Choose_SQ8_IP_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2) {
+        return Choose_SQ8_IP_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1) {
+        return Choose_SQ8_IP_implementation_SSE4(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
+dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment,
+                                          const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_Cosine;
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_Cosine_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_Cosine_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_Cosine_implementation_NEON(dim);
+    }
+#endif
+
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        return Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        return Choose_SQ8_Cosine_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2) {
+        return Choose_SQ8_Cosine_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1) {
+        return Choose_SQ8_Cosine_implementation_SSE4(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
     if (alignment == nullptr) {
diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h
index b976e9580..db2d0b2d9 100644
--- a/src/VecSim/spaces/IP_space.h
+++ b/src/VecSim/spaces/IP_space.h
@@ -10,6 +10,9 @@
 #include "VecSim/spaces/spaces.h"
 
 namespace spaces {
+dist_func_t<float> IP_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                      const void *arch_opt = nullptr);
+
 dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                        const void *arch_opt = nullptr);
 dist_func_t<double> IP_FP64_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                         const void *arch_opt = nullptr);
@@ -26,4 +29,6 @@ dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                         const void *arch_opt = nullptr);
 dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                             const void *arch_opt = nullptr);
+dist_func_t<float> Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                          const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 47ad551df..a68ea5114 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -10,10 +10,29 @@
 #include "VecSim/types/bfloat16.h"
 #include "VecSim/types/float16.h"
 #include
+#include
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
+float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // pVect2 is a quantized uint8_t vector that must be dequantized before computing the
+    // squared differences. It is structured as [quantized values (uint8_t * dim)][min_val
+    // (float)][delta (float)][inv_norm (float)]; min_val and delta are used to dequantize
+    // the values (inv_norm is not needed for L2).
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    float res = 0;
+    for (size_t i = 0; i < dimension; i++) {
+        auto dequantized_V2 = (pVect2[i] * delta + min_val);
+        float t = pVect1[i] - dequantized_V2;
+        res += t * t;
+    }
+    return res;
+}
+
 float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
     float *vec1 = (float *)pVect1v;
     float *vec2 = (float *)pVect2v;
diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h
index 9b796f943..ba3e64207 100644
--- a/src/VecSim/spaces/L2/L2.h
+++ b/src/VecSim/spaces/L2/L2.h
@@ -10,6 +10,9 @@
 #include
 
+// pVect1v vector of type fp32 and pVect2v vector of type uint8
+float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
+
 float FP32_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
 
 double FP64_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
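A quick way to sanity-check the scalar kernel: build a tiny SQ8 blob by hand and compare `SQ8_L2Sqr` against the exact FP32 distance; the two should agree to within quantization error (roughly `delta` per component). Illustrative harness, not part of this diff:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar reference from L2.cpp above, declared here for the sketch.
float SQ8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);

int main() {
    const size_t dim = 4;
    float query[dim] = {0.1f, 0.2f, 0.3f, 0.4f};
    float stored[dim] = {0.4f, 0.3f, 0.2f, 0.1f};
    // Quantize `stored` into the SQ8 layout: bytes, then min_val, delta, inv_norm.
    float min_val = 0.1f, max_val = 0.4f, delta = (max_val - min_val) / 255.0f;
    uint8_t blob[dim + 3 * sizeof(float)];
    float sq_norm = 0;
    for (size_t i = 0; i < dim; i++) {
        blob[i] = (uint8_t)std::lround((stored[i] - min_val) / delta);
        sq_norm += stored[i] * stored[i];
    }
    float inv_norm = 1.0f / std::sqrt(sq_norm);
    std::memcpy(blob + dim, &min_val, sizeof(float));
    std::memcpy(blob + dim + sizeof(float), &delta, sizeof(float));
    std::memcpy(blob + dim + 2 * sizeof(float), &inv_norm, sizeof(float));

    float exact = 0;
    for (size_t i = 0; i < dim; i++) {
        float t = query[i] - stored[i];
        exact += t * t;
    }
    printf("sq8=%f exact=%f\n", SQ8_L2Sqr(query, blob, dim), exact);
    return 0;
}
```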
+ */ +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/AVX_utils.h" + +static inline void L2StepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, + const __m256 &min_val_vec, const __m256 &delta_vec) { + // Load 8 float elements from pVect1 + __m256 v1 = _mm256_loadu_ps(pVect1); + pVect1 += 8; + + // Load 8 uint8 elements from pVect2, convert to int32, then to float + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize: v2_dequant = v2_f * delta_vec + min_val_vec + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + + // Calculate squared difference - simple and efficient approach + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + + // Use FMA for diff² + sum in one instruction + sum256 = _mm256_fmadd_ps(diff, diff, sum256); +} + +template // 0..15 +float SQ8_L2SqrSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + // pVect2 is a quantized uint8_t vector + const uint8_t *pVect2 = static_cast(pVect2v); + const float *pEnd1 = pVect1 + dimension; + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + // Create broadcast vectors for SIMD operations + __m256 min_val_vec = _mm256_set1_ps(min_val); + __m256 delta_vec = _mm256_set1_ps(delta); + + __m256 sum256 = _mm256_setzero_ps(); + + // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one + // 16-float block, so mask loading is guaranteed to be safe. + if constexpr (residual % 8) { + __mmask8 constexpr mask = (1 << (residual % 8)) - 1; + __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); + pVect1 += residual % 8; + + // Load quantized values and dequantize + __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + pVect2 += residual % 8; + + // Zero-extend uint8 to int32 + __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); + + // Convert int32 to float + __m256 v2_f = _mm256_cvtepi32_ps(v2_256); + + // Dequantize using FMA: (val * delta) + min_val + __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); + v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask); + + // Calculate squared difference + __m256 diff = _mm256_sub_ps(v1, v2_dequant); + sum256 = _mm256_mul_ps(diff, diff); + } + + // If the reminder is >=8, have another step of 8 floats + if constexpr (residual >= 8) { + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } + + // We dealt with the residual part. We are left with some multiple of 16 floats. + // In each iteration we calculate 16 floats = 512 bits. + do { + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + L2StepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + } while (pVect1 < pEnd1); + + return my_mm256_reduce_add_ps(sum256); +} diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h new file mode 100644 index 000000000..bdde99e62 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. 
diff --git a/src/VecSim/spaces/L2/L2_AVX2_SQ8.h b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
new file mode 100644
index 000000000..bdde99e62
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX2_SQ8.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/AVX_utils.h"
+
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum,
+                             const __m256 &min_val_vec, const __m256 &delta_vec) {
+    // Load 8 float elements from pVect1
+    __m256 v1 = _mm256_loadu_ps(pVect1);
+
+    // Load 8 uint8 elements from pVect2
+    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+
+    // Zero-extend uint8 to int32
+    __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+    // Dequantize: (val * delta) + min_val
+    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute difference
+    __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+
+    // Square difference and add to sum
+    sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
+
+    // Advance pointers
+    pVect1 += 8;
+    pVect2 += 8;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Create broadcast vectors for SIMD operations
+    __m256 min_val_vec = _mm256_set1_ps(min_val);
+    __m256 delta_vec = _mm256_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m256 sum = _mm256_setzero_ps();
+
+    // Deal with 1-7 floats with mask loading, if needed
+    if constexpr (residual % 8) {
+        __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
+        __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
+        pVect1 += residual % 8;
+
+        // Direct load - safe because we only process the masked elements
+        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        pVect2 += residual % 8;
+
+        // Zero-extend uint8 to int32
+        __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
+
+        // Convert int32 to float
+        __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
+
+        // Dequantize: (val * delta) + min_val
+        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
+
+        // Apply mask to zero out unused elements
+        v2_dequant = _mm256_blend_ps(_mm256_setzero_ps(), v2_dequant, mask);
+
+        __m256 diff = _mm256_sub_ps(v1, v2_dequant);
+        sum = _mm256_mul_ps(diff, diff);
+    }
+
+    // If the remainder is >= 8, have another step of 8 floats
+    if constexpr (residual >= 8) {
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 16 floats.
+    // In each iteration we calculate 16 floats = 512 bits.
+    do {
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    return my_mm256_reduce_add_ps(sum);
+}
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
new file mode 100644
index 000000000..d2775f5be
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+
+// Helper function to perform L2 squared distance calculation for a chunk of 16 elements
+static inline void SQ8_L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m512 &sum,
+                                 const __m512 &min_val_vec, const __m512 &delta_vec) {
+    // Load 16 float elements from pVect1
+    __m512 v1 = _mm512_loadu_ps(pVect1);
+
+    // Load 16 uint8 elements from pVect2 and convert to __m512i
+    __m128i v2_128 = _mm_loadu_si128((__m128i *)pVect2);
+    __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+
+    // Convert int32 to float
+    __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+    // Dequantize: (val * delta) + min_val
+    __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+    // Compute difference
+    __m512 diff = _mm512_sub_ps(v1, dequantized);
+
+    // Square difference and add to sum
+    sum = _mm512_fmadd_ps(diff, diff, sum);
+
+    // Advance pointers
+    pVect1 += 16;
+    pVect2 += 16;
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                         size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    const float *pEnd1 = pVect1 + dimension;
+
+    // Get dequantization parameters from the end of pVect2
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m512 min_val_vec = _mm512_set1_ps(min_val);
+    __m512 delta_vec = _mm512_set1_ps(delta);
+
+    // Initialize sum accumulator
+    __m512 sum = _mm512_setzero_ps();
+
+    // Handle residual elements (0 to 15)
+    if constexpr (residual > 0) {
+        // Create mask for residual elements
+        __mmask16 mask = (1U << residual) - 1;
+
+        // Load masked float elements from pVect1
+        __m512 v1 = _mm512_maskz_loadu_ps(mask, pVect1);
+
+        // Load masked uint8 elements from pVect2
+        __m128i v2_128 = _mm_maskz_loadu_epi8(mask, reinterpret_cast<const __m128i *>(pVect2));
+        __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
+        __m512 v2_f = _mm512_cvtepi32_ps(v2_512);
+
+        // Dequantize: (val * delta) + min_val
+        __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec);
+
+        // Compute difference
+        __m512 diff = _mm512_sub_ps(v1, dequantized);
+
+        // Square difference and add to sum (with mask)
+        __m512 squared = _mm512_mul_ps(diff, diff);
+        sum = _mm512_mask_add_ps(sum, mask, sum, squared);
+
+        // Advance pointers
+        pVect1 += residual;
+        pVect2 += residual;
+    }
+
+    // Process remaining full chunks of 16 elements
+    do {
+        SQ8_L2SqrStep(pVect1, pVect2, sum, min_val_vec, delta_vec);
+    } while (pVect1 < pEnd1);
+
+    // Horizontal sum
+    float result = _mm512_reduce_add_ps(sum);
+
+    return result;
+}
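The AVX512 variants lean on k-masks instead of the set_ps/blend games the narrower ISAs need: a single mask limits both the float load and the accumulation to the residual lanes. A minimal standalone sketch of that pattern (compile with -mavx512f -mavx512vl -mavx512bw; values are arbitrary):

```cpp
#include <immintrin.h>
#include <cstdio>

int main() {
    float v[16] = {1, 2, 3, 4, 5}; // only 5 "residual" elements are meaningful
    const unsigned residual = 5;
    __mmask16 mask = (1U << residual) - 1;
    __m512 x = _mm512_maskz_loadu_ps(mask, v);   // inactive lanes are zeroed
    __m512 sq = _mm512_maskz_mul_ps(mask, x, x); // masked squares
    printf("sum of squares = %f\n", _mm512_reduce_add_ps(sq)); // 1+4+9+16+25 = 55
    return 0;
}
```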
+ */ +#include "VecSim/spaces/space_includes.h" +#include + +static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, + const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { + // Load 4 float elements from pVect1 + float32x4_t v1 = vld1q_f32(pVect1); + pVect1 += 4; + + // Load 4 uint8 elements from pVect2 + uint8x8_t v2_u8 = vld1_u8(pVect2); + pVect2 += 4; + + // Convert uint8 to uint32 + uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); + + // Convert uint32 to float32 + float32x4_t v2_f = vcvtq_f32_u32(v2_u32); + + // Dequantize: (val * delta) + min_val + float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec); + + // Compute difference + float32x4_t diff = vsubq_f32(v1, v2_dequant); + + // Square difference and add to sum + sum = vmlaq_f32(sum, diff, diff); +} + +template // 0..15 +float SQ8_L2SqrSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *pVect1 = static_cast(pVect1v); + const uint8_t *pVect2 = static_cast(pVect2v); + + // Get dequantization parameters from the end of quantized vector + const float min_val = *reinterpret_cast(pVect2 + dimension); + const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); + + // Create broadcast vectors for SIMD operations + float32x4_t min_val_vec = vdupq_n_f32(min_val); + float32x4_t delta_vec = vdupq_n_f32(delta); + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + + // Process 16 elements at a time in the main loop + for (size_t i = 0; i < num_of_chunks; i++) { + L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + L2SqrStep(pVect1, pVect2, sum3, min_val_vec, delta_vec); + } + + // Handle remaining complete 4-float blocks within residual + if constexpr (residual >= 4) { + L2SqrStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + } + if constexpr (residual >= 8) { + L2SqrStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + } + if constexpr (residual >= 12) { + L2SqrStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + } + + // Handle final residual elements (0-3 elements) + constexpr size_t final_residual = residual % 4; + if constexpr (final_residual > 0) { + float32x4_t v1 = vdupq_n_f32(0.0f); + float32x4_t v2_dequant = vdupq_n_f32(0.0f); + + if constexpr (final_residual >= 1) { + v1 = vld1q_lane_f32(pVect1, v1, 0); + float dequant0 = pVect2[0] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0); + } + if constexpr (final_residual >= 2) { + v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); + float dequant1 = pVect2[1] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1); + } + if constexpr (final_residual >= 3) { + v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); + float dequant2 = pVect2[2] * delta + min_val; + v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2); + } + + float32x4_t diff = vsubq_f32(v1, v2_dequant); + sum3 = vmlaq_f32(sum3, diff, diff); + } + + // Combine all four sum accumulators + float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); + + // Horizontal sum of the 4 elements in the combined NEON register + float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); + float32x2_t summed = vpadd_f32(sum_halves, sum_halves); + float sum 
diff --git a/src/VecSim/spaces/L2/L2_SSE4_SQ8.h b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
new file mode 100644
index 000000000..cd36a4e91
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SSE4_SQ8.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of (a) the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include
+
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum,
+                             const __m128 &min_val_vec, const __m128 &delta_vec) {
+    // Load 4 float elements from pVect1
+    __m128 v1 = _mm_loadu_ps(pVect1);
+    pVect1 += 4;
+
+    // Load 4 uint8 elements from pVect2, convert to int32, then to float
+    __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2));
+    pVect2 += 4;
+
+    // Convert int32 to float
+    __m128 v2_f = _mm_cvtepi32_ps(v2_i);
+
+    // Dequantize: (val * delta) + min_val
+    __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec);
+
+    // Compute difference
+    __m128 diff = _mm_sub_ps(v1, v2_dequant);
+
+    // Square difference and add to sum
+    sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+}
+
+template <unsigned char residual> // 0..15
+float SQ8_L2SqrSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *quantized = static_cast<const uint8_t *>(pVect2v);
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(quantized + dimension);
+    const float delta = *reinterpret_cast<const float *>(quantized + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    __m128 min_val_vec = _mm_set1_ps(min_val);
+    __m128 delta_vec = _mm_set1_ps(delta);
+
+    const float *pEnd1 = pVect1 + dimension;
+
+    __m128 sum = _mm_setzero_ps();
+
+    // Process residual elements if needed
+    if constexpr (residual) {
+        // Handle residual elements (1-3)
+        if constexpr (residual % 4) {
+            __m128 v1;
+            __m128 v2_dequant;
+
+            if constexpr (residual % 4 == 3) {
+                // Set 3 floats and the last one to 0
+                v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]);
+
+                // Dequantize and set 3 values
+                v2_dequant =
+                    _mm_set_ps(0.0f, quantized[2] * delta + min_val, quantized[1] * delta + min_val,
+                               quantized[0] * delta + min_val);
+
+            } else if constexpr (residual % 4 == 2) {
+                // Set 2 floats and the last two to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]);
+
+                // Dequantize and set 2 values
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min_val,
+                                        quantized[0] * delta + min_val);
+
+            } else if constexpr (residual % 4 == 1) {
+                // Set 1 float and the last three to 0
+                v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]);
+
+                // Dequantize and set 1 value
+                v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min_val);
+            }
+
+            pVect1 += residual % 4;
+            quantized += residual % 4;
+
+            // Compute difference
+            __m128 diff = _mm_sub_ps(v1, v2_dequant);
+
+            // Square difference and initialize sum
+            sum = _mm_mul_ps(diff, diff);
+        }
+
+        // Process remaining blocks of 4 elements based on residual
+        if constexpr (residual >= 12)
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        if constexpr (residual >= 8)
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        if constexpr (residual >= 4)
+            L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+    }
+    }
+
+    // Process 16 elements at a time (4 elements per step, 4 steps)
+    while (pVect1 < pEnd1) {
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+        L2SqrStep(pVect1, quantized, sum, min_val_vec, delta_vec);
+    }
+
+    // TmpRes must be 16 bytes aligned
+    float PORTABLE_ALIGN16 TmpRes[4];
+    _mm_store_ps(TmpRes, sum);
+    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+}
diff --git a/src/VecSim/spaces/L2/L2_SVE_FP32.h b/src/VecSim/spaces/L2/L2_SVE_FP32.h
index a3e96c7a8..8367baa97 100644
--- a/src/VecSim/spaces/L2/L2_SVE_FP32.h
+++ b/src/VecSim/spaces/L2/L2_SVE_FP32.h
@@ -9,7 +9,8 @@
 #include "VecSim/spaces/space_includes.h"
 #include <arm_sve.h>
 
-static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum) {
+static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset, svfloat32_t &sum,
+                                const size_t chunk) {
     // Load vectors
     svfloat32_t v1 = svld1_f32(svptrue_b32(), pVect1 + offset);
     svfloat32_t v2 = svld1_f32(svptrue_b32(), pVect2 + offset);
@@ -21,7 +22,7 @@ static inline void L2SquareStep(float *&pVect1, float *&pVect2, size_t &offset,
     sum = svmla_f32_z(svptrue_b32(), sum, diff, diff);
 
     // Advance pointers by the vector length
-    offset += svcntw();
+    offset += chunk;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
@@ -31,7 +32,7 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens
     size_t offset = 0;
 
     // Get the number of 32-bit elements per vector at runtime
-    uint64_t sve_word_count = svcntw();
+    uint64_t chunk = svcntw();
 
     // Multiple accumulators to increase instruction-level parallelism
     svfloat32_t sum0 = svdup_f32(0.0f);
@@ -40,27 +41,27 @@ float FP32_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimens
     svfloat32_t sum3 = svdup_f32(0.0f);
 
     // Process vectors in chunks, with unrolling for better pipelining
-    auto chunk_size = 4 * sve_word_count;
+    auto chunk_size = 4 * chunk;
     size_t number_of_chunks = dimension / chunk_size;
 
     for (size_t i = 0; i < number_of_chunks; ++i) {
         // Process 4 chunks with separate accumulators
-        L2SquareStep(pVect1, pVect2, offset, sum0);
-        L2SquareStep(pVect1, pVect2, offset, sum1);
-        L2SquareStep(pVect1, pVect2, offset, sum2);
-        L2SquareStep(pVect1, pVect2, offset, sum3);
+        L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
+        L2SquareStep(pVect1, pVect2, offset, sum3, chunk);
     }
 
     // Process remaining complete SVE vectors that didn't fit into the main loop
     // These are full vector operations (0-3 elements)
     if constexpr (additional_steps > 0) {
         if constexpr (additional_steps >= 1) {
-            L2SquareStep(pVect1, pVect2, offset, sum0);
+            L2SquareStep(pVect1, pVect2, offset, sum0, chunk);
         }
         if constexpr (additional_steps >= 2) {
-            L2SquareStep(pVect1, pVect2, offset, sum1);
+            L2SquareStep(pVect1, pVect2, offset, sum1, chunk);
         }
         if constexpr (additional_steps >= 3) {
-            L2SquareStep(pVect1, pVect2, offset, sum2);
+            L2SquareStep(pVect1, pVect2, offset, sum2, chunk);
         }
     }
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8.h b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
new file mode 100644
index 000000000..756f82522
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+
+static inline void L2SqrStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset,
+                             svfloat32_t &sum, const svfloat32_t &min_val_vec,
+                             const svfloat32_t &delta_vec, const size_t chunk) {
+    svbool_t pg = svptrue_b32();
+
+    // Load float elements from pVect1
+    svfloat32_t v1 = svld1_f32(pg, pVect1 + offset);
+
+    // Load uint8 elements and widen them to uint32
+    svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset);
+
+    // Convert uint32 to float32
+    svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32);
+
+    // Dequantize: (val * delta) + min_val
+    svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec);
+
+    // Compute difference
+    svfloat32_t diff = svsub_f32_x(pg, v1, v2_dequant);
+
+    // Square difference and add to sum
+    sum = svmla_f32_x(pg, sum, diff, diff);
+
+    // Move to the next set of elements
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *pVect1 = static_cast<const float *>(pVect1v);
+    const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
+    size_t offset = 0;
+
+    // Get dequantization parameters from the end of quantized vector
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Create broadcast vectors for SIMD operations
+    svbool_t pg = svptrue_b32();
+    svfloat32_t min_val_vec = svdup_f32(min_val);
+    svfloat32_t delta_vec = svdup_f32(delta);
+
+    // Get the number of 32-bit elements per vector at runtime
+    uint64_t chunk = svcntw();
+
+    // Multiple accumulators to increase instruction-level parallelism
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Handle partial chunk if needed
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            // Create predicate for the remaining elements
+            svbool_t pg_partial =
+                svwhilelt_b32(static_cast<uint64_t>(0), static_cast<uint64_t>(remaining));
+
+            // Load float elements from pVect1 with predicate
+            svfloat32_t v1 = svld1_f32(pg_partial, pVect1);
+
+            // Load uint8 elements from pVect2 with predicate and widen them to uint32
+            svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset);
+
+            // Convert uint32 to float32
+            svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32);
+
+            // Dequantize: (val * delta) + min_val
+            svfloat32_t v2_dequant =
+                svadd_f32_z(pg_partial, svmul_f32_z(pg_partial, v2_f, delta_vec), min_val_vec);
+
+            // Compute difference
+            svfloat32_t diff = svsub_f32_z(pg_partial, v1, v2_dequant);
+
+            // Square difference and add to sum
+            sum0 = svmla_f32_z(pg_partial, sum0, diff, diff);
+
+            // Move pointers past the partial chunk
+            offset += remaining;
+        }
+    }
+    // Handle remaining steps (0-3)
+    if constexpr (additional_steps > 0) {
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 1) {
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+    }
+    if constexpr (additional_steps > 2) {
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+    }
+
+    // Process 4 chunks at a time in the main loop
+    auto chunk_size = 4 * chunk;
+    size_t number_of_chunks = dimension / chunk_size;
+
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        L2SqrStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk);
+        L2SqrStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk);
+    }
+
+    // Combine the accumulators
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
+
+    // Horizontal sum of all elements in the vector
+    float result = svaddv_f32(pg, sum);
+
+    return result;
+}
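A note on the SVE variant above: unlike the fixed-width NEON and SSE4 kernels, it advances by the hardware's runtime vector length (`svcntw()` 32-bit lanes), so the same binary adapts to any SVE register width. A standalone sketch of that predication pattern, assuming an SVE-enabled toolchain; not part of the patch:

#include <arm_sve.h>
#include <cstddef>
#include <cstdint>

// Vector-length-agnostic sum of squares: whilelt masks off the tail lanes.
float sum_squares_sve(const float *x, size_t dim) {
    svfloat32_t acc = svdup_f32(0.0f);
    for (size_t i = 0; i < dim; i += svcntw()) {
        svbool_t pg = svwhilelt_b32((uint64_t)i, (uint64_t)dim); // active lanes only
        svfloat32_t v = svld1_f32(pg, x + i);
        acc = svmla_f32_m(pg, acc, v, v); // acc += v * v on active lanes
    }
    return svaddv_f32(svptrue_b32(), acc); // horizontal reduce
}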
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 7cd277ca7..ed920927d 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -19,7 +19,9 @@
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
 #include "VecSim/spaces/functions/NEON_HP.h"
@@ -33,6 +35,63 @@
 using float16 = vecsim_types::float16;
 
 namespace spaces {
 
+dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (!alignment) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = SQ8_L2Sqr;
+
+    [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt);
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_L2_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_L2_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_SQ8_L2_implementation_NEON(dim);
+    }
+#endif
+#endif
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+    // Optimizations assume at least 16 floats. If we have less, we use the naive implementation.
+
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vnni) {
+        return Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (features.avx2 && features.fma3) {
+        return Choose_SQ8_L2_implementation_AVX2_FMA(dim);
+    }
+#endif
+#ifdef OPT_AVX2
+    if (features.avx2) {
+        return Choose_SQ8_L2_implementation_AVX2(dim);
+    }
+#endif
+#ifdef OPT_SSE4
+    if (features.sse4_1) {
+        return Choose_SQ8_L2_implementation_SSE4(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
 dist_func_t<float> L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
     unsigned char dummy_alignment;
     if (!alignment) {
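The dispatcher above follows the library's existing pattern: implementations are probed from widest to narrowest ISA, and the first available feature wins. Callers are expected to resolve the function pointer once and reuse it; a minimal usage sketch (the compressed-buffer layout is the one documented in IP.cpp; `sq8_l2_example` is illustrative only):

#include "VecSim/spaces/L2_space.h"
#include <cstdint>
#include <vector>

// Resolve the best SQ8 L2 implementation once, then call it like any dist_func_t<float>.
float sq8_l2_example(const std::vector<float> &query,
                     const std::vector<uint8_t> &compressed, // dim codes + 3 trailing floats
                     size_t dim) {
    unsigned char alignment = 0;
    auto dist_f = spaces::L2_SQ8_GetDistFunc(dim, &alignment); // arch detected at runtime
    return dist_f(query.data(), compressed.data(), dim);
}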
diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h
index 9643aaa71..a58fcd7e4 100644
--- a/src/VecSim/spaces/L2_space.h
+++ b/src/VecSim/spaces/L2_space.h
@@ -22,4 +22,6 @@ dist_func_t<float> L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nu
                                        const void *arch_opt = nullptr);
 dist_func_t<float> L2_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                         const void *arch_opt = nullptr);
+dist_func_t<float> L2_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                      const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2.cpp b/src/VecSim/spaces/functions/AVX2.cpp
index e63cf7ee8..3b24060c1 100644
--- a/src/VecSim/spaces/functions/AVX2.cpp
+++ b/src/VecSim/spaces/functions/AVX2.cpp
@@ -10,6 +10,8 @@
 
 #include "VecSim/spaces/IP/IP_AVX2_BF16.h"
 #include "VecSim/spaces/L2/L2_AVX2_BF16.h"
+#include "VecSim/spaces/IP/IP_AVX2_SQ8.h"
+#include "VecSim/spaces/L2/L2_AVX2_SQ8.h"
 
 namespace spaces {
 
@@ -27,6 +29,24 @@ dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2.h b/src/VecSim/spaces/functions/AVX2.h
index d1377765f..ecc28f01f 100644
--- a/src/VecSim/spaces/functions/AVX2.h
+++ b/src/VecSim/spaces/functions/AVX2.h
@@ -12,6 +12,10 @@
 
 namespace spaces {
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2(size_t dim);
+
 dist_func_t<float> Choose_BF16_IP_implementation_AVX2(size_t dim);
 dist_func_t<float> Choose_BF16_L2_implementation_AVX2(size_t dim);
 
diff --git a/src/VecSim/spaces/functions/AVX2_FMA.cpp b/src/VecSim/spaces/functions/AVX2_FMA.cpp
new file mode 100644
index 000000000..4dc627c57
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "AVX2_FMA.h"
+#include "VecSim/spaces/L2/L2_AVX2_FMA_SQ8.h"
+#include "VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+// FMA optimized implementations
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX2_FMA);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX2_FMA.h b/src/VecSim/spaces/functions/AVX2_FMA.h
new file mode 100644
index 000000000..b81dfd5ab
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX2_FMA.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX2_FMA(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX2_FMA(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F.h b/src/VecSim/spaces/functions/AVX512F.h
index fd36f312f..450c3d6bc 100644
--- a/src/VecSim/spaces/functions/AVX512F.h
+++ b/src/VecSim/spaces/functions/AVX512F.h
@@ -20,4 +20,7 @@ dist_func_t<float> Choose_FP16_L2_implementation_AVX512F(size_t dim);
 dist_func_t<float> Choose_FP32_L2_implementation_AVX512F(size_t dim);
 dist_func_t<float> Choose_FP64_L2_implementation_AVX512F(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
index 338726adc..fef34dd22 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -14,6 +14,9 @@
 #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h"
 #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h"
 
+#include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h"
+#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h"
+
 namespace spaces {
 
 #include "implementation_chooser.h"
@@ -54,6 +57,22 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
index 4ac0ac2f3..745a339fb 100644
--- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -20,4 +20,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim)
 dist_func_t<float> Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON.cpp b/src/VecSim/spaces/functions/NEON.cpp
index 6e5662f07..d0b5c9160 100644
--- a/src/VecSim/spaces/functions/NEON.cpp
+++ b/src/VecSim/spaces/functions/NEON.cpp
@@ -15,6 +15,8 @@
 #include "VecSim/spaces/IP/IP_NEON_UINT8.h"
 #include "VecSim/spaces/L2/L2_NEON_FP64.h"
 #include "VecSim/spaces/IP/IP_NEON_FP64.h"
+#include "VecSim/spaces/L2/L2_NEON_SQ8.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8.h"
 
 namespace spaces {
 
@@ -79,6 +81,24 @@ dist_func_t<float> Choose_FP64_L2_implementation_NEON(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_L2_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_NEON);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_IP_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_NEON);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_NEON(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_NEON);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h
index 85fb78510..1449c6ac5 100644
--- a/src/VecSim/spaces/functions/NEON.h
+++ b/src/VecSim/spaces/functions/NEON.h
@@ -26,4 +26,8 @@ dist_func_t<float> Choose_FP32_L2_implementation_NEON(size_t dim);
 dist_func_t<float> Choose_FP64_IP_implementation_NEON(size_t dim);
 dist_func_t<float> Choose_FP64_L2_implementation_NEON(size_t dim);
 
+dist_func_t<float> Choose_SQ8_L2_implementation_NEON(size_t dim);
+dist_func_t<float> Choose_SQ8_IP_implementation_NEON(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_NEON(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SSE.cpp b/src/VecSim/spaces/functions/SSE.cpp
index bec82169b..d25463531 100644
--- a/src/VecSim/spaces/functions/SSE.cpp
+++ b/src/VecSim/spaces/functions/SSE.cpp
@@ -10,9 +10,11 @@
 
 #include "VecSim/spaces/L2/L2_SSE_FP32.h"
 #include "VecSim/spaces/L2/L2_SSE_FP64.h"
+#include "VecSim/spaces/L2/L2_SSE4_SQ8.h"
 
 #include "VecSim/spaces/IP/IP_SSE_FP32.h"
 #include "VecSim/spaces/IP/IP_SSE_FP64.h"
+#include "VecSim/spaces/IP/IP_SSE4_SQ8.h"
 
 namespace spaces {
 
diff --git a/src/VecSim/spaces/functions/SSE4.cpp b/src/VecSim/spaces/functions/SSE4.cpp
new file mode 100644
index 000000000..d8dd51448
--- /dev/null
+++ b/src/VecSim/spaces/functions/SSE4.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "SSE4.h"
+#include "VecSim/spaces/IP/IP_SSE4_SQ8.h"
+#include "VecSim/spaces/L2/L2_SSE4_SQ8.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_SQ8_IP_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_InnerProductSIMD16_SSE4);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_CosineSIMD16_SSE4);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_SSE4(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_SSE4);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/SSE4.h b/src/VecSim/spaces/functions/SSE4.h
new file mode 100644
index 000000000..27bbae0e0
--- /dev/null
+++ b/src/VecSim/spaces/functions/SSE4.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_SQ8_IP_implementation_SSE4(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SSE4(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SSE4(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp
index 8d527a2d3..208763779 100644
--- a/src/VecSim/spaces/functions/SVE.cpp
+++ b/src/VecSim/spaces/functions/SVE.cpp
@@ -22,6 +22,8 @@
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h"
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h"
+#include "VecSim/spaces/IP/IP_SVE_SQ8.h"
+#include "VecSim/spaces/L2/L2_SVE_SQ8.h"
 
 namespace spaces {
 
@@ -96,6 +98,24 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h
index 5f74b8d46..680e906e6 100644
--- a/src/VecSim/spaces/functions/SVE.h
+++ b/src/VecSim/spaces/functions/SVE.h
@@ -29,4 +29,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_UINT8_IP_implementation_SVE(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp
index 690bb0cd1..9df4b3b08 100644
--- a/src/VecSim/spaces/functions/SVE2.cpp
+++ b/src/VecSim/spaces/functions/SVE2.cpp
@@ -20,6 +20,8 @@
 #include "VecSim/spaces/IP/IP_SVE_INT8.h"  // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE_SQ8.h"   // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8.h"   // SVE2 implementation is identical to SVE
 
 namespace spaces {
 
@@ -94,6 +96,24 @@ dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h
index b67d56dd9..059f38b1b 100644
--- a/src/VecSim/spaces/functions/SVE2.h
+++ b/src/VecSim/spaces/functions/SVE2.h
@@ -29,4 +29,8 @@ dist_func_t<float> Choose_UINT8_L2_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_UINT8_IP_implementation_SVE2(size_t dim);
 
+dist_func_t<float> Choose_SQ8_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_Cosine_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_L2_implementation_SVE2(size_t dim);
+
 } // namespace spaces
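All of the Choose_SQ8_* functions above delegate to the existing CHOOSE_IMPLEMENTATION / CHOOSE_SVE_IMPLEMENTATION machinery, whose definitions are not part of this diff. Under the assumption that it selects the template instantiation whose residual matches dim % 16 (or dim % svcntw() for SVE), the fixed-width case behaves roughly like this sketch (an illustration, not the real macro):

#include <array>
#include <cstddef>
#include <utility>

// Rough sketch (assumption): build a table with one instantiation per possible
// residual, then index it with dim % 16 at dispatch time.
template <std::size_t... R>
constexpr auto make_sq8_l2_sse4_table(std::index_sequence<R...>) {
    using fn = float (*)(const void *, const void *, std::size_t);
    return std::array<fn, sizeof...(R)>{&SQ8_L2SqrSIMD16_SSE4<R>...};
}
// Dispatch: auto f = make_sq8_l2_sse4_table(std::make_index_sequence<16>{})[dim % 16];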
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
index a5c9e7257..8a207228a 100644
--- a/tests/benchmark/CMakeLists.txt
+++ b/tests/benchmark/CMakeLists.txt
@@ -38,7 +38,7 @@ endif()
 # Spaces benchmarks
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
-set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8)
+set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8)
 foreach(data_type IN LISTS DATA_TYPE)
 	add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp)
 	target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark)
diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh
index 78584130e..76389ad89 100755
--- a/tests/benchmark/benchmarks.sh
+++ b/tests/benchmark/benchmarks.sh
@@ -15,6 +15,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then
     echo spaces_fp16
     echo spaces_int8
     echo spaces_uint8
+    echo spaces_sq8
 
 elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo basics_single_fp32
@@ -25,6 +26,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo spaces_fp16
     echo spaces_int8
     echo spaces_uint8
+    echo spaces_sq8
 
 # Basic benchmarks
@@ -91,6 +93,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then
     echo spaces_bf16
     echo spaces_int8
    echo spaces_uint8
+    echo spaces_sq8
 
 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then
     echo spaces_fp32
@@ -104,4 +107,6 @@ elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then
     echo spaces_int8
 elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then
     echo spaces_uint8
+elif [ "$BM_TYPE" = "bm-spaces-sq8" ] ; then
+    echo spaces_sq8
 fi
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index ded60d4dd..d99bcc4ca 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -24,7 +24,9 @@
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/F16C.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
 #include "VecSim/spaces/functions/NEON.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
new file mode 100644
index 000000000..1349a3512
--- /dev/null
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#include "bm_spaces.h"
+#include "utils/tests_utils.h"
+
+class BM_VecSimSpaces_SQ8 : public benchmark::Fixture {
+protected:
+    std::mt19937 rng;
+    size_t dim;
+    float *v1;
+    uint8_t *v2;
+
+public:
+    BM_VecSimSpaces_SQ8() { rng.seed(47); }
+    ~BM_VecSimSpaces_SQ8() = default;
+
+    void SetUp(const ::benchmark::State &state) {
+        dim = state.range(0);
+        v1 = new float[dim];
+        test_utils::populate_float_vec(v1, dim, 123);
+        // Allocate vector with extra space for min, delta and cosine calculations
+        v2 = new uint8_t[dim + sizeof(float) * 3];
+        test_utils::populate_float_vec_to_sq8(v2, dim, 1234);
+    }
+    void TearDown(const ::benchmark::State &state) {
+        delete[] v1;
+        delete[] v2;
+    }
+};
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+
+// NEON implementation for ARMv8-a
+#ifdef OPT_NEON
+bool neon_supported = opt.asimd; // ARMv8-a always supports NEON
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, NEON, 16, neon_supported);
+#endif
+// SVE implementation
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE, 16, sve_supported);
+#endif
+// SVE2 implementation
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SVE2, 16, sve2_supported);
+#endif
+#endif // AARCH64
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+cpu_features::X86Features opt = cpu_features::GetX86Info().features;
+
+// AVX512_F_BW_VL_VNNI functions
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16,
+                                avx512_f_bw_vl_vnni_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX512F_BW_VL_VNNI, 16,
+                                 avx512_f_bw_vl_vnni_supported);
+#endif // AVX512_F_BW_VL_VNNI
+
+#ifdef OPT_AVX2_FMA
+bool avx2_fma3_supported = opt.avx2 && opt.fma3;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2_FMA, 16, avx2_fma3_supported);
+#endif // AVX2_FMA
+
+#ifdef OPT_AVX2
+// AVX2 functions
+bool avx2_supported = opt.avx2;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, AVX2, 16, avx2_supported);
+#endif // AVX2
+
+// SSE4 functions
+#ifdef OPT_SSE4
+bool sse4_supported = opt.sse4_1;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8, SQ8, SSE4, 16, sse4_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8, SQ8, SSE4, 16, sse4_supported);
+#endif // SSE4
+#endif // x86_64
+
+// Naive algorithms
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, InnerProduct, 16);
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, Cosine, 16);
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8, SQ8, L2Sqr, 16);
+
+BENCHMARK_MAIN();
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 1c5633f87..dabe9c794 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -28,7 +28,9 @@
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
+#include "VecSim/spaces/functions/AVX2_FMA.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/SSE4.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
 #include "VecSim/spaces/functions/NEON_DOTPROD.h"
@@ -305,6 +307,176 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) {
     ASSERT_NEAR(dist, 0.0, 0.000001);
 }
 
+void common_ip_sq8(bool should_normalize, float expected_dist) {
+
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Create SQ8 compressed version of v2
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    if (should_normalize) {
+        spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+        spaces::GetNormalizeFunc<float>()(v2_orig, dim);
+    }
+
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    std::vector<uint8_t> v2_compressed(compressed_size);
+
+    // Quantize v2
+    uint8_t *quant_values = v2_compressed.data();
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+
+    float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+
+    // v1 and v2 start out identical, so the inner product distance should be close to the
+    // expected value
+    ASSERT_NEAR(dist, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance";
+}
+
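+// Note on the expected value used below: with v1 = v2 = (1.5, 2.5, 3.5, 4.5, 5.5),
+// sum(v1[i] * v2[i]) = 2.25 + 6.25 + 12.25 + 20.25 + 30.25 = 71.25, so the exact
+// result would be 1 - 71.25 = -70.25; the checked-in -70.2147 differs only by the
+// quantization error, which the 0.01 tolerance absorbs.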
+/* ======================== Tests SQ8 ========================= */
+TEST_F(SpacesTest, SQ8_ip_no_optimization_func_test) {
+    float expected_dist = -70.2147f; // Expected distance for these identical, unnormalized vectors
+    common_ip_sq8(false, expected_dist);
+}
+
+TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, 0.0f); }
+
+TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) {
+    // create a vector with extra space for the norm
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Compress v2
+    std::vector<uint8_t> v2_compressed(compressed_size);
+    uint8_t *quant_values = v2_compressed.data();
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+    // Calculate inverse norm from decompressed values
+    float inv_norm = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        float decompressed_value = min_val + quant_values[i] * delta;
+        inv_norm += decompressed_value * decompressed_value;
+    }
+    inv_norm = 1.0f / std::sqrt(inv_norm);
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+    ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance";
+}
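+
+// The tests above and below compute inv_norm from the *dequantized* values rather
+// than the originals, so the normalization matches exactly what the distance
+// kernels reconstruct. A standalone restatement of that step (sketch):
+//
+//   float norm = 0.0f;
+//   for (size_t i = 0; i < dim; i++) {
+//       float d = quant_values[i] * delta + min_val;
+//       norm += d * d;
+//   }
+//   float inv_norm = 1.0f / std::sqrt(norm);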
+TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) {
+    // create a vector with extra space for the norm
+    size_t dim = 5;
+
+    // Create original vectors
+    float v1_orig[dim], v2_orig[dim];
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i + 1.5);
+    }
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    spaces::GetNormalizeFunc<float>()(v1_orig, dim);
+    spaces::GetNormalizeFunc<float>()(v2_orig, dim);
+    // Find min and max for quantization
+    float min_val = v2_orig[0];
+    float max_val = v2_orig[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v2_orig[i]);
+        max_val = std::max(max_val, v2_orig[i]);
+    }
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Compress v2
+    std::vector<uint8_t> v2_compressed(compressed_size);
+    uint8_t *quant_values = v2_compressed.data();
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v2_orig[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+    }
+    // Calculate inverse norm from decompressed values
+    float inv_norm = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        float decompressed_value = min_val + quant_values[i] * delta;
+        inv_norm += decompressed_value * decompressed_value;
+    }
+    inv_norm = 1.0f / std::sqrt(inv_norm);
+    // Store parameters
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    float dist = SQ8_L2Sqr((const void *)v1_orig, (const void *)v2_compressed.data(), dim);
+    ASSERT_NEAR(dist, 0.0f, 0.00001f) << "SQ8_L2Sqr failed to match expected distance";
+}
+
 /* ======================== Test Getters ======================== */
 
 TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) {
@@ -1889,3 +2061,409 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) {
 
 INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest,
                          testing::Range(32UL, 64 * 2UL + 1));
+
+// Helper function to create SQ8 compressed vector
+std::vector<uint8_t> CreateSQ8CompressedVector(const float *original, size_t dim) {
+    // Create a copy of the original vector that we can modify
+    std::vector<float> vec_copy(original, original + dim);
+
+    // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float)
+    size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float);
+    std::vector<uint8_t> compressed(compressed_size);
+
+    // Find min and max for quantization
+    float min_val = vec_copy[0];
+    float max_val = vec_copy[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, vec_copy[i]);
+        max_val = std::max(max_val, vec_copy[i]);
+    }
+
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+
+    // Quantize vector
+    uint8_t *quant_values = compressed.data();
+    float norm = 0.0f;
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (vec_copy[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        quant_values[i] = static_cast<uint8_t>(std::round(normalized));
+        norm += (quant_values[i] * delta + min_val) * (quant_values[i] * delta + min_val);
+    }
+
+    float inv_norm = 1.0f / std::sqrt(norm);
+    // Store parameters
+    float *params = reinterpret_cast<float *>(quant_values + dim);
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+
+    return compressed;
+}
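+
+// Usage sketch for the helper above (hypothetical check, not part of the suite):
+// each dequantized value can differ from the original by at most delta / 2,
+// where delta = (max - min) / 255, e.g.:
+//
+//   std::vector<uint8_t> blob = CreateSQ8CompressedVector(v, dim);
+//   const float *params = reinterpret_cast<const float *>(blob.data() + dim);
+//   for (size_t i = 0; i < dim; i++)
+//       assert(std::fabs((blob[i] * params[1] + params[0]) - v[i]) <=
+//              params[1] / 2 + 1e-6f);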
+
+class SQ8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
+
+TEST_P(SQ8SpacesOptimizationTest, SQ8L2SqrTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+
+    // Create SQ8 compressed version of v2
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_L2Sqr(v1_orig.data(), v2_compressed.data(), dim);
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2_FMA with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX2_FMA with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.fma3 = 0;
+    }
+#endif
+#ifdef OPT_AVX2
+    if (optimization.avx2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX2 with dim " << dim;
+        // Unset avx2 flag as well, so we'll choose the next optimization (SSE4).
+        optimization.avx2 = 0;
+    }
+#endif
+#ifdef OPT_SSE4
+    if (optimization.sse4_1) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        // ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE4 with dim " << dim;
+        // Unset sse4_1 flag as well, so we'll choose the next optimization (default).
+        optimization.sse4_1 = 0;
+    }
+#endif
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_L2_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.asimd = 0;
+    }
+#endif
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = L2_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_L2Sqr) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+    spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
+
+    // Create SQ8 compressed version of v2
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+    // Quantization parameters (min, delta, inv_norm) are stored after the quantized values
+    float *params = reinterpret_cast<float *>(v2_compressed.data() + dim);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_InnerProduct(v1_orig.data(), v2_compressed.data(), dim);
+
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2_FMA with dim " << dim;
+        optimization.fma3 = 0;
+    }
+#endif
+#ifdef OPT_AVX2
+    if (optimization.avx2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        optimization.avx2 = 0;
+    }
+#endif
+#ifdef OPT_SSE4
+    if (optimization.sse4_1) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        optimization.sse4_1 = 0;
+    }
+#endif
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_IP_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        optimization.asimd = 0;
+    }
+#endif
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = IP_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_InnerProduct)
+        << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+// Instantiate the test suite with dimensions to test
+INSTANTIATE_TEST_SUITE_P(SQ8InnerProductTest, SQ8SpacesOptimizationTest,
+                         testing::Range(16UL, 16 * 2UL + 1));
+
+TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+
+    // Create original vectors
+    std::vector<float> v1_orig(dim);
+    std::vector<float> v2_orig(dim);
+    for (size_t i = 0; i < dim; i++) {
+        v1_orig[i] = float(i + 1.5);
+        v2_orig[i] = float(i * 0.75 + 1.0);
+    }
+
+    // Normalize both vectors
+    spaces::GetNormalizeFunc<float>()(v1_orig.data(), dim);
+    spaces::GetNormalizeFunc<float>()(v2_orig.data(), dim);
+
+    // Create SQ8 compressed version of v2 (with normalization)
+    std::vector<uint8_t> v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(uint8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim);
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "NEON with dim " << dim;
+        optimization.asimd = 0;
+    }
+#endif
+
+// Test different optimizations based on CPU features
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX512 with dim " << dim;
+        optimization.avx512f = 0;
+    }
+#endif
+#ifdef OPT_AVX2_FMA
+    if (optimization.avx2 && optimization.fma3) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2_FMA with dim " << dim;
+        optimization.fma3 = 0;
+    }
+#endif
+#ifdef OPT_AVX2
+    if (optimization.avx2) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "AVX2 with dim " << dim;
+        optimization.avx2 = 0;
+    }
+#endif
+
+#ifdef OPT_SSE4
+    if (optimization.sse4_1) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE4(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+            << "SSE4 with dim " << dim;
+        optimization.sse4_1 = 0;
+    }
+#endif
+
+    // Test default implementation
+    unsigned char alignment = 0;
+    arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01)
+        << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index 93a3eaf0b..cbab07653 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -53,6 +53,45 @@ static void populate_float_vec(float *v, size_t dim, int seed = 1234) {
     }
 }
 
+static void quantize_float_vec_to_uint8(float *v, size_t dim, uint8_t *qv, int seed = 1234) {
+
+    float min_val = v[0];
+    float max_val = v[0];
+    for (size_t i = 1; i < dim; i++) {
+        min_val = std::min(min_val, v[i]);
+        max_val = std::max(max_val, v[i]);
+    }
+    // Calculate delta
+    float delta = (max_val - min_val) / 255.0f;
+    if (delta == 0)
+        delta = 1.0f; // Avoid division by zero
+    float norm = 0.0f;
+    // Quantize each value
+    for (size_t i = 0; i < dim; i++) {
+        float normalized = (v[i] - min_val) / delta;
+        normalized = std::max(0.0f, std::min(255.0f, normalized));
+        qv[i] = static_cast<uint8_t>(std::round(normalized));
+        norm += (qv[i] * delta + min_val) * (qv[i] * delta + min_val);
+    }
+    float inv_norm = 1.0f / std::sqrt(norm);
+    // Store parameters
+    float *params = reinterpret_cast<float *>(qv + dim);
+    params[0] = min_val;
+    params[1] = delta;
+    params[2] = inv_norm;
+}
+
+static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) {
+
+    std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed
+    std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
+    std::vector<float> vec(dim);
+    for (size_t i = 0; i < dim; i++) {
+        vec[i] = dis(gen);
+    }
+    quantize_float_vec_to_uint8(vec.data(), dim, v, seed);
+}
+
 template <typename datatype>
 float integral_compute_norm(const datatype *vec, size_t dim) {
     return spaces::IntegralType_ComputeNorm(vec, dim);