diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake
index 1fedda7fe..1ff8f48f2 100644
--- a/cmake/x86_64InstructionFlags.cmake
+++ b/cmake/x86_64InstructionFlags.cmake
@@ -13,6 +13,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 	CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2)
 	CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16)
 	CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F)
+	CHECK_CXX_COMPILER_FLAG(-mavx512vnni CXX_AVX512VNNI)
 	CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2)
 	CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX)
 	CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C)
@@ -48,6 +49,10 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		add_compile_definitions(OPT_AVX512_BW_VBMI2)
 	endif()
 
+	if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI)
+		add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI)
+	endif()
+
 	if(CXX_F16C AND CXX_FMA AND CXX_AVX)
 		add_compile_definitions(OPT_F16C)
 	endif()
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index 9cc0baaaf..1fc9473b2 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -44,6 +44,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 		list(APPEND OPTIMIZATIONS functions/AVX512F.cpp)
 	endif()
 
+	if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI)
+		message("Building with AVX512F, AVX512BW, AVX512VL and AVX512VNNI")
+		set_source_files_properties(functions/AVX512F_BW_VL_VNNI.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512vl -mavx512vnni")
+		list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp)
+	endif()
+
 	if(CXX_AVX2)
 		message("Building with AVX2")
 		set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index 98ad07676..0884df3bb 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -66,3 +66,27 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension
     }
     return 1.0f - res;
 }
+
+static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    int8_t *pVect1 = (int8_t *)pVect1v;
+    int8_t *pVect2 = (int8_t *)pVect2v;
+
+    int res = 0;
+    for (size_t i = 0; i < dimension; i++) {
+        res += pVect1[i] * pVect2[i];
+    }
+    return res;
+}
+
+float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension);
+}
+
+float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // We expect the vectors' norm to be stored at the end of the vector.
+    float norm_v1 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
+    return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2);
+}
diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h
index 50fecef33..d712499ed 100644
--- a/src/VecSim/spaces/IP/IP.h
+++ b/src/VecSim/spaces/IP/IP.h
@@ -16,3 +16,6 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension
 
 float BF16_InnerProduct_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension);
 float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension);
+
+float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
+float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension);
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
new file mode 100644
index 000000000..7716d8ad7
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
@@ -0,0 +1,77 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+
+static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) {
+    __m256i temp_a = _mm256_loadu_epi8(pVect1);
+    __m512i va = _mm512_cvtepi8_epi16(temp_a);
+    pVect1 += 32;
+
+    __m256i temp_b = _mm256_loadu_epi8(pVect2);
+    __m512i vb = _mm512_cvtepi8_epi16(temp_b);
+    pVect2 += 32;
+
+    // _mm512_dpwssd_epi32(src, a, b)
+    // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding
+    // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results
+    // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+    sum = _mm512_dpwssd_epi32(sum, va, vb);
+}
+
+template <unsigned char residual> // 0..64
+static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    int8_t *pVect1 = (int8_t *)pVect1v;
+    int8_t *pVect2 = (int8_t *)pVect2v;
+
+    const int8_t *pEnd1 = pVect1 + dimension;
+
+    __m512i sum = _mm512_setzero_epi32();
+
+    // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block,
+    // so mask loading is guaranteed to be safe
+    if constexpr (residual % 32) {
+        __mmask32 mask = (1LU << (residual % 32)) - 1;
+        __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1);
+        __m512i va = _mm512_cvtepi8_epi16(temp_a);
+        pVect1 += residual % 32;
+
+        __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2);
+        __m512i vb = _mm512_cvtepi8_epi16(temp_b);
+        pVect2 += residual % 32;
+
+        sum = _mm512_dpwssd_epi32(sum, va, vb);
+    }
+
+    if constexpr (residual >= 32) {
+        InnerProductStep(pVect1, pVect2, sum);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 64-int_8.
+    while (pVect1 < pEnd1) {
+        InnerProductStep(pVect1, pVect2, sum);
+        InnerProductStep(pVect1, pVect2, sum);
+    }
+
+    return _mm512_reduce_add_epi32(sum);
+}
+
+template <unsigned char residual> // 0..64
+float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                                 size_t dimension) {
+
+    return 1 - INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+}
+template <unsigned char residual> // 0..64
+float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                           size_t dimension) {
+    float ip = INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+    float norm_v1 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
+    return 1.0f - ip / (norm_v1 * norm_v2);
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index e6da26947..e7129b2e8 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -16,6 +16,7 @@
 #include "VecSim/spaces/functions/AVX512BW_VBMI2.h"
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
+#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 
@@ -196,4 +197,59 @@ dist_func_t<float> IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con
     return ret_dist_func;
 }
 
+dist_func_t<float> IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = INT8_InnerProduct;
+    // Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
+    if (dim < 32) {
+        return ret_dist_func;
+    }
+#ifdef CPU_FEATURES_ARCH_X86_64
+    auto features = (arch_opt == nullptr)
+                        ? cpu_features::GetX86Info().features
+                        : *static_cast<const cpu_features::X86Features *>(arch_opt);
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
+        if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 32 * sizeof(int8_t); // align to 256 bits.
+        return Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
+dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
+                                           const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = INT8_Cosine;
+    // Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
+    if (dim < 32) {
+        return ret_dist_func;
+    }
+#ifdef CPU_FEATURES_ARCH_X86_64
+    auto features = (arch_opt == nullptr)
+                        ? cpu_features::GetX86Info().features
+                        : *static_cast<const cpu_features::X86Features *>(arch_opt);
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
+        // For int8 vectors with cosine distance, the extra float for the norm shifts alignment to
+        // `(dim + sizeof(float)) % 32`.
+        // Vectors satisfying this have a residual, causing offset loads during calculation.
+        // To avoid complexity, we skip alignment here, assuming the performance impact is
+        // negligible.
+        return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
 } // namespace spaces
diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h
index a3ab0f4f6..0d8c3a836 100644
--- a/src/VecSim/spaces/IP_space.h
+++ b/src/VecSim/spaces/IP_space.h
@@ -16,4 +16,8 @@ dist_func_t<float> IP_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu
                                        const void *arch_opt = nullptr);
 dist_func_t<float> IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                        const void *arch_opt = nullptr);
+dist_func_t<float> IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                       const void *arch_opt = nullptr);
+dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                           const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp
index 5fba0555e..ef310418b 100644
--- a/src/VecSim/spaces/L2/L2.cpp
+++ b/src/VecSim/spaces/L2/L2.cpp
@@ -70,3 +70,17 @@ float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension) {
     }
     return res;
 }
+
+float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    int8_t *pVect1 = (int8_t *)pVect1v;
+    int8_t *pVect2 = (int8_t *)pVect2v;
+
+    int res = 0;
+    for (size_t i = 0; i < dimension; i++) {
+        int16_t a = pVect1[i];
+        int16_t b = pVect2[i];
+        int16_t diff = a - b;
+        res += diff * diff;
+    }
+    return float(res);
+}
diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h
index c367f2ee1..65649d4eb 100644
--- a/src/VecSim/spaces/L2/L2.h
+++ b/src/VecSim/spaces/L2/L2.h
@@ -16,3 +16,5 @@ float BF16_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t d
 float BF16_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension);
 
 float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension);
+
+float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension);
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
new file mode 100644
index 000000000..2c8b846af
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
@@ -0,0 +1,63 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+
+static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) {
+    __m256i temp_a = _mm256_loadu_epi8(pVect1);
+    __m512i va = _mm512_cvtepi8_epi16(temp_a);
+    pVect1 += 32;
+
+    __m256i temp_b = _mm256_loadu_epi8(pVect2);
+    __m512i vb = _mm512_cvtepi8_epi16(temp_b);
+    pVect2 += 32;
+
+    __m512i diff = _mm512_sub_epi16(va, vb);
+    // _mm512_dpwssd_epi32(src, a, b)
+    // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding
+    // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results
+    // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+    sum = _mm512_dpwssd_epi32(sum, diff, diff);
+}
+
+template <unsigned char residual> // 0..64
+float INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                          size_t dimension) {
+    int8_t *pVect1 = (int8_t *)pVect1v;
+    int8_t *pVect2 = (int8_t *)pVect2v;
+
+    const int8_t *pEnd1 = pVect1 + dimension;
+
+    __m512i sum = _mm512_setzero_epi32();
+
+    // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block,
+    // so mask loading is guaranteed to be safe
+    if constexpr (residual % 32) {
+        constexpr __mmask32 mask = (1LU << (residual % 32)) - 1;
+        __m256i temp_a = _mm256_loadu_epi8(pVect1);
+        __m512i va = _mm512_cvtepi8_epi16(temp_a);
+        pVect1 += residual % 32;
+
+        __m256i temp_b = _mm256_loadu_epi8(pVect2);
+        __m512i vb = _mm512_cvtepi8_epi16(temp_b);
+        pVect2 += residual % 32;
+
+        __m512i diff = _mm512_maskz_sub_epi16(mask, va, vb);
+        sum = _mm512_dpwssd_epi32(sum, diff, diff);
+    }
+
+    if constexpr (residual >= 32) {
+        L2SqrStep(pVect1, pVect2, sum);
+    }
+
+    // We dealt with the residual part. We are left with some multiple of 64-int_8.
+    while (pVect1 < pEnd1) {
+        L2SqrStep(pVect1, pVect2, sum);
+        L2SqrStep(pVect1, pVect2, sum);
+    }
+
+    return _mm512_reduce_add_epi32(sum);
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 1c2b2b59f..c0bec428f 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -15,6 +15,7 @@
 #include "VecSim/spaces/functions/SSE.h"
 #include "VecSim/spaces/functions/AVX512BW_VBMI2.h"
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
+#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 
@@ -189,4 +190,30 @@ dist_func_t<float> L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con
     return ret_dist_func;
 }
 
+dist_func_t<float> L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) {
+    unsigned char dummy_alignment;
+    if (alignment == nullptr) {
+        alignment = &dummy_alignment;
+    }
+
+    dist_func_t<float> ret_dist_func = INT8_L2Sqr;
+    // Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
+    if (dim < 32) {
+        return ret_dist_func;
+    }
+#ifdef CPU_FEATURES_ARCH_X86_64
+    auto features = (arch_opt == nullptr)
+                        ? cpu_features::GetX86Info().features
+                        : *static_cast<const cpu_features::X86Features *>(arch_opt);
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
+        if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
+            *alignment = 32 * sizeof(int8_t); // align to 256 bits.
+        return Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#endif // __x86_64__
+    return ret_dist_func;
+}
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h
index 4a2ea801a..48e50a8c2 100644
--- a/src/VecSim/spaces/L2_space.h
+++ b/src/VecSim/spaces/L2_space.h
@@ -16,4 +16,6 @@ dist_func_t<float> L2_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu
                                        const void *arch_opt = nullptr);
 dist_func_t<float> L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                        const void *arch_opt = nullptr);
+dist_func_t<float> L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                       const void *arch_opt = nullptr);
 } // namespace spaces
diff --git a/src/VecSim/spaces/computer/calculator.h b/src/VecSim/spaces/computer/calculator.h
index 36e76deed..64e0d8dae 100644
--- a/src/VecSim/spaces/computer/calculator.h
+++ b/src/VecSim/spaces/computer/calculator.h
@@ -26,10 +26,10 @@ class IndexCalculatorInterface : public VecsimBaseObject {
 /**
  * This object purpose is to calculate the distance between two vectors.
  * It extends the IndexCalculatorInterface class' type to hold the distance function.
- * Every specific implmentation of the distance claculater should hold by refrence or by value the
+ * Every specific implementation of the distance calculator should hold by reference or by value the
  * parameters required for the calculation. The distance calculation API of all DistanceCalculator
  * classes is: calc_dist(v1,v2,dim). Internally it calls the distance function according the
- * template signature, allowing fexability in the distance function arguments.
+ * template signature, allowing flexibility in the distance function arguments.
  */
 template <typename DistType, typename DistFuncType>
 class DistanceCalculatorInterface : public IndexCalculatorInterface<DistType> {
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
new file mode 100644
index 000000000..661c2c945
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp
@@ -0,0 +1,36 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "AVX512BW_VBMI2.h"
+
+#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h"
+#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD64_AVX512F_BW_VL_VNNI);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
new file mode 100644
index 000000000..532a33c76
--- /dev/null
+++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+dist_func_t<float> Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/implementation_chooser.h b/src/VecSim/spaces/functions/implementation_chooser.h
index 2903b8cc4..b32ad56c6 100644
--- a/src/VecSim/spaces/functions/implementation_chooser.h
+++ b/src/VecSim/spaces/functions/implementation_chooser.h
@@ -25,23 +25,28 @@
 // of 4N, 4N+1, 4N+2, 4N+3.
 #define C4(X, func, N) X(4 * N, func) X(4 * N + 1, func) X(4 * N + 2, func) X(4 * N + 3, func)
 
-// Macros for 8, 16 and 32 cases. Used to collapse the switch statement. Expands into 0-31, 0-15 or
-// 0-7 cases.
+// Macros for 8, 16, 32 and 64 cases. Used to collapse the switch statement. Expands into 0-63,
+// 0-31, 0-15 or 0-7 cases.
 #define CASES32(X, func)                                                                           \
     C4(X, func, 0)                                                                                 \
     C4(X, func, 1)                                                                                 \
     C4(X, func, 2) C4(X, func, 3) C4(X, func, 4) C4(X, func, 5) C4(X, func, 6) C4(X, func, 7)
 #define CASES16(X, func) C4(X, func, 0) C4(X, func, 1) C4(X, func, 2) C4(X, func, 3)
 #define CASES8(X, func)  C4(X, func, 0) C4(X, func, 1)
+#define CASES64(X, func)                                                                           \
+    CASES32(X, func)                                                                               \
+    C4(X, func, 8)                                                                                 \
+    C4(X, func, 9)                                                                                 \
+    C4(X, func, 10) C4(X, func, 11) C4(X, func, 12) C4(X, func, 13) C4(X, func, 14) C4(X, func, 15)
 
 // Main macro. Expands into a switch statement that chooses the implementation based on the
 // dimension's remainder.
 // @params:
 // out:     The output variable that will be set to the chosen implementation.
 // dim:     The dimension.
-// chunk:   The chunk size. Can be 32, 16 or 8. 32 for 16-bit elements, 16 for 32-bit elements, 8
-// for 64-bit elements. func:    The templated function that we want to choose the implementation
-// for.
+// chunk:   The chunk size. Can be 64, 32, 16 or 8. 64 for 8-bit elements, 32 for 16-bit elements,
+// 16 for 32-bit elements, 8 for 64-bit elements. func:    The templated function that we want to
+// choose the implementation for.
 #define CHOOSE_IMPLEMENTATION(out, dim, chunk, func)                                               \
     do {                                                                                           \
         decltype(out) __ret_dist_func;                                                             \
diff --git a/src/VecSim/spaces/spaces.cpp b/src/VecSim/spaces/spaces.cpp
index 84f71b463..4385b5e94 100644
--- a/src/VecSim/spaces/spaces.cpp
+++ b/src/VecSim/spaces/spaces.cpp
@@ -70,6 +70,20 @@ dist_func_t<double> GetDistFunc<double, double>(VecSimMetric metric, size_t dim,
     throw std::invalid_argument("Invalid metric");
 }
 
+template <>
+dist_func_t<float> GetDistFunc<int8_t, float>(VecSimMetric metric, size_t dim,
+                                              unsigned char *alignment) {
+    switch (metric) {
+    case VecSimMetric_Cosine:
+        return Cosine_INT8_GetDistFunc(dim, alignment);
+    case VecSimMetric_IP:
+        return IP_INT8_GetDistFunc(dim, alignment);
+    case VecSimMetric_L2:
+        return L2_INT8_GetDistFunc(dim, alignment);
+    }
+    throw std::invalid_argument("Invalid metric");
+}
+
 template <>
 normalizeVector_f<float> GetNormalizeFunc<float>(void) {
     return normalizeVector_imp<float>;
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
index 4d25a5499..2fa066e82 100644
--- a/tests/benchmark/CMakeLists.txt
+++ b/tests/benchmark/CMakeLists.txt
@@ -31,7 +31,7 @@ endforeach()
 
 include(${root}/cmake/x86_64InstructionFlags.cmake)
 
-set(DATA_TYPE fp32 fp64 bf16 fp16)
+set(DATA_TYPE fp32 fp64 bf16 fp16 int8)
 foreach(data_type IN LISTS DATA_TYPE)
 	add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp)
 	target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark)
diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh
index 11872e869..867077ede 100755
--- a/tests/benchmark/benchmarks.sh
+++ b/tests/benchmark/benchmarks.sh
@@ -13,6 +13,7 @@ if [ -z "$BM_TYPE"  ] || [ "$BM_TYPE" = "benchmarks-all" ]; then
     echo spaces_fp64
     echo spaces_bf16
     echo spaces_fp16
+    echo spaces_int8
 elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo basics_single_fp32
     echo basics_multi_fp32
@@ -20,6 +21,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo spaces_fp64
     echo spaces_bf16
     echo spaces_fp16
+    echo spaces_int8
 # Basic benchmarks
 elif [ "$BM_TYPE" = "bm-basics-fp32-single" ] ; then
     echo basics_single_fp32
@@ -66,4 +68,15 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then
     echo spaces_fp16
     echo spaces_fp64
     echo spaces_bf16
+    echo spaces_int8
+elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then
+    echo spaces_fp32
+elif [ "$BM_TYPE" = "bm-spaces-fp64" ] ; then
+    echo spaces_fp64
+elif [ "$BM_TYPE" = "bm-spaces-bf16" ] ; then
+    echo spaces_bf16
+elif [ "$BM_TYPE" = "bm-spaces-fp16" ] ; then
+    echo spaces_fp16
+elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then
+    echo spaces_int8
 fi
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index 3b55a9032..b7431c43c 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -19,6 +19,7 @@
 #include "VecSim/spaces/functions/AVX.h"
 #include "VecSim/spaces/functions/AVX512BW_VBMI2.h"
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
+#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/SSE3.h"
@@ -123,6 +124,12 @@ static constexpr size_t start = min_no_res_th_dim;
     INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported);          \
     INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported);
 
-#define INITIALIZE_BENCHMARKS_SET(bm_class, type_prefix, arch, dim_opt, arch_supported)            \
+#define INITIALIZE_BENCHMARKS_SET_Cosine(bm_class, type_prefix, arch, dim_opt, arch_supported)     \
+    INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, Cosine, arch_supported);                      \
+    INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, Cosine, arch_supported);                       \
+    INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported);      \
+    INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported);
+
+#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported)      \
     INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported)             \
     INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported)
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp
index 8022c712a..27fe82a3d 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp
@@ -26,20 +26,20 @@ INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_BF16, BF16, AVX512BF16_VL, 32,
 // AVX512 functions
 #ifdef OPT_AVX512_BW_VBMI2
 bool avx512_bw_vbmi2_supported = opt.avx512bw && opt.avx512vbmi2;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32,
-                          avx512_bw_vbmi2_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32,
+                                avx512_bw_vbmi2_supported);
 #endif // AVX512F
 
 // AVX functions
 #ifdef OPT_AVX2
 bool avx2_supported = opt.avx2;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported);
 #endif // AVX
 
 // SSE functions
 #ifdef OPT_SSE3
 bool sse3_supported = opt.sse3;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported);
 #endif // SSE
 
 #endif // x86_64
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
index c9bc42b0b..9457bc77d 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
@@ -22,8 +22,8 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 class BM_VecSimSpaces_FP16_adv : public BM_VecSimSpaces<_Float16> {};
 
 bool avx512fp16_vl_supported = opt.avx512_fp16 && opt.avx512vl;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32,
-                          avx512fp16_vl_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32,
+                                avx512fp16_vl_supported);
 
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, InnerProduct, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32);
@@ -32,12 +32,12 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32);
 // OPT_AVX512F functions
 #ifdef OPT_AVX512F
 bool avx512f_supported = opt.avx512f;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported);
 #endif // OPT_AVX512F
 // AVX functions
 #ifdef OPT_F16C
 bool avx512_bw_f16c_supported = opt.f16c && opt.fma3 && opt.avx;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported);
 #endif // OPT_F16C
 
 #endif // x86_64
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp
index 289e42405..106b2abc8 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp
@@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 // AVX512 functions
 #ifdef OPT_AVX512F
 bool avx512f_supported = opt.avx512f;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported);
 #endif // AVX512F
 
 // AVX functions
 #ifdef OPT_AVX
 bool avx_supported = opt.avx;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported);
 #endif // AVX
 
 // SSE functions
 #ifdef OPT_SSE
 bool sse_supported = opt.sse;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported);
 #endif // SSE
 
 #endif // x86_64
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp
index 19157f03f..01052cebc 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp
@@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 // AVX512 functions
 #ifdef OPT_AVX512F
 bool avx512f_supported = opt.avx512f;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported);
 #endif // AVX512F
 
 // AVX functions
 #ifdef OPT_AVX
 bool avx_supported = opt.avx;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, AVX, 8, avx_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX, 8, avx_supported);
 #endif // AVX
 
 // SSE functions
 #ifdef OPT_SSE
 bool sse_supported = opt.sse;
-INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported);
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported);
 #endif // SSE
 
 #endif // x86_64
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
new file mode 100644
index 000000000..0adde8972
--- /dev/null
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
@@ -0,0 +1,56 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+#include <benchmark/benchmark.h>
+#include <random>
+#include <cstring>
+#include "utils/tests_utils.h"
+#include "bm_spaces.h"
+
+class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture {
+protected:
+    std::mt19937 rng;
+    size_t dim;
+    int8_t *v1, *v2;
+
+public:
+    BM_VecSimSpaces_Integers_INT8() { rng.seed(47); }
+    ~BM_VecSimSpaces_Integers_INT8() = default;
+
+    void SetUp(const ::benchmark::State &state) {
+        dim = state.range(0);
+        // Allocate vector with extra space for cosine calculations
+        v1 = new int8_t[dim + sizeof(float)];
+        v2 = new int8_t[dim + sizeof(float)];
+        test_utils::populate_int8_vec(v1, dim, 123);
+        test_utils::populate_int8_vec(v2, dim, 1234);
+
+        // Store the norm in the extra space for cosine calculations
+        *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim);
+        *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim);
+    }
+    void TearDown(const ::benchmark::State &state) {
+        delete v1;
+        delete v2;
+    }
+};
+
+#ifdef CPU_FEATURES_ARCH_X86_64
+cpu_features::X86Features opt = cpu_features::GetX86Info().features;
+
+// AVX512_F_BW_VL_VNNI functions
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32,
+                                avx512_f_bw_vl_vnni_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32,
+                                 avx512_f_bw_vl_vnni_supported)
+#endif // AVX512_F_BW_VL_VNNI
+
+#endif // x86_64
+
+    INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32);
+INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32);
+BENCHMARK_MAIN();
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index b16bddac6..caa3fc522 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -30,15 +30,15 @@ endif()
 
 include(${root}/cmake/x86_64InstructionFlags.cmake)
 
-add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp test_utils.cpp)
-add_executable(test_hnsw_parallel test_hnsw_parallel.cpp test_utils.cpp)
-add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp test_utils.cpp)
-add_executable(test_allocator test_allocator.cpp test_utils.cpp)
+add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp)
+add_executable(test_hnsw_parallel test_hnsw_parallel.cpp unit_test_utils.cpp)
+add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp unit_test_utils.cpp)
+add_executable(test_allocator test_allocator.cpp unit_test_utils.cpp)
 add_executable(test_spaces test_spaces.cpp)
 add_executable(test_types test_types.cpp)
-add_executable(test_common ../utils/mock_thread_pool.cpp test_utils.cpp test_common.cpp)
-add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp test_utils.cpp)
-add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp test_utils.cpp)
+add_executable(test_common ../utils/mock_thread_pool.cpp unit_test_utils.cpp test_common.cpp)
+add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp unit_test_utils.cpp)
+add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp unit_test_utils.cpp)
 
 target_link_libraries(test_hnsw PUBLIC gtest_main VectorSimilarity)
 target_link_libraries(test_hnsw_parallel PUBLIC gtest_main VectorSimilarity)
diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp
index 4eb389260..03f0b4e52 100644
--- a/tests/unit/test_allocator.cpp
+++ b/tests/unit/test_allocator.cpp
@@ -10,7 +10,7 @@
 #include "VecSim/memory/vecsim_base.h"
 #include "VecSim/algorithms/brute_force/brute_force_single.h"
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/utils/serializer.h"
 #include "VecSim/index_factories/hnsw_factory.h"
 
@@ -83,7 +83,7 @@ TEST_F(AllocatorTest, test_nested_object) {
 template <typename index_type_t>
 class IndexAllocatorTest : public ::testing::Test {};
 
-// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h
+// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h
 
 TYPED_TEST_SUITE(IndexAllocatorTest, DataTypeSet);
 
diff --git a/tests/unit/test_bf16.cpp b/tests/unit/test_bf16.cpp
index 921c80c35..95e12c98b 100644
--- a/tests/unit/test_bf16.cpp
+++ b/tests/unit/test_bf16.cpp
@@ -2,7 +2,7 @@
 #include "VecSim/vec_sim.h"
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
 #include "VecSim/index_factories/hnsw_factory.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/utils/serializer.h"
 #include "mock_thread_pool.h"
 #include "VecSim/query_result_definitions.h"
diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp
index c56415e3d..b3d5b1192 100644
--- a/tests/unit/test_bruteforce.cpp
+++ b/tests/unit/test_bruteforce.cpp
@@ -6,7 +6,7 @@
 
 #include "gtest/gtest.h"
 #include "VecSim/vec_sim.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/algorithms/brute_force/brute_force.h"
 #include "VecSim/algorithms/brute_force/brute_force_single.h"
 #include "cpu_features_macros.h"
@@ -32,7 +32,7 @@ class BruteForceTest : public ::testing::Test {
     }
 };
 
-// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h
+// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h
 
 TYPED_TEST_SUITE(BruteForceTest, DataTypeSet);
 
diff --git a/tests/unit/test_bruteforce_multi.cpp b/tests/unit/test_bruteforce_multi.cpp
index ef9cfc636..55aadedd4 100644
--- a/tests/unit/test_bruteforce_multi.cpp
+++ b/tests/unit/test_bruteforce_multi.cpp
@@ -6,7 +6,7 @@
 
 #include "gtest/gtest.h"
 #include "VecSim/vec_sim.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/algorithms/brute_force/brute_force_multi.h"
 #include <cmath>
 
@@ -27,7 +27,7 @@ class BruteForceMultiTest : public ::testing::Test {
     }
 };
 
-// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h
+// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h
 
 TYPED_TEST_SUITE(BruteForceMultiTest, DataTypeSet);
 
diff --git a/tests/unit/test_common.cpp b/tests/unit/test_common.cpp
index 58df46fba..bdfd6d9f2 100644
--- a/tests/unit/test_common.cpp
+++ b/tests/unit/test_common.cpp
@@ -10,7 +10,7 @@
 #include "VecSim/query_result_definitions.h"
 #include "VecSim/utils/updatable_heap.h"
 #include "VecSim/utils/vec_utils.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/containers/vecsim_results_container.h"
 #include "VecSim/algorithms/hnsw/hnsw.h"
 #include "VecSim/index_factories/hnsw_factory.h"
@@ -32,7 +32,7 @@ using float16 = vecsim_types::float16;
 template <typename index_type_t>
 class CommonIndexTest : public ::testing::Test {};
 
-// DataTypeSet are defined in test_utils.h
+// DataTypeSet are defined in unit_test_utils.h
 
 TYPED_TEST_SUITE(CommonIndexTest, DataTypeSet);
 
diff --git a/tests/unit/test_fp16.cpp b/tests/unit/test_fp16.cpp
index 377ef8f32..244bb9d0c 100644
--- a/tests/unit/test_fp16.cpp
+++ b/tests/unit/test_fp16.cpp
@@ -2,7 +2,7 @@
 #include "VecSim/vec_sim.h"
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
 #include "VecSim/index_factories/hnsw_factory.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/utils/serializer.h"
 #include "mock_thread_pool.h"
 #include "VecSim/query_result_definitions.h"
diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp
index f57a6d3e3..cc400d48a 100644
--- a/tests/unit/test_hnsw.cpp
+++ b/tests/unit/test_hnsw.cpp
@@ -9,7 +9,7 @@
 #include "VecSim/vec_sim_debug.h"
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
 #include "VecSim/index_factories/hnsw_factory.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/utils/serializer.h"
 #include "VecSim/query_result_definitions.h"
 #include <unistd.h>
@@ -36,7 +36,7 @@ class HNSWTest : public ::testing::Test {
     }
 };
 
-// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h
+// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h
 
 TYPED_TEST_SUITE(HNSWTest, DataTypeSet);
 
diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp
index ba87f1759..026f96e62 100644
--- a/tests/unit/test_hnsw_multi.cpp
+++ b/tests/unit/test_hnsw_multi.cpp
@@ -6,7 +6,7 @@
 
 #include "gtest/gtest.h"
 #include "VecSim/vec_sim.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/algorithms/hnsw/hnsw_multi.h"
 #include <cmath>
 #include <map>
@@ -31,7 +31,7 @@ class HNSWMultiTest : public ::testing::Test {
     }
 };
 
-// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h
+// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h
 
 TYPED_TEST_SUITE(HNSWMultiTest, DataTypeSet);
 
diff --git a/tests/unit/test_hnsw_parallel.cpp b/tests/unit/test_hnsw_parallel.cpp
index a2d4827ca..0354a6af1 100644
--- a/tests/unit/test_hnsw_parallel.cpp
+++ b/tests/unit/test_hnsw_parallel.cpp
@@ -7,7 +7,7 @@
 #include "gtest/gtest.h"
 #include "VecSim/vec_sim.h"
 #include "VecSim/algorithms/hnsw/hnsw_single.h"
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "VecSim/query_result_definitions.h"
 #include "VecSim/vec_sim_debug.h"
 #include <unistd.h>
@@ -124,7 +124,7 @@ class HNSWTestParallel : public ::testing::Test {
     void parallelInsertSearch(bool is_multi);
 };
 
-// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h
+// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h
 
 TYPED_TEST_SUITE(HNSWTestParallel, DataTypeSet);
 
diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp
index ee5fef7d1..676424fd7 100644
--- a/tests/unit/test_hnsw_tiered.cpp
+++ b/tests/unit/test_hnsw_tiered.cpp
@@ -6,7 +6,7 @@
 #include <string>
 #include <array>
 
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "mock_thread_pool.h"
 
 #include <thread>
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 7cf7de92b..9931d318a 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -23,9 +23,11 @@
 #include "VecSim/spaces/functions/AVX512BW_VBMI2.h"
 #include "VecSim/spaces/functions/AVX512BF16_VL.h"
 #include "VecSim/spaces/functions/AVX512FP16_VL.h"
+#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/F16C.h"
+#include "tests_utils.h"
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
@@ -102,6 +104,21 @@ TEST_F(SpacesTest, fp16_l2_no_optimization_func_test) {
     ASSERT_EQ(dist, FP32_L2Sqr((const void *)sanity_a, (const void *)sanity_b, dim));
 }
 
+TEST_F(SpacesTest, int8_l2_no_optimization_func_test) {
+    size_t dim = 5;
+
+    int8_t a[dim], b[dim];
+    for (size_t i = 0; i < dim; i++) {
+        a[i] = (i + 1);
+        b[i] = (i + 2);
+    }
+
+    float dist = INT8_L2Sqr((const void *)a, (const void *)b, dim);
+    ASSERT_EQ(dist, 5.0);
+}
+
+/* ======================== IP NO OPT ======================== */
+
 TEST_F(SpacesTest, float_ip_no_optimization_func_test) {
     size_t dim = 5;
 
@@ -211,6 +228,36 @@ TEST_F(SpacesTest, fp16_ip_no_optimization_func_test) {
     ASSERT_EQ(dist, FP32_InnerProduct((const void *)sanity_a, (const void *)sanity_b, dim));
 }
 
+TEST_F(SpacesTest, int8_ip_no_optimization_func_test) {
+    size_t dim = 4;
+    int8_t a[] = {1, 0, 0, 0};
+    int8_t b[] = {1, 0, 0, 0};
+
+    float dist = INT8_InnerProduct((const void *)a, (const void *)b, dim);
+    ASSERT_EQ(dist, 0.0);
+}
+
+/* ======================== Cosine NO OPT ======================== */
+
+TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) {
+    size_t dim = 4;
+    // create a vector with extra space for the norm
+    int8_t *v1 = new int8_t[dim + sizeof(float)];
+    int8_t *v2 = new int8_t[dim + sizeof(float)];
+
+    test_utils::populate_int8_vec(v1, dim, 123);
+    test_utils::populate_int8_vec(v2, dim, 123);
+
+    // write the norm at the end of the vector
+    *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim);
+    *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim);
+
+    float dist = INT8_Cosine((const void *)v1, (const void *)v2, dim);
+    ASSERT_NEAR(dist, 0.0, 0.000001);
+}
+
+/* ======================== Test Getters ======================== */
+
 TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) {
     EXPECT_THROW(
         (spaces::GetDistFunc<float, float>((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)),
@@ -231,6 +278,11 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricFP16) {
         (spaces::GetDistFunc<float16, float>((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)),
         std::invalid_argument);
 }
+TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) {
+    EXPECT_THROW(
+        (spaces::GetDistFunc<int8_t, float>((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)),
+        std::invalid_argument);
+}
 
 using namespace spaces;
 
@@ -241,27 +293,38 @@ TEST_F(SpacesTest, smallDimChooser) {
         ASSERT_EQ(L2_FP64_GetDistFunc(dim), FP64_L2Sqr);
         ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian);
         ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr);
+        ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr);
         ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct);
         ASSERT_EQ(IP_FP64_GetDistFunc(dim), FP64_InnerProduct);
         ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
+        ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
     for (size_t dim = 8; dim < 16; dim++) {
         ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr);
         ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian);
         ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr);
+        ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr);
         ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct);
         ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
+        ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
     for (size_t dim = 16; dim < 32; dim++) {
         ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian);
         ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr);
+        ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr);
         ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
+        ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
 }
 
+/* ======================== Test SIMD Functions ======================== */
+
 // In this following tests we assume that compiler supports all X86 optimizations, so if we have
 // some hardware flag enabled, we check that the corresponding optimization function was chosen.
 #ifdef CPU_FEATURES_ARCH_X86_64
@@ -849,4 +912,118 @@ INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced,
 
 #endif
 
+class INT8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
+
+TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) {
+    auto optimization = cpu_features::GetX86Info().features;
+    size_t dim = GetParam();
+    int8_t *v1 = new int8_t[dim];
+    int8_t *v2 = new int8_t[dim];
+    test_utils::populate_int8_vec(v1, dim, 123);
+    test_utils::populate_int8_vec(v2, dim, 1234);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = INT8_L2Sqr(v1, v2, dim);
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
+        optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim;
+        ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
+            optimization.avx512vnni = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, INT8_L2Sqr) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) {
+    auto optimization = cpu_features::GetX86Info().features;
+    size_t dim = GetParam();
+    int8_t *v1 = new int8_t[dim];
+    int8_t *v2 = new int8_t[dim];
+    test_utils::populate_int8_vec(v1, dim, 123);
+    test_utils::populate_int8_vec(v2, dim, 1234);
+
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = INT8_InnerProduct(v1, v2, dim);
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
+        optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim;
+        ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
+            optimization.avx512vnni = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, INT8_InnerProduct)
+        << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
+    auto optimization = cpu_features::GetX86Info().features;
+    size_t dim = GetParam();
+    int8_t *v1 = new int8_t[dim + sizeof(float)];
+    int8_t *v2 = new int8_t[dim + sizeof(float)];
+    test_utils::populate_int8_vec(v1, dim, 123);
+    test_utils::populate_int8_vec(v2, dim, 1234);
+
+    // write the norm at the end of the vector
+    *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim);
+    *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim);
+
+    dist_func_t<float> arch_opt_func;
+    float baseline = INT8_Cosine(v1, v2, dim);
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl &&
+        optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim;
+        // We don't align int8 vectors with cosine distance
+        ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
+        // Unset optimizations flag, so we'll choose the next optimization.
+        optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
+            optimization.avx512vnni = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(arch_opt_func, INT8_Cosine) << "Unexpected distance function chosen for dim " << dim;
+    ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim;
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest,
+                         testing::Range(32UL, 32 * 2UL + 1));
+
 #endif // CPU_FEATURES_ARCH_X86_64
diff --git a/tests/unit/test_utils.cpp b/tests/unit/unit_test_utils.cpp
similarity index 99%
rename from tests/unit/test_utils.cpp
rename to tests/unit/unit_test_utils.cpp
index 7b99eba22..89973d19d 100644
--- a/tests/unit/test_utils.cpp
+++ b/tests/unit/unit_test_utils.cpp
@@ -4,7 +4,7 @@
  *the Server Side Public License v1 (SSPLv1).
  */
 
-#include "test_utils.h"
+#include "unit_test_utils.h"
 #include "gtest/gtest.h"
 #include "VecSim/utils/vec_utils.h"
 #include "VecSim/memory/vecsim_malloc.h"
diff --git a/tests/unit/test_utils.h b/tests/unit/unit_test_utils.h
similarity index 100%
rename from tests/unit/test_utils.h
rename to tests/unit/unit_test_utils.h
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
new file mode 100644
index 000000000..31dc3d9ef
--- /dev/null
+++ b/tests/utils/tests_utils.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <random>
+#include <vector>
+
+namespace test_utils {
+
+// Assuming v is a memory allocation of size dim * sizeof(float)
+static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) {
+
+    std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed
+
+    // uniform_int_distribution doesn't support int8,
+    // Define a distribution range for int8_t
+    std::uniform_int_distribution<int16_t> dis(-128, 127);
+
+    for (size_t i = 0; i < dim; i++) {
+        v[i] = static_cast<int8_t>(dis(gen));
+    }
+}
+
+// TODO: replace with normalize function from VecSim
+float compute_norm(const int8_t *vec, size_t dim) {
+    int norm = 0;
+    for (size_t i = 0; i < dim; i++) {
+        norm += vec[i] * vec[i];
+    }
+    return sqrt(norm);
+}
+
+} // namespace test_utils