From 2f34c1532b713d058fd242a204166675265393d5 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 3 Dec 2024 23:56:01 +0200 Subject: [PATCH 01/26] naive implementation of L2 --- src/VecSim/spaces/L2/L2.cpp | 24 +++++++++++++++++++++++ src/VecSim/types/int8.h | 39 +++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 src/VecSim/types/int8.h diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 5fba0555e..379e21c52 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -7,6 +7,7 @@ #include "L2.h" #include "VecSim/types/bfloat16.h" #include "VecSim/types/float16.h" +#include "VecSim/types/int8.h" #include using bfloat16 = vecsim_types::bfloat16; @@ -70,3 +71,26 @@ float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension) { } return res; } + +template +float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + int16_t a = vecsim_types::int8_to_int16(pVect1[i]); + int16_t b = vecsim_types::int8_to_int16(pVect2[i]); + int16_t diff = a - b; + res += diff * diff; + } + return float(res); +} + +float INT8_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { + return INT8_L2Sqr(pVect1v, pVect2v, dimension); +} + +float INT8_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { + return INT8_L2Sqr(pVect1v, pVect2v, dimension); +} diff --git a/src/VecSim/types/int8.h b/src/VecSim/types/int8.h new file mode 100644 index 000000000..73d6b188a --- /dev/null +++ b/src/VecSim/types/int8.h @@ -0,0 +1,39 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#pragma once + +#include +#include +#include + +namespace vecsim_types { +struct bfloat16 { + uint16_t val; + bfloat16() = default; + explicit constexpr bfloat16(uint16_t val) : val(val) {} + operator uint16_t() const { return val; } +}; + +static inline bfloat16 float_to_bf16(const float ff) { + uint32_t *p_f32 = (uint32_t *)&ff; + uint32_t f32 = *p_f32; + uint32_t lsb = (f32 >> 16) & 1; + uint32_t round = lsb + 0x7FFF; + f32 += round; + return bfloat16(f32 >> 16); +} + +template +inline float bfloat16_to_float32(bfloat16 val) { + size_t constexpr bytes_offset = is_little ? 
1 : 0; + float result = 0; + bfloat16 *p_result = (bfloat16 *)&result + bytes_offset; + *p_result = val; + return result; +} + +} // namespace vecsim_types From c641d2364975fe27413ec7cac5cb6db7e90488d8 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Wed, 4 Dec 2024 00:03:37 +0200 Subject: [PATCH 02/26] update --- src/VecSim/types/int8.h | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/VecSim/types/int8.h b/src/VecSim/types/int8.h index 73d6b188a..158f1622b 100644 --- a/src/VecSim/types/int8.h +++ b/src/VecSim/types/int8.h @@ -11,27 +11,12 @@ #include namespace vecsim_types { -struct bfloat16 { - uint16_t val; - bfloat16() = default; - explicit constexpr bfloat16(uint16_t val) : val(val) {} - operator uint16_t() const { return val; } -}; - -static inline bfloat16 float_to_bf16(const float ff) { - uint32_t *p_f32 = (uint32_t *)&ff; - uint32_t f32 = *p_f32; - uint32_t lsb = (f32 >> 16) & 1; - uint32_t round = lsb + 0x7FFF; - f32 += round; - return bfloat16(f32 >> 16); -} template -inline float bfloat16_to_float32(bfloat16 val) { +inline int16_t int8_to_int16(int8_t val) { size_t constexpr bytes_offset = is_little ? 1 : 0; - float result = 0; - bfloat16 *p_result = (bfloat16 *)&result + bytes_offset; + int result = 0; + int16_t *p_result = (int16_t *)&result + bytes_offset; *p_result = val; return result; } From 1c5eb9066083a990ddac89b7bcb3ce961d155159 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Fri, 6 Dec 2024 16:38:38 +0000 Subject: [PATCH 03/26] implment naive disatnce for int8 add cosine to spaces fix typos in calculator --- src/VecSim/spaces/CMakeLists.txt | 2 + src/VecSim/spaces/Cosine/Cosine.cpp | 23 +++++++++ src/VecSim/spaces/Cosine/Cosine.h | 11 ++++ src/VecSim/spaces/Cosine_space.cpp | 27 ++++++++++ src/VecSim/spaces/Cosine_space.h | 13 +++++ src/VecSim/spaces/IP/IP.cpp | 13 +++++ src/VecSim/spaces/IP/IP.h | 2 + src/VecSim/spaces/IP_space.cpp | 13 +++++ src/VecSim/spaces/IP_space.h | 2 + src/VecSim/spaces/L2/L2.cpp | 14 +----- src/VecSim/spaces/L2/L2.h | 2 + src/VecSim/spaces/L2_space.cpp | 14 ++++++ src/VecSim/spaces/L2_space.h | 2 + src/VecSim/spaces/computer/calculator.h | 4 +- .../spaces/functions/implementation_chooser.h | 4 +- src/VecSim/types/int8.h | 24 --------- tests/unit/test_spaces.cpp | 50 +++++++++++++++++++ 17 files changed, 180 insertions(+), 40 deletions(-) create mode 100644 src/VecSim/spaces/Cosine/Cosine.cpp create mode 100644 src/VecSim/spaces/Cosine/Cosine.h create mode 100644 src/VecSim/spaces/Cosine_space.cpp create mode 100644 src/VecSim/spaces/Cosine_space.h delete mode 100644 src/VecSim/types/int8.h diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index 9cc0baaaf..ad22e8187 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -3,6 +3,7 @@ project(VectorSimilaritySpaces_no_optimization) add_library(VectorSimilaritySpaces_no_optimization L2/L2.cpp IP/IP.cpp + Cosine/Cosine.cpp ) include(${root}/cmake/cpu_features.cmake) @@ -79,6 +80,7 @@ endif() add_library(VectorSimilaritySpaces L2_space.cpp IP_space.cpp + Cosine_space.cpp spaces.cpp ${OPTIMIZATIONS} computer/preprocessor_container.cpp diff --git a/src/VecSim/spaces/Cosine/Cosine.cpp b/src/VecSim/spaces/Cosine/Cosine.cpp new file mode 100644 index 000000000..1cbc9a191 --- /dev/null +++ b/src/VecSim/spaces/Cosine/Cosine.cpp @@ -0,0 +1,23 @@ +/* + *Copyright Redis Ltd. 
2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "Cosine.h" + +float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + int16_t a = pVect1[i]; + int16_t b = pVect2[i]; + res += a * b; + } + + float norm_v1 = *(float *)pVect1v; + float norm_v2 = *(float *)pVect2v; + return 1.0f - float(res) / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/Cosine/Cosine.h b/src/VecSim/spaces/Cosine/Cosine.h new file mode 100644 index 000000000..c42f6c14f --- /dev/null +++ b/src/VecSim/spaces/Cosine/Cosine.h @@ -0,0 +1,11 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#pragma once + +#include + +float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/Cosine_space.cpp b/src/VecSim/spaces/Cosine_space.cpp new file mode 100644 index 000000000..7cace4c32 --- /dev/null +++ b/src/VecSim/spaces/Cosine_space.cpp @@ -0,0 +1,27 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/Cosine_space.h" +#include "VecSim/spaces/Cosine/Cosine.h" + +namespace spaces { +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_Cosine; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. + if (dim < 32) { + return ret_dist_func; + } + return ret_dist_func; +} + +} // namespace spaces diff --git a/src/VecSim/spaces/Cosine_space.h b/src/VecSim/spaces/Cosine_space.h new file mode 100644 index 000000000..e139a5521 --- /dev/null +++ b/src/VecSim/spaces/Cosine_space.h @@ -0,0 +1,13 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#pragma once +#include "VecSim/spaces/spaces.h" + +namespace spaces { +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); +} // namespace spaces diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 98ad07676..1562e5b1a 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -66,3 +66,16 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension } return 1.0f - res; } + +float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + int16_t a = pVect1[i]; + int16_t b = pVect2[i]; + res += a * b; + } + return 1.0f - float(res); +} diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 50fecef33..64e11b52f 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -16,3 +16,5 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension float BF16_InnerProduct_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); + +float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index e6da26947..699919dc2 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -196,4 +196,17 @@ dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con return ret_dist_func; } +dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_InnerProduct; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. 
+ if (dim < 32) { + return ret_dist_func; + } + return ret_dist_func; +} } // namespace spaces diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index a3ab0f4f6..87407c1a3 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -16,4 +16,6 @@ dist_func_t IP_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 379e21c52..ef310418b 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -7,7 +7,6 @@ #include "L2.h" #include "VecSim/types/bfloat16.h" #include "VecSim/types/float16.h" -#include "VecSim/types/int8.h" #include using bfloat16 = vecsim_types::bfloat16; @@ -72,25 +71,16 @@ float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension) { return res; } -template float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; int res = 0; for (size_t i = 0; i < dimension; i++) { - int16_t a = vecsim_types::int8_to_int16(pVect1[i]); - int16_t b = vecsim_types::int8_to_int16(pVect2[i]); + int16_t a = pVect1[i]; + int16_t b = pVect2[i]; int16_t diff = a - b; res += diff * diff; } return float(res); } - -float INT8_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { - return INT8_L2Sqr(pVect1v, pVect2v, dimension); -} - -float INT8_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { - return INT8_L2Sqr(pVect1v, pVect2v, dimension); -} diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h index c367f2ee1..65649d4eb 100644 --- a/src/VecSim/spaces/L2/L2.h +++ b/src/VecSim/spaces/L2/L2.h @@ -16,3 +16,5 @@ float BF16_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t d float BF16_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension); + +float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 1c2b2b59f..3f9d83f03 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -189,4 +189,18 @@ dist_func_t L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con return ret_dist_func; } +dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_L2Sqr; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. 
+ if (dim < 32) { + return ret_dist_func; + } + return ret_dist_func; +} + } // namespace spaces diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h index 4a2ea801a..48e50a8c2 100644 --- a/src/VecSim/spaces/L2_space.h +++ b/src/VecSim/spaces/L2_space.h @@ -16,4 +16,6 @@ dist_func_t L2_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/computer/calculator.h b/src/VecSim/spaces/computer/calculator.h index 36e76deed..64e0d8dae 100644 --- a/src/VecSim/spaces/computer/calculator.h +++ b/src/VecSim/spaces/computer/calculator.h @@ -26,10 +26,10 @@ class IndexCalculatorInterface : public VecsimBaseObject { /** * This object purpose is to calculate the distance between two vectors. * It extends the IndexCalculatorInterface class' type to hold the distance function. - * Every specific implmentation of the distance claculater should hold by refrence or by value the + * Every specific implementation of the distance calculator should hold by reference or by value the * parameters required for the calculation. The distance calculation API of all DistanceCalculator * classes is: calc_dist(v1,v2,dim). Internally it calls the distance function according the - * template signature, allowing fexability in the distance function arguments. + * template signature, allowing flexibility in the distance function arguments. */ template class DistanceCalculatorInterface : public IndexCalculatorInterface { diff --git a/src/VecSim/spaces/functions/implementation_chooser.h b/src/VecSim/spaces/functions/implementation_chooser.h index 2903b8cc4..6bb61815e 100644 --- a/src/VecSim/spaces/functions/implementation_chooser.h +++ b/src/VecSim/spaces/functions/implementation_chooser.h @@ -40,8 +40,8 @@ // out: The output variable that will be set to the chosen implementation. // dim: The dimension. // chunk: The chunk size. Can be 32, 16 or 8. 32 for 16-bit elements, 16 for 32-bit elements, 8 -// for 64-bit elements. func: The templated function that we want to choose the implementation -// for. +// for 64-bit elements. +// func: The templated function that we want to choose the implementation for. #define CHOOSE_IMPLEMENTATION(out, dim, chunk, func) \ do { \ decltype(out) __ret_dist_func; \ diff --git a/src/VecSim/types/int8.h b/src/VecSim/types/int8.h deleted file mode 100644 index 158f1622b..000000000 --- a/src/VecSim/types/int8.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). - */ - -#pragma once - -#include -#include -#include - -namespace vecsim_types { - -template -inline int16_t int8_to_int16(int8_t val) { - size_t constexpr bytes_offset = is_little ? 
1 : 0; - int result = 0; - int16_t *p_result = (int16_t *)&result + bytes_offset; - *p_result = val; - return result; -} - -} // namespace vecsim_types diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 7cf7de92b..2a58d6072 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -12,10 +12,12 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/IP/IP.h" #include "VecSim/spaces/L2/L2.h" +#include "VecSim/spaces/Cosine/Cosine.h" #include "VecSim/utils/vec_utils.h" #include "VecSim/types/bfloat16.h" #include "VecSim/spaces/IP_space.h" #include "VecSim/spaces/L2_space.h" +#include "VecSim/spaces/Cosine_space.h" #include "VecSim/types/float16.h" #include "VecSim/spaces/functions/AVX512F.h" #include "VecSim/spaces/functions/AVX.h" @@ -102,6 +104,21 @@ TEST_F(SpacesTest, fp16_l2_no_optimization_func_test) { ASSERT_EQ(dist, FP32_L2Sqr((const void *)sanity_a, (const void *)sanity_b, dim)); } +TEST_F(SpacesTest, int8_l2_no_optimization_func_test) { + size_t dim = 5; + + int8_t a[dim], b[dim]; + for (size_t i = 0; i < dim; i++) { + a[i] = (i + 1); + b[i] = (i + 2); + } + + float dist = INT8_L2Sqr((const void *)a, (const void *)b, dim); + ASSERT_EQ(dist, 5.0); +} + +/* ======================== IP NO OPT ======================== */ + TEST_F(SpacesTest, float_ip_no_optimization_func_test) { size_t dim = 5; @@ -211,6 +228,34 @@ TEST_F(SpacesTest, fp16_ip_no_optimization_func_test) { ASSERT_EQ(dist, FP32_InnerProduct((const void *)sanity_a, (const void *)sanity_b, dim)); } +TEST_F(SpacesTest, int8_ip_no_optimization_func_test) { + size_t dim = 4; + int8_t a[] = {1, 0, 0, 0}; + int8_t b[] = {1, 0, 0, 0}; + + float dist = INT8_InnerProduct((const void *)a, (const void *)b, dim); + ASSERT_EQ(dist, 0.0); +} + +/* ======================== Cosine NO OPT ======================== */ + +TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) { + size_t dim = 4; + // create normalized vector with extra space for the norm + std::vector vec1(dim + sizeof(float), 0); + std::vector vec2(dim + sizeof(float), 0); + + vec1[0] = 1; // {1, 0, 0, 0} + vec2[1] = 1; // {1, 0, 0, 0} + + // write the norm at the end of the vector + *(float *)(vec1.data() + dim) = 1.0; + *(float *)(vec2.data() + dim) = 1.0; + + float dist = INT8_InnerProduct((const void *)vec1.data(), (const void *)vec2.data(), dim); + ASSERT_EQ(dist, 1.0); +} + TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) { EXPECT_THROW( (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), @@ -231,6 +276,11 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricFP16) { (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), std::invalid_argument); } +// TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) { +// EXPECT_THROW( +// (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, +// nullptr)), std::invalid_argument); +// } using namespace spaces; From fa8e9ff6856072b6d72fa4b4c0c84473b0268017 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 05:58:56 +0000 Subject: [PATCH 04/26] imp choose L2 int8 with 256bit loop add spaces unit tests for int8 L2 add compilation flags introduce tests/utils for general utils --- cmake/x86_64InstructionFlags.cmake | 5 ++ src/VecSim/spaces/CMakeLists.txt | 6 ++ .../spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 58 +++++++++++++++++++ src/VecSim/spaces/L2_space.cpp | 13 +++++ .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 23 ++++++++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 15 +++++ tests/unit/test_spaces.cpp | 49 
++++++++++++++++ tests/utils/tests_utils.h | 24 ++++++++ 8 files changed, 193 insertions(+) create mode 100644 src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h create mode 100644 src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp create mode 100644 src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h create mode 100644 tests/utils/tests_utils.h diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake index 1fedda7fe..1ff8f48f2 100644 --- a/cmake/x86_64InstructionFlags.cmake +++ b/cmake/x86_64InstructionFlags.cmake @@ -13,6 +13,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2) CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16) CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F) + CHECK_CXX_COMPILER_FLAG(-mavx512vnni CXX_AVX512VNNI) CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2) CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX) CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C) @@ -48,6 +49,10 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") add_compile_definitions(OPT_AVX512_BW_VBMI2) endif() + if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI) + add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI) + endif() + if(CXX_F16C AND CXX_FMA AND CXX_AVX) add_compile_definitions(OPT_F16C) endif() diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index ad22e8187..fc23adc18 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -45,6 +45,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") list(APPEND OPTIMIZATIONS functions/AVX512F.cpp) endif() + if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI) + message("Building with AVX512F, AVX512BW, AVX512VL and AVX512VNNI") + set_source_files_properties(functions/AVX512F_BW_VL_VNNI.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512vl -mavx512vnni") + list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp) + endif() + if(CXX_AVX2) message("Building with AVX2") set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2) diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h new file mode 100644 index 000000000..9130d6414 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -0,0 +1,58 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" + +static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { + __m256i temp_a = _mm256_loadu_epi8(pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += 32; + + __m256i temp_b = _mm256_loadu_epi8(pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += 32; + + __m512i diff = _mm512_sub_epi16(va, vb); + // _mm512_dpwssd_epi32(src, a, b) + // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding + // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results + // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
+ sum = _mm512_dpwssd_epi32(sum, diff, diff); +} + +template // 0..32 +float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + const int8_t *pEnd1 = pVect1 + dimension; + + __m512i sum = _mm512_setzero_epi32(); + + // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, + // so mask loading is guaranteed to be safe + if constexpr (residual) { + __mmask32 mask = (1LU << residual) - 1; + __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += residual; + + __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += residual; + + __m512i diff = _mm512_sub_epi16(va, vb); + sum = _mm512_dpwssd_epi32(sum, diff, diff); + } + + // We dealt with the residual part. We are left with some multiple of 32-int_8. + do { + L2SqrStep(pVect1, pVect2, sum); + } while (pVect1 < pEnd1); + + return _mm512_reduce_add_epi32(sum); +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 3f9d83f03..3ae927224 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -15,6 +15,7 @@ #include "VecSim/spaces/functions/SSE.h" #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" @@ -200,6 +201,18 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con if (dim < 32) { return ret_dist_func; } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. + return Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim); + } +#endif +#endif // __x86_64__ return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp new file mode 100644 index 000000000..d906a5775 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -0,0 +1,23 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "AVX512BW_VBMI2.h" + +#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h new file mode 100644 index 000000000..c1f32ff10 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -0,0 +1,15 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim); + +} // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 2a58d6072..c9e8d68b8 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -25,9 +25,11 @@ #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/F16C.h" +#include "../utils/tests_utils.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; @@ -256,6 +258,8 @@ TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) { ASSERT_EQ(dist, 1.0); } +/* ======================== Test Getters ======================== */ + TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) { EXPECT_THROW( (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), @@ -291,6 +295,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(L2_FP64_GetDistFunc(dim), FP64_L2Sqr); ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_FP64_GetDistFunc(dim), FP64_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); @@ -300,6 +305,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr); ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); @@ -307,11 +313,14 @@ TEST_F(SpacesTest, smallDimChooser) { for (size_t dim = 16; dim < 32; dim++) { ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); } } +/* ======================== Test SIMD Functions ======================== */ + // In this following tests we assume that compiler supports all X86 optimizations, so if we have // some hardware flag enabled, we check that the corresponding optimization function was chosen. #ifdef CPU_FEATURES_ARCH_X86_64 @@ -899,4 +908,44 @@ INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced, #endif +class INT8SpacesOptimizationTest : public testing::TestWithParam {}; + +TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + auto v1 = test_utils::create_int8_vec(dim); + auto v2 = test_utils::create_int8_vec(dim); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? 
elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_L2Sqr(v1.data(), v2.data(), dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, + testing::Range(32UL, 32 * 2UL + 1)); + #endif // CPU_FEATURES_ARCH_X86_64 diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h new file mode 100644 index 000000000..568fc1a49 --- /dev/null +++ b/tests/utils/tests_utils.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + +namespace test_utils { + +std::vector create_int8_vec(size_t dim) { + + std::mt19937 gen(1234); // Mersenne Twister engine initialized with the fixed seed + + // uniform_int_distribution doesn't support int8, + // Define a distribution range for int8_t + std::uniform_int_distribution dis(-128, 127); + + std::vector vec(dim); + for (auto &num : vec) { + num = static_cast(dis(gen)); + } + + return vec; +} + +} // namespace test_utils From a7a556f43430ebba03121c7cf753464472150ed3 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 11:23:30 +0200 Subject: [PATCH 05/26] imp space bm for int8 change INITIALIZE_BENCHMARKS_SET to INITIALIZE_BENCHMARKS_SET_L2_IP introduce INITIALIZE_BENCHMARKS_SET_COSINE fix typos in Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI name --- src/VecSim/spaces/L2_space.cpp | 2 +- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 2 +- .../spaces/functions/AVX512F_BW_VL_VNNI.h | 2 +- tests/benchmark/spaces_benchmarks/bm_spaces.h | 6 ++ .../spaces_benchmarks/bm_spaces_bf16.cpp | 8 +-- .../spaces_benchmarks/bm_spaces_fp16.cpp | 8 +-- .../spaces_benchmarks/bm_spaces_fp32.cpp | 6 +- .../spaces_benchmarks/bm_spaces_fp64.cpp | 6 +- .../spaces_benchmarks/bm_spaces_int8.cpp | 56 +++++++++++++++++++ tests/unit/test_spaces.cpp | 2 +- 10 files changed, 80 insertions(+), 18 deletions(-) create mode 100644 tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 3ae927224..c0bec428f 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -209,7 +209,7 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual *alignment = 32 * sizeof(int8_t); // align to 256 bits. 
- return Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim); + return Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #endif // __x86_64__ diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index d906a5775..d82d4141d 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -12,7 +12,7 @@ namespace spaces { #include "implementation_chooser.h" -dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim) { +dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index c1f32ff10..818b9529f 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -10,6 +10,6 @@ namespace spaces { -dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim); +dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 3b55a9032..8b42ac030 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -123,6 +123,12 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); +#define INITIALIZE_BENCHMARKS_SET_COSINE(bm_class, type_prefix, arch, dim_opt, arch_supported) \ + INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ + INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ + INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ + INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); + #define INITIALIZE_BENCHMARKS_SET(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp index 8022c712a..27fe82a3d 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp @@ -26,20 +26,20 @@ INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_BF16, BF16, AVX512BF16_VL, 32, // AVX512 functions #ifdef OPT_AVX512_BW_VBMI2 bool avx512_bw_vbmi2_supported = opt.avx512bw && opt.avx512vbmi2; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32, - avx512_bw_vbmi2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32, + avx512_bw_vbmi2_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX2 bool avx2_supported = opt.avx2; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported); #endif // AVX // SSE functions #ifdef OPT_SSE3 bool sse3_supported = opt.sse3; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported); 
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp index c9bc42b0b..9457bc77d 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp @@ -22,8 +22,8 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; class BM_VecSimSpaces_FP16_adv : public BM_VecSimSpaces<_Float16> {}; bool avx512fp16_vl_supported = opt.avx512_fp16 && opt.avx512vl; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32, - avx512fp16_vl_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32, + avx512fp16_vl_supported); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, InnerProduct, 32); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32); @@ -32,12 +32,12 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32); // OPT_AVX512F functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported); #endif // OPT_AVX512F // AVX functions #ifdef OPT_F16C bool avx512_bw_f16c_supported = opt.f16c && opt.fma3 && opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported); #endif // OPT_F16C #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp index 289e42405..106b2abc8 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp @@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512 functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX bool avx_supported = opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported); #endif // AVX // SSE functions #ifdef OPT_SSE bool sse_supported = opt.sse; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp index 19157f03f..01052cebc 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp @@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512 functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX bool avx_supported = opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, 
AVX, 8, avx_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX, 8, avx_supported); #endif // AVX // SSE functions #ifdef OPT_SSE bool sse_supported = opt.sse; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp new file mode 100644 index 000000000..def14a8bd --- /dev/null +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -0,0 +1,56 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ +#include +#include +#include "../../utils/tests_utils.h" +#include "bm_spaces.h" + +class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { +protected: + std::mt19937 rng; + size_t dim; + int8_t *v1, *v2; + +public: + BM_VecSimSpaces_Integers_INT8() { rng.seed(47); } + ~BM_VecSimSpaces_Integers_INT8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + v1 = new int8_t[dim]; + v2 = new int8_t[dim]; + + // random for int8_t and uint8_t is not provided by the standard library + memcpy(v1, test_utils::create_int8_vec(dim).data(), dim); + memcpy(v2, test_utils::create_int8_vec(dim).data(), dim); + } + void TearDown(const ::benchmark::State &state) { + delete v1; + delete v2; + } +}; + + +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; + +// AVX512_BF16 functions +#ifdef OPT_AVX512_F_BW_VL_VNNI +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, + avx512_f_bw_vl_vnni_supported); +// INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, +// avx512_f_bw_vl_vnni_supported); +// INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, +// avx512_f_bw_vl_vnni_supported) +#endif // AVX512_BF16 + + +#endif // x86_64 + +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32); +BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index c9e8d68b8..2968b1c9f 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -928,7 +928,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim)) + ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; From 43064e865041ef58ffb3c272d5c0998c8ea9559c Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 09:26:01 +0000 Subject: [PATCH 06/26] fix INITIALIZE_BENCHMARKS_SET_L2_IP and add include to F_BW_VL_VNNI --- tests/benchmark/spaces_benchmarks/bm_spaces.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 8b42ac030..86cb45553 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -19,6 +19,7 @@ #include "VecSim/spaces/functions/AVX.h" #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/SSE3.h" @@ -129,6 +130,6 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); -#define INITIALIZE_BENCHMARKS_SET(bm_class, type_prefix, arch, dim_opt, arch_supported) \ +#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) From fb9f1ccf7064702b4e0b08b6633336223bbe4524 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 09:47:20 +0000 Subject: [PATCH 07/26] rename unit/test_utuils to unit_test_utils --- tests/benchmark/CMakeLists.txt | 2 +- tests/benchmark/benchmarks.sh | 10 ++++++++++ .../benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 3 ++- tests/unit/CMakeLists.txt | 14 +++++++------- tests/unit/test_allocator.cpp | 4 ++-- tests/unit/test_bf16.cpp | 2 +- tests/unit/test_bruteforce.cpp | 4 ++-- tests/unit/test_bruteforce_multi.cpp | 4 ++-- tests/unit/test_common.cpp | 4 ++-- tests/unit/test_fp16.cpp | 2 +- tests/unit/test_hnsw.cpp | 4 ++-- tests/unit/test_hnsw_multi.cpp | 4 ++-- tests/unit/test_hnsw_parallel.cpp | 4 ++-- tests/unit/test_hnsw_tiered.cpp | 2 +- tests/unit/test_spaces.cpp | 2 +- tests/unit/{test_utils.cpp => unit_test_utils.cpp} | 2 +- tests/unit/{test_utils.h => unit_test_utils.h} | 0 17 files changed, 39 insertions(+), 28 deletions(-) rename tests/unit/{test_utils.cpp => unit_test_utils.cpp} (99%) rename tests/unit/{test_utils.h => unit_test_utils.h} (100%) diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index 4d25a5499..2fa066e82 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -31,7 +31,7 @@ endforeach() include(${root}/cmake/x86_64InstructionFlags.cmake) -set(DATA_TYPE fp32 fp64 bf16 fp16) +set(DATA_TYPE fp32 fp64 bf16 fp16 int8) foreach(data_type IN LISTS DATA_TYPE) add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp) target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 11872e869..2e6664424 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -66,4 +66,14 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_fp16 echo spaces_fp64 echo spaces_bf16 +elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then + echo spaces_fp32 +elif [ "$BM_TYPE" = "bm-spaces-fp64" ] ; then + echo spaces_fp64 +elif [ "$BM_TYPE" = "bm-spaces-bf16" ] ; then + echo spaces_bf16 +elif [ "$BM_TYPE" = "bm-spaces-fp16" ] ; then + echo spaces_fp16 +elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then + echo spaces_int8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp 
b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index def14a8bd..8cb323043 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -5,7 +5,8 @@ */ #include #include -#include "../../utils/tests_utils.h" +#include +#include "utils/tests_utils.h" #include "bm_spaces.h" class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b16bddac6..caa3fc522 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -30,15 +30,15 @@ endif() include(${root}/cmake/x86_64InstructionFlags.cmake) -add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp test_utils.cpp) -add_executable(test_hnsw_parallel test_hnsw_parallel.cpp test_utils.cpp) -add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp test_utils.cpp) -add_executable(test_allocator test_allocator.cpp test_utils.cpp) +add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp) +add_executable(test_hnsw_parallel test_hnsw_parallel.cpp unit_test_utils.cpp) +add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp unit_test_utils.cpp) +add_executable(test_allocator test_allocator.cpp unit_test_utils.cpp) add_executable(test_spaces test_spaces.cpp) add_executable(test_types test_types.cpp) -add_executable(test_common ../utils/mock_thread_pool.cpp test_utils.cpp test_common.cpp) -add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp test_utils.cpp) -add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp test_utils.cpp) +add_executable(test_common ../utils/mock_thread_pool.cpp unit_test_utils.cpp test_common.cpp) +add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp unit_test_utils.cpp) +add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp unit_test_utils.cpp) target_link_libraries(test_hnsw PUBLIC gtest_main VectorSimilarity) target_link_libraries(test_hnsw_parallel PUBLIC gtest_main VectorSimilarity) diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp index 4eb389260..03f0b4e52 100644 --- a/tests/unit/test_allocator.cpp +++ b/tests/unit/test_allocator.cpp @@ -10,7 +10,7 @@ #include "VecSim/memory/vecsim_base.h" #include "VecSim/algorithms/brute_force/brute_force_single.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "VecSim/index_factories/hnsw_factory.h" @@ -83,7 +83,7 @@ TEST_F(AllocatorTest, test_nested_object) { template class IndexAllocatorTest : public ::testing::Test {}; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(IndexAllocatorTest, DataTypeSet); diff --git a/tests/unit/test_bf16.cpp b/tests/unit/test_bf16.cpp index 921c80c35..95e12c98b 100644 --- a/tests/unit/test_bf16.cpp +++ b/tests/unit/test_bf16.cpp @@ -2,7 +2,7 @@ #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "mock_thread_pool.h" #include "VecSim/query_result_definitions.h" diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 
c56415e3d..b3d5b1192 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/algorithms/brute_force/brute_force.h" #include "VecSim/algorithms/brute_force/brute_force_single.h" #include "cpu_features_macros.h" @@ -32,7 +32,7 @@ class BruteForceTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(BruteForceTest, DataTypeSet); diff --git a/tests/unit/test_bruteforce_multi.cpp b/tests/unit/test_bruteforce_multi.cpp index ef9cfc636..55aadedd4 100644 --- a/tests/unit/test_bruteforce_multi.cpp +++ b/tests/unit/test_bruteforce_multi.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/algorithms/brute_force/brute_force_multi.h" #include @@ -27,7 +27,7 @@ class BruteForceMultiTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(BruteForceMultiTest, DataTypeSet); diff --git a/tests/unit/test_common.cpp b/tests/unit/test_common.cpp index 58df46fba..bdfd6d9f2 100644 --- a/tests/unit/test_common.cpp +++ b/tests/unit/test_common.cpp @@ -10,7 +10,7 @@ #include "VecSim/query_result_definitions.h" #include "VecSim/utils/updatable_heap.h" #include "VecSim/utils/vec_utils.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/containers/vecsim_results_container.h" #include "VecSim/algorithms/hnsw/hnsw.h" #include "VecSim/index_factories/hnsw_factory.h" @@ -32,7 +32,7 @@ using float16 = vecsim_types::float16; template class CommonIndexTest : public ::testing::Test {}; -// DataTypeSet are defined in test_utils.h +// DataTypeSet are defined in unit_test_utils.h TYPED_TEST_SUITE(CommonIndexTest, DataTypeSet); diff --git a/tests/unit/test_fp16.cpp b/tests/unit/test_fp16.cpp index 377ef8f32..244bb9d0c 100644 --- a/tests/unit/test_fp16.cpp +++ b/tests/unit/test_fp16.cpp @@ -2,7 +2,7 @@ #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "mock_thread_pool.h" #include "VecSim/query_result_definitions.h" diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp index f57a6d3e3..cc400d48a 100644 --- a/tests/unit/test_hnsw.cpp +++ b/tests/unit/test_hnsw.cpp @@ -9,7 +9,7 @@ #include "VecSim/vec_sim_debug.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "VecSim/query_result_definitions.h" #include @@ -36,7 +36,7 @@ class HNSWTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWTest, DataTypeSet); diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp index ba87f1759..026f96e62 100644 --- a/tests/unit/test_hnsw_multi.cpp +++ b/tests/unit/test_hnsw_multi.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" 
+#include "unit_test_utils.h" #include "VecSim/algorithms/hnsw/hnsw_multi.h" #include #include @@ -31,7 +31,7 @@ class HNSWMultiTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWMultiTest, DataTypeSet); diff --git a/tests/unit/test_hnsw_parallel.cpp b/tests/unit/test_hnsw_parallel.cpp index a2d4827ca..0354a6af1 100644 --- a/tests/unit/test_hnsw_parallel.cpp +++ b/tests/unit/test_hnsw_parallel.cpp @@ -7,7 +7,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/query_result_definitions.h" #include "VecSim/vec_sim_debug.h" #include @@ -124,7 +124,7 @@ class HNSWTestParallel : public ::testing::Test { void parallelInsertSearch(bool is_multi); }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWTestParallel, DataTypeSet); diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp index 4b1df107a..db751b792 100644 --- a/tests/unit/test_hnsw_tiered.cpp +++ b/tests/unit/test_hnsw_tiered.cpp @@ -6,7 +6,7 @@ #include #include -#include "test_utils.h" +#include "unit_test_utils.h" #include "mock_thread_pool.h" #include diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 2968b1c9f..4a546b6ca 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -29,7 +29,7 @@ #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/F16C.h" -#include "../utils/tests_utils.h" +#include "tests_utils.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; diff --git a/tests/unit/test_utils.cpp b/tests/unit/unit_test_utils.cpp similarity index 99% rename from tests/unit/test_utils.cpp rename to tests/unit/unit_test_utils.cpp index 7b99eba22..89973d19d 100644 --- a/tests/unit/test_utils.cpp +++ b/tests/unit/unit_test_utils.cpp @@ -4,7 +4,7 @@ *the Server Side Public License v1 (SSPLv1). 
*/ -#include "test_utils.h" +#include "unit_test_utils.h" #include "gtest/gtest.h" #include "VecSim/utils/vec_utils.h" #include "VecSim/memory/vecsim_malloc.h" diff --git a/tests/unit/test_utils.h b/tests/unit/unit_test_utils.h similarity index 100% rename from tests/unit/test_utils.h rename to tests/unit/unit_test_utils.h From 602f8e94362d966c1f9f54de11618c94f7222363 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 11:22:32 +0000 Subject: [PATCH 08/26] seed create vec --- tests/benchmark/spaces_benchmarks/bm_spaces.h | 2 +- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 5 ++--- tests/unit/test_spaces.cpp | 8 ++++---- tests/utils/tests_utils.h | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 86cb45553..909f4d6dd 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -130,6 +130,6 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); -#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ +#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 8cb323043..234550202 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -25,8 +25,8 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { v2 = new int8_t[dim]; // random for int8_t and uint8_t is not provided by the standard library - memcpy(v1, test_utils::create_int8_vec(dim).data(), dim); - memcpy(v2, test_utils::create_int8_vec(dim).data(), dim); + memcpy(v1, test_utils::create_int8_vec(dim, 123).data(), dim); + memcpy(v2, test_utils::create_int8_vec(dim, 1234).data(), dim); } void TearDown(const ::benchmark::State &state) { delete v1; @@ -34,7 +34,6 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { } }; - #ifdef CPU_FEATURES_ARCH_X86_64 cpu_features::X86Features opt = cpu_features::GetX86Info().features; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 4a546b6ca..00f2a2d2f 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -913,8 +913,8 @@ class INT8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { auto optimization = cpu_features::GetX86Info().features; size_t dim = GetParam(); - auto v1 = test_utils::create_int8_vec(dim); - auto v2 = test_utils::create_int8_vec(dim); + auto v1 = test_utils::create_int8_vec(dim, 123); + auto v2 = test_utils::create_int8_vec(dim, 1234); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; @@ -931,7 +931,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << 
dim; - ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; @@ -946,6 +946,6 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { } INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, - testing::Range(32UL, 32 * 2UL + 1)); + testing::Range(64UL, 64 * 2UL + 1)); #endif // CPU_FEATURES_ARCH_X86_64 diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 568fc1a49..0231a9838 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -5,9 +5,9 @@ namespace test_utils { -std::vector create_int8_vec(size_t dim) { +static std::vector create_int8_vec(size_t dim, int seed = 1234) { - std::mt19937 gen(1234); // Mersenne Twister engine initialized with the fixed seed + std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed // uniform_int_distribution doesn't support int8, // Define a distribution range for int8_t From cde5e2d020910ce15e03fff6990bf2a70048acc6 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:22:22 +0000 Subject: [PATCH 09/26] format --- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 234550202..96c02a44c 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -37,7 +37,7 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { #ifdef CPU_FEATURES_ARCH_X86_64 cpu_features::X86Features opt = cpu_features::GetX86Info().features; -// AVX512_BF16 functions +// AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, @@ -46,8 +46,7 @@ INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_ // avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported) -#endif // AVX512_BF16 - +#endif // AVX512_F_BW_VL_VNNI #endif // x86_64 From cdb4d7f621513c70c5bdd00fafda2eb566ec014f Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:46:29 +0000 Subject: [PATCH 10/26] implmenet IP + unit test --- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 56 +++++++++++++++++++ src/VecSim/spaces/IP_space.cpp | 13 +++++ .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 6 ++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 1 + tests/unit/test_spaces.cpp | 43 +++++++++++++- 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h new file mode 100644 index 000000000..a7b99fcb8 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -0,0 +1,56 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#include "VecSim/spaces/space_includes.h" + +static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { + __m256i temp_a = _mm256_loadu_epi8(pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += 32; + + __m256i temp_b = _mm256_loadu_epi8(pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += 32; + + // _mm512_dpwssd_epi32(src, a, b) + // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding + // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results + // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. + sum = _mm512_dpwssd_epi32(sum, va, vb); +} + +template // 0..32 +float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + const int8_t *pEnd1 = pVect1 + dimension; + + __m512i sum = _mm512_setzero_epi32(); + + // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, + // so mask loading is guaranteed to be safe + if constexpr (residual) { + __mmask32 mask = (1LU << residual) - 1; + __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += residual; + + __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += residual; + + sum = _mm512_dpwssd_epi32(sum, va, vb); + } + + // We dealt with the residual part. We are left with some multiple of 32-int_8. + do { + InnerProductStep(pVect1, pVect2, sum); + } while (pVect1 < pEnd1); + + return 1.0f - float(_mm512_reduce_add_epi32(sum)); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 699919dc2..0cebb3bfb 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -16,6 +16,7 @@ #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" @@ -207,6 +208,18 @@ dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con if (dim < 32) { return ret_dist_func; } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. 
+ return Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ return ret_dist_func; } } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index d82d4141d..3d3da5546 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -7,6 +7,7 @@ #include "AVX512BW_VBMI2.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h" namespace spaces { @@ -17,6 +18,11 @@ dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } +dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} #include "implementation_chooser_cleanup.h" diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 818b9529f..c1ef5d6b8 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -11,5 +11,6 @@ namespace spaces { dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 00f2a2d2f..61e6f167c 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -300,6 +300,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(IP_FP64_GetDistFunc(dim), FP64_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); } for (size_t dim = 8; dim < 16; dim++) { ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr); @@ -309,6 +310,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); } for (size_t dim = 16; dim < 32; dim++) { ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); @@ -316,6 +318,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); } } @@ -931,7 +934,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; - ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. 
optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; @@ -945,7 +948,43 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } +TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + auto v1 = test_utils::create_int8_vec(dim, 123); + auto v2 = test_utils::create_int8_vec(dim, 1234); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_InnerProduct(v1.data(), v2.data(), dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, - testing::Range(64UL, 64 * 2UL + 1)); + testing::Range(32UL, 32 * 2UL + 1)); #endif // CPU_FEATURES_ARCH_X86_64 From 5f018903fd7b6ca1da042ffd24571839d363d1f0 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:50:45 +0000 Subject: [PATCH 11/26] ip bm --- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 96c02a44c..5ab3ebf7f 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -40,7 +40,7 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; -INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported); From 2dce6f0d3da611587f87001a9c7be9d7ea61e384 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:54:20 +0000 Subject: [PATCH 12/26] format --- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 5ab3ebf7f..cb12f10cb 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -41,7 +41,7 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, - avx512_f_bw_vl_vnni_supported); + avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, From 3d3b3758e3e2f701917cd680cc20a267641ecad0 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 15:34:34 +0000 Subject: [PATCH 13/26] implement cosine in ip API change create_int8_vec to populate_int8_vec add compute norm --- src/VecSim/spaces/CMakeLists.txt | 2 - src/VecSim/spaces/Cosine/Cosine.cpp | 23 ----- src/VecSim/spaces/Cosine/Cosine.h | 11 --- src/VecSim/spaces/Cosine_space.cpp | 27 ------ src/VecSim/spaces/Cosine_space.h | 13 --- src/VecSim/spaces/IP/IP.cpp | 14 ++- src/VecSim/spaces/IP/IP.h | 1 + .../spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 20 +++- src/VecSim/spaces/IP_space.cpp | 27 ++++++ src/VecSim/spaces/IP_space.h | 2 + .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 7 ++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 1 + src/VecSim/spaces/spaces.cpp | 13 +++ tests/benchmark/spaces_benchmarks/bm_spaces.h | 10 +- .../spaces_benchmarks/bm_spaces_int8.cpp | 23 ++--- tests/unit/test_spaces.cpp | 95 +++++++++++++------ tests/utils/tests_utils.h | 17 +++- 17 files changed, 176 insertions(+), 130 deletions(-) delete mode 100644 src/VecSim/spaces/Cosine/Cosine.cpp delete mode 100644 src/VecSim/spaces/Cosine/Cosine.h delete mode 100644 src/VecSim/spaces/Cosine_space.cpp delete mode 100644 src/VecSim/spaces/Cosine_space.h diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index fc23adc18..1fc9473b2 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -3,7 +3,6 @@ project(VectorSimilaritySpaces_no_optimization) add_library(VectorSimilaritySpaces_no_optimization L2/L2.cpp IP/IP.cpp - Cosine/Cosine.cpp ) include(${root}/cmake/cpu_features.cmake) @@ -86,7 +85,6 @@ endif() add_library(VectorSimilaritySpaces L2_space.cpp IP_space.cpp - Cosine_space.cpp spaces.cpp ${OPTIMIZATIONS} computer/preprocessor_container.cpp diff --git a/src/VecSim/spaces/Cosine/Cosine.cpp b/src/VecSim/spaces/Cosine/Cosine.cpp deleted file mode 100644 index 1cbc9a191..000000000 --- a/src/VecSim/spaces/Cosine/Cosine.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). 
- */ - -#include "Cosine.h" - -float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { - int8_t *pVect1 = (int8_t *)pVect1v; - int8_t *pVect2 = (int8_t *)pVect2v; - - int res = 0; - for (size_t i = 0; i < dimension; i++) { - int16_t a = pVect1[i]; - int16_t b = pVect2[i]; - res += a * b; - } - - float norm_v1 = *(float *)pVect1v; - float norm_v2 = *(float *)pVect2v; - return 1.0f - float(res) / (norm_v1 * norm_v2); -} diff --git a/src/VecSim/spaces/Cosine/Cosine.h b/src/VecSim/spaces/Cosine/Cosine.h deleted file mode 100644 index c42f6c14f..000000000 --- a/src/VecSim/spaces/Cosine/Cosine.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). - */ - -#pragma once - -#include - -float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/Cosine_space.cpp b/src/VecSim/spaces/Cosine_space.cpp deleted file mode 100644 index 7cace4c32..000000000 --- a/src/VecSim/spaces/Cosine_space.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). - */ - -#include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/Cosine_space.h" -#include "VecSim/spaces/Cosine/Cosine.h" - -namespace spaces { -dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, - const void *arch_opt) { - unsigned char dummy_alignment; - if (alignment == nullptr) { - alignment = &dummy_alignment; - } - - dist_func_t ret_dist_func = INT8_Cosine; - // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. - if (dim < 32) { - return ret_dist_func; - } - return ret_dist_func; -} - -} // namespace spaces diff --git a/src/VecSim/spaces/Cosine_space.h b/src/VecSim/spaces/Cosine_space.h deleted file mode 100644 index e139a5521..000000000 --- a/src/VecSim/spaces/Cosine_space.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). 
- */ - -#pragma once -#include "VecSim/spaces/spaces.h" - -namespace spaces { -dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); -} // namespace spaces diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 1562e5b1a..c3856abda 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -67,7 +67,7 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension return 1.0f - res; } -float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { +static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; @@ -77,5 +77,15 @@ float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensi int16_t b = pVect2[i]; res += a * b; } - return 1.0f - float(res); + return res; +} + +float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); +} + +float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); + float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); + return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2); } diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 64e11b52f..d712499ed 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -18,3 +18,4 @@ float BF16_InnerProduct_LittleEndian(const void *pVect1v, const void *pVect2v, s float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); +float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h index a7b99fcb8..fcd33c00c 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -23,8 +23,7 @@ static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &s } template // 0..32 -float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, - size_t dimension) { +static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; @@ -52,5 +51,20 @@ float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void InnerProductStep(pVect1, pVect2, sum); } while (pVect1 < pEnd1); - return 1.0f - float(_mm512_reduce_add_epi32(sum)); + return _mm512_reduce_add_epi32(sum); +} + +template // 0..32 +float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + + return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); +} +template // 0..32 +float INT8_CosineSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); + float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); + return 1.0f - + float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2); } diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 0cebb3bfb..b168b2d7d 100644 --- 
a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -222,4 +222,31 @@ dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con #endif // __x86_64__ return ret_dist_func; } + +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_Cosine; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. + if (dim < 32) { + return ret_dist_func; + } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. + return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} } // namespace spaces diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index 87407c1a3..0d8c3a836 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -18,4 +18,6 @@ dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 3d3da5546..cd66a6096 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -18,12 +18,19 @@ dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } + dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } +dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index c1ef5d6b8..532a33c76 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -12,5 +12,6 @@ namespace spaces { dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/spaces.cpp b/src/VecSim/spaces/spaces.cpp index 84f71b463..b512c9750 100644 --- a/src/VecSim/spaces/spaces.cpp +++ b/src/VecSim/spaces/spaces.cpp @@ -69,6 +69,19 @@ dist_func_t GetDistFunc(VecSimMetric metric, size_t dim, } throw std::invalid_argument("Invalid metric"); } +template <> +dist_func_t 
GetDistFunc(VecSimMetric metric, size_t dim, + unsigned char *alignment) { + switch (metric) { + case VecSimMetric_Cosine: + return Cosine_INT8_GetDistFunc(dim, alignment); + case VecSimMetric_IP: + return IP_INT8_GetDistFunc(dim, alignment); + case VecSimMetric_L2: + return L2_INT8_GetDistFunc(dim, alignment); + } + throw std::invalid_argument("Invalid metric"); +} template <> normalizeVector_f GetNormalizeFunc(void) { diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 909f4d6dd..b7431c43c 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -124,11 +124,11 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); -#define INITIALIZE_BENCHMARKS_SET_COSINE(bm_class, type_prefix, arch, dim_opt, arch_supported) \ - INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ - INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ - INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ - INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); +#define INITIALIZE_BENCHMARKS_SET_Cosine(bm_class, type_prefix, arch, dim_opt, arch_supported) \ + INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, Cosine, arch_supported); \ + INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, Cosine, arch_supported); \ + INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported); \ + INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported); #define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index cb12f10cb..0adde8972 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -21,12 +21,15 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { void SetUp(const ::benchmark::State &state) { dim = state.range(0); - v1 = new int8_t[dim]; - v2 = new int8_t[dim]; - - // random for int8_t and uint8_t is not provided by the standard library - memcpy(v1, test_utils::create_int8_vec(dim, 123).data(), dim); - memcpy(v2, test_utils::create_int8_vec(dim, 1234).data(), dim); + // Allocate vector with extra space for cosine calculations + v1 = new int8_t[dim + sizeof(float)]; + v2 = new int8_t[dim + sizeof(float)]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + // Store the norm in the extra space for cosine calculations + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); } void TearDown(const ::benchmark::State &state) { delete v1; @@ -42,14 +45,12 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, avx512_f_bw_vl_vnni_supported); -// INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, -// 
avx512_f_bw_vl_vnni_supported); -// INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, -// avx512_f_bw_vl_vnni_supported) +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, + avx512_f_bw_vl_vnni_supported) #endif // AVX512_F_BW_VL_VNNI #endif // x86_64 -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32); + INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32); BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 61e6f167c..f76069c46 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -12,12 +12,10 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/IP/IP.h" #include "VecSim/spaces/L2/L2.h" -#include "VecSim/spaces/Cosine/Cosine.h" #include "VecSim/utils/vec_utils.h" #include "VecSim/types/bfloat16.h" #include "VecSim/spaces/IP_space.h" #include "VecSim/spaces/L2_space.h" -#include "VecSim/spaces/Cosine_space.h" #include "VecSim/types/float16.h" #include "VecSim/spaces/functions/AVX512F.h" #include "VecSim/spaces/functions/AVX.h" @@ -243,19 +241,19 @@ TEST_F(SpacesTest, int8_ip_no_optimization_func_test) { TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) { size_t dim = 4; - // create normalized vector with extra space for the norm - std::vector vec1(dim + sizeof(float), 0); - std::vector vec2(dim + sizeof(float), 0); + // create a vector with extra space for the norm + int8_t *v1 = new int8_t[dim + sizeof(float)]; + int8_t *v2 = new int8_t[dim + sizeof(float)]; - vec1[0] = 1; // {1, 0, 0, 0} - vec2[1] = 1; // {1, 0, 0, 0} + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 123); // write the norm at the end of the vector - *(float *)(vec1.data() + dim) = 1.0; - *(float *)(vec2.data() + dim) = 1.0; + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); - float dist = INT8_InnerProduct((const void *)vec1.data(), (const void *)vec2.data(), dim); - ASSERT_EQ(dist, 1.0); + float dist = INT8_Cosine((const void *)v1, (const void *)v2, dim); + ASSERT_NEAR(dist, 0.0, 0.000001); } /* ======================== Test Getters ======================== */ @@ -280,11 +278,11 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricFP16) { (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), std::invalid_argument); } -// TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) { -// EXPECT_THROW( -// (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, -// nullptr)), std::invalid_argument); -// } +TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) { + EXPECT_THROW( + (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), + std::invalid_argument); +} using namespace spaces; @@ -916,8 +914,10 @@ class INT8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { auto optimization = cpu_features::GetX86Info().features; size_t dim = GetParam(); - auto v1 = test_utils::create_int8_vec(dim, 123); - auto v2 = test_utils::create_int8_vec(dim, 1234); + int8_t *v1 = new int8_t[dim]; + int8_t *v2 = new int8_t[dim]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; @@ -925,7 
+925,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { }; dist_func_t arch_opt_func; - float baseline = INT8_L2Sqr(v1.data(), v2.data(), dim); + float baseline = INT8_L2Sqr(v1, v2, dim); #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { @@ -933,7 +933,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. optimization.avx512f = optimization.avx512bw = optimization.avx512vl = @@ -943,16 +943,17 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { unsigned char alignment = 0; arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, INT8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) - << "No optimization with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { auto optimization = cpu_features::GetX86Info().features; size_t dim = GetParam(); - auto v1 = test_utils::create_int8_vec(dim, 123); - auto v2 = test_utils::create_int8_vec(dim, 1234); + int8_t *v1 = new int8_t[dim]; + int8_t *v2 = new int8_t[dim]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; @@ -960,7 +961,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { }; dist_func_t arch_opt_func; - float baseline = INT8_InnerProduct(v1.data(), v2.data(), dim); + float baseline = INT8_InnerProduct(v1, v2, dim); #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { @@ -968,7 +969,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. 
optimization.avx512f = optimization.avx512bw = optimization.avx512vl = @@ -979,8 +980,46 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, INT8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) - << "No optimization with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + int8_t *v1 = new int8_t[dim + sizeof(float)]; + int8_t *v2 = new int8_t[dim + sizeof(float)]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + // write the norm at the end of the vector + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_Cosine(v1, v2, dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. 
+ optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_Cosine) << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 0231a9838..31dc3d9ef 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -5,7 +5,8 @@ namespace test_utils { -static std::vector create_int8_vec(size_t dim, int seed = 1234) { +// Assuming v is a memory allocation of size dim * sizeof(float) +static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) { std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed @@ -13,12 +14,18 @@ static std::vector create_int8_vec(size_t dim, int seed = 1234) { // Define a distribution range for int8_t std::uniform_int_distribution dis(-128, 127); - std::vector vec(dim); - for (auto &num : vec) { - num = static_cast(dis(gen)); + for (size_t i = 0; i < dim; i++) { + v[i] = static_cast(dis(gen)); } +} - return vec; +// TODO: replace with normalize function from VecSim +float compute_norm(const int8_t *vec, size_t dim) { + int norm = 0; + for (size_t i = 0; i < dim; i++) { + norm += vec[i] * vec[i]; + } + return sqrt(norm); } } // namespace test_utils From 6f211b32159cd391e9ec4ec5b806f6109dcd899b Mon Sep 17 00:00:00 2001 From: meiravgri Date: Mon, 9 Dec 2024 11:43:18 +0000 Subject: [PATCH 14/26] use mask sub instead of msk load --- src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h index 9130d6414..edeb37b4b 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -37,15 +37,15 @@ float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect // so mask loading is guaranteed to be safe if constexpr (residual) { __mmask32 mask = (1LU << residual) - 1; - __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); + __m256i temp_a = _mm256_loadu_epi8(pVect1); __m512i va = _mm512_cvtepi8_epi16(temp_a); pVect1 += residual; - __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); + __m256i temp_b = _mm256_loadu_epi8(pVect2); __m512i vb = _mm512_cvtepi8_epi16(temp_b); pVect2 += residual; - __m512i diff = _mm512_sub_epi16(va, vb); + __m512i diff = _mm512_maskz_sub_epi16(mask, va, vb); sum = _mm512_dpwssd_epi32(sum, diff, diff); } From 6ac65a3476d3de2c261fb41ae581a785baf85623 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 06:14:36 +0000 Subject: [PATCH 15/26] loop size = 512 minimal dim = 32 --- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 21 ++++++++++++------- .../spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 21 ++++++++++++------- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 6 +++--- .../spaces/functions/implementation_chooser.h | 15 ++++++++----- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h index fcd33c00c..28187bf31 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -22,7 
+22,7 @@ static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &s sum = _mm512_dpwssd_epi32(sum, va, vb); } -template // 0..32 +template // 0..64 static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; @@ -33,23 +33,28 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, // so mask loading is guaranteed to be safe - if constexpr (residual) { - __mmask32 mask = (1LU << residual) - 1; + if constexpr (residual % 32) { + __mmask32 mask = (1LU << (residual % 32)) - 1; __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); __m512i va = _mm512_cvtepi8_epi16(temp_a); - pVect1 += residual; + pVect1 += residual % 32; __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); __m512i vb = _mm512_cvtepi8_epi16(temp_b); - pVect2 += residual; + pVect2 += residual % 32; sum = _mm512_dpwssd_epi32(sum, va, vb); } - // We dealt with the residual part. We are left with some multiple of 32-int_8. - do { + if constexpr (residual >= 32) { + InnerProductStep(pVect1, pVect2, sum); + } + + // We dealt with the residual part. We are left with some multiple of 64-int_8. + while (pVect1 < pEnd1) { + InnerProductStep(pVect1, pVect2, sum); InnerProductStep(pVect1, pVect2, sum); - } while (pVect1 < pEnd1); + } return _mm512_reduce_add_epi32(sum); } diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h index edeb37b4b..d47964ca2 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -23,7 +23,7 @@ static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { sum = _mm512_dpwssd_epi32(sum, diff, diff); } -template // 0..32 +template // 0..64 float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; @@ -35,24 +35,29 @@ float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, // so mask loading is guaranteed to be safe - if constexpr (residual) { - __mmask32 mask = (1LU << residual) - 1; + if constexpr (residual % 32) { + __mmask32 mask = (1LU << (residual % 32)) - 1; __m256i temp_a = _mm256_loadu_epi8(pVect1); __m512i va = _mm512_cvtepi8_epi16(temp_a); - pVect1 += residual; + pVect1 += residual % 32; __m256i temp_b = _mm256_loadu_epi8(pVect2); __m512i vb = _mm512_cvtepi8_epi16(temp_b); - pVect2 += residual; + pVect2 += residual % 32; __m512i diff = _mm512_maskz_sub_epi16(mask, va, vb); sum = _mm512_dpwssd_epi32(sum, diff, diff); } - // We dealt with the residual part. We are left with some multiple of 32-int_8. - do { + if constexpr (residual >= 32) { L2SqrStep(pVect1, pVect2, sum); - } while (pVect1 < pEnd1); + } + + // We dealt with the residual part. We are left with some multiple of 64-int_8. 
+ while (pVect1 < pEnd1) { + L2SqrStep(pVect1, pVect2, sum); + L2SqrStep(pVect1, pVect2, sum); + } return _mm512_reduce_add_epi32(sum); } diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index cd66a6096..599984954 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -15,19 +15,19 @@ namespace spaces { dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/implementation_chooser.h b/src/VecSim/spaces/functions/implementation_chooser.h index 6bb61815e..b32ad56c6 100644 --- a/src/VecSim/spaces/functions/implementation_chooser.h +++ b/src/VecSim/spaces/functions/implementation_chooser.h @@ -25,23 +25,28 @@ // of 4N, 4N+1, 4N+2, 4N+3. #define C4(X, func, N) X(4 * N, func) X(4 * N + 1, func) X(4 * N + 2, func) X(4 * N + 3, func) -// Macros for 8, 16 and 32 cases. Used to collapse the switch statement. Expands into 0-31, 0-15 or -// 0-7 cases. +// Macros for 8, 16, 32 and 64 cases. Used to collapse the switch statement. Expands into 0-63, +// 0-31, 0-15 or 0-7 cases. #define CASES32(X, func) \ C4(X, func, 0) \ C4(X, func, 1) \ C4(X, func, 2) C4(X, func, 3) C4(X, func, 4) C4(X, func, 5) C4(X, func, 6) C4(X, func, 7) #define CASES16(X, func) C4(X, func, 0) C4(X, func, 1) C4(X, func, 2) C4(X, func, 3) #define CASES8(X, func) C4(X, func, 0) C4(X, func, 1) +#define CASES64(X, func) \ + CASES32(X, func) \ + C4(X, func, 8) \ + C4(X, func, 9) \ + C4(X, func, 10) C4(X, func, 11) C4(X, func, 12) C4(X, func, 13) C4(X, func, 14) C4(X, func, 15) // Main macro. Expands into a switch statement that chooses the implementation based on the // dimension's remainder. // @params: // out: The output variable that will be set to the chosen implementation. // dim: The dimension. -// chunk: The chunk size. Can be 32, 16 or 8. 32 for 16-bit elements, 16 for 32-bit elements, 8 -// for 64-bit elements. -// func: The templated function that we want to choose the implementation for. +// chunk: The chunk size. Can be 64, 32, 16 or 8. 64 for 8-bit elements, 32 for 16-bit elements, +// 16 for 32-bit elements, 8 for 64-bit elements. func: The templated function that we want to +// choose the implementation for. 
#define CHOOSE_IMPLEMENTATION(out, dim, chunk, func) \ do { \ decltype(out) __ret_dist_func; \ From 0d07c5d672b1355258a5deac68faafe0a200610f Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 10:38:09 +0000 Subject: [PATCH 16/26] add int8 to bm --- tests/benchmark/benchmarks.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 2e6664424..867077ede 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -13,6 +13,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo spaces_fp64 echo spaces_bf16 echo spaces_fp16 + echo spaces_int8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo basics_single_fp32 echo basics_multi_fp32 @@ -20,6 +21,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_fp64 echo spaces_bf16 echo spaces_fp16 + echo spaces_int8 # Basic benchmarks elif [ "$BM_TYPE" = "bm-basics-fp32-single" ] ; then echo basics_single_fp32 @@ -66,6 +68,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_fp16 echo spaces_fp64 echo spaces_bf16 + echo spaces_int8 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then echo spaces_fp32 elif [ "$BM_TYPE" = "bm-spaces-fp64" ] ; then From 3586a76f8d7137404a0fa3370bb89a2241aa9e24 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 10:40:58 +0000 Subject: [PATCH 17/26] reanme to simd64 --- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 4 ++-- src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 2 +- src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h index 28187bf31..ffda357ff 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -60,13 +60,13 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, } template // 0..32 -float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, +float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); } template // 0..32 -float INT8_CosineSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, +float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h index d47964ca2..3f4ba33a6 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -24,7 +24,7 @@ static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { } template // 0..64 -float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, +float INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 599984954..661c2c945 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -15,19 +15,19 @@ namespace spaces { 
dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } From adbc4d7e1be2e6b6efeb6d82b7e342e52a174c5e Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 16:32:56 +0000 Subject: [PATCH 18/26] convert to int before multiplication --- tests/utils/tests_utils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 31dc3d9ef..01461a78d 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -23,7 +23,8 @@ static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) { float compute_norm(const int8_t *vec, size_t dim) { int norm = 0; for (size_t i = 0; i < dim; i++) { - norm += vec[i] * vec[i]; + int val = static_cast(vec[i]); + norm += val * val; } return sqrt(norm); } From cb2c88727f86a1fa6063d035ee0d933d4e2b3da9 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Mon, 16 Dec 2024 13:00:39 +0000 Subject: [PATCH 19/26] review comments: align to vector size ncluding the norm in cosine dist unit test cover small dim in cosine chooser --- src/VecSim/spaces/IP/IP.cpp | 11 ++++++----- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 14 ++++++++------ src/VecSim/spaces/IP_space.cpp | 3 ++- src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 2 +- src/VecSim/spaces/spaces.cpp | 1 + tests/unit/test_spaces.cpp | 3 +++ 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index c3856abda..0884df3bb 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -73,9 +73,7 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, int res = 0; for (size_t i = 0; i < dimension; i++) { - int16_t a = pVect1[i]; - int16_t b = pVect2[i]; - res += a * b; + res += pVect1[i] * pVect2[i]; } return res; } @@ -85,7 +83,10 @@ float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensi } float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { - float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); - float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); + // We expect the vectors' norm to be stored at the end of the vector. 
+    float norm_v1 =
+        *reinterpret_cast(static_cast(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast(static_cast(pVect2v) + dimension);
     return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2);
 }
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
index ffda357ff..7716d8ad7 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
@@ -59,17 +59,19 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v,
     return _mm512_reduce_add_epi32(sum);
 }
 
-template // 0..32
+template // 0..64
 float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
                                                  size_t dimension) {
     return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension);
 }
 
-template // 0..32
+template // 0..64
 float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
                                            size_t dimension) {
-    float norm_v1 = *(float *)((int8_t *)pVect1v + dimension);
-    float norm_v2 = *(float *)((int8_t *)pVect2v + dimension);
-    return 1.0f -
-           float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2);
+    float ip = INT8_InnerProductImp(pVect1v, pVect2v, dimension);
+    float norm_v1 =
+        *reinterpret_cast(static_cast(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast(static_cast(pVect2v) + dimension);
+    return 1.0f - ip / (norm_v1 * norm_v2);
 }
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index b168b2d7d..27e915e71 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -241,7 +241,8 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
                                  : *static_cast(arch_opt);
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
-        if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
+        // Align to vector memory size, including the norm at the end of the vector.
+        if (dim % 32 + 4 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 32 * sizeof(int8_t); // align to 256 bits.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
index 3f4ba33a6..2c8b846af 100644
--- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
@@ -36,7 +36,7 @@ float INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect
     // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block,
     // so mask loading is guaranteed to be safe
     if constexpr (residual % 32) {
-        __mmask32 mask = (1LU << (residual % 32)) - 1;
+        constexpr __mmask32 mask = (1LU << (residual % 32)) - 1;
         __m256i temp_a = _mm256_loadu_epi8(pVect1);
         __m512i va = _mm512_cvtepi8_epi16(temp_a);
         pVect1 += residual % 32;
diff --git a/src/VecSim/spaces/spaces.cpp b/src/VecSim/spaces/spaces.cpp
index b512c9750..4385b5e94 100644
--- a/src/VecSim/spaces/spaces.cpp
+++ b/src/VecSim/spaces/spaces.cpp
@@ -69,6 +69,7 @@ dist_func_t GetDistFunc(VecSimMetric metric, size_t dim,
     }
     throw std::invalid_argument("Invalid metric");
 }
+
 template <>
 dist_func_t GetDistFunc(VecSimMetric metric, size_t dim,
                         unsigned char *alignment) {
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index f76069c46..552e7464a 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -299,6 +299,7 @@ TEST_F(SpacesTest, smallDimChooser) {
         ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
         ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
     for (size_t dim = 8; dim < 16; dim++) {
         ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr);
@@ -309,6 +310,7 @@ TEST_F(SpacesTest, smallDimChooser) {
        ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
         ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
     for (size_t dim = 16; dim < 32; dim++) {
         ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian);
@@ -317,6 +319,7 @@ TEST_F(SpacesTest, smallDimChooser) {
         ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
         ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
 }

From 880dd332e5284aa96a46c6a4e25e341f585e9104 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 13:02:16 +0000
Subject: [PATCH 20/26] use sizeof(float) instead of 4

---
 src/VecSim/spaces/IP_space.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 27e915e71..d97d53d96 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -242,7 +242,7 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         // Align to vector memory size, including the norm at the end of the vector.
-        if (dim % 32 + 4 == 0) // no point in aligning if we have an offsetting residual
+        if (dim % 32 + sizeof(float) == 0) // no point in aligning if we have an offsetting residual
             *alignment = 32 * sizeof(int8_t); // align to 256 bits.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }

From b79777fe4daa5e84812d3bd6adae2afcd79dd48d Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 13:10:20 +0000
Subject: [PATCH 21/26] remove int conversion in test_utils::compute_norm

---
 tests/utils/tests_utils.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index 01461a78d..31dc3d9ef 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -23,8 +23,7 @@ static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) {
 float compute_norm(const int8_t *vec, size_t dim) {
     int norm = 0;
     for (size_t i = 0; i < dim; i++) {
-        int val = static_cast(vec[i]);
-        norm += val * val;
+        norm += vec[i] * vec[i];
     }
     return sqrt(norm);
 }

From ab159bce10987ec372a3be77ed9aff234f70e565 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 13:53:30 +0000
Subject: [PATCH 22/26] REVERT!!! malicious test to see if we get to the code

---
 tests/unit/test_spaces.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 552e7464a..3b3360765 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1018,6 +1018,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
     }
+    ASSERT_EQ(1, 0);
 #endif
     unsigned char alignment = 0;
     arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization);

From 397ac3fb17d714caf2b507bf44963b4c5aadf643 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 14:05:24 +0000
Subject: [PATCH 23/26] assert dummy

---
 tests/unit/test_spaces.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 3b3360765..b59e86532 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1017,8 +1017,9 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
+
+        ASSERT_EQ(alignment, 2141);
     }
-    ASSERT_EQ(1, 0);
 #endif
     unsigned char alignment = 0;
     arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization);

From f9b7b87e851c1b99c3b8beb94bfbbe4f2c9625f7 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 14:24:07 +0000
Subject: [PATCH 24/26] fix alignment test

---
 src/VecSim/spaces/IP_space.cpp | 3 ++-
 tests/unit/test_spaces.cpp | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index d97d53d96..f4d1409e4 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -242,7 +242,8 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         // Align to vector memory size, including the norm at the end of the vector.
-        if (dim % 32 + sizeof(float) == 0) // no point in aligning if we have an offsetting residual
+        if ((dim + sizeof(float)) % 32 ==
+            0) // no point in aligning if we have an offsetting residual
             *alignment = 32 * sizeof(int8_t); // align to 256 bits.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index b59e86532..50dc270c2 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1000,7 +1000,8 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
     *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim);
     auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
         size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8;
-        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0;
+        return ((dim + sizeof(float)) % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t)
+                                                              : 0;
     };
     dist_func_t arch_opt_func;
     float baseline = INT8_Cosine(v1, v2, dim);
@@ -1018,7 +1019,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
 
-        ASSERT_EQ(alignment, 2141);
+        // ASSERT_EQ(alignment, 2141)<< "alignemt for dim = " << dim;
     }
 #endif
     unsigned char alignment = 0;

From c4439f3b42fbd162d8bc26250cfef7c70727cfd0 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 14:25:25 +0000
Subject: [PATCH 25/26] remove assert

---
 tests/unit/test_spaces.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 50dc270c2..91bc02943 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1018,8 +1018,6 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
-
-        // ASSERT_EQ(alignment, 2141)<< "alignemt for dim = " << dim;
     }
 #endif
     unsigned char alignment = 0;

From e526d02e87b91048abc13af9ddbf9d1ba0b4fbac Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Tue, 17 Dec 2024 05:42:00 +0000
Subject: [PATCH 26/26] remove cosine alignment

---
 src/VecSim/spaces/IP_space.cpp | 9 +++++----
 tests/unit/test_spaces.cpp | 8 ++------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index f4d1409e4..e7129b2e8 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -241,10 +241,11 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
                                  : *static_cast(arch_opt);
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
-        // Align to vector memory size, including the norm at the end of the vector.
-        if ((dim + sizeof(float)) % 32 ==
-            0) // no point in aligning if we have an offsetting residual
-            *alignment = 32 * sizeof(int8_t); // align to 256 bits.
+        // For int8 vectors with cosine distance, the extra float for the norm shifts alignment to
+        // `(dim + sizeof(float)) % 32`.
+        // Vectors satisfying this have a residual, causing offset loads during calculation.
+        // To avoid complexity, we skip alignment here, assuming the performance impact is
+        // negligible.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 91bc02943..9931d318a 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -998,11 +998,6 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
     // write the norm at the end of the vector
     *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim);
     *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim);
-    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
-        size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8;
-        return ((dim + sizeof(float)) % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t)
-                                                              : 0;
-    };
     dist_func_t arch_opt_func;
     float baseline = INT8_Cosine(v1, v2, dim);
@@ -1014,7 +1009,8 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim;
-        ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim;
+        // We don't align int8 vectors with cosine distance
+        ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
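A minimal, self-contained sketch (not part of the patches above) of the memory layout the int8 cosine path assumes: dim int8 elements followed by the vector's L2 norm stored as a float, so a stored vector occupies dim + sizeof(float) bytes. The buffer name, the dim value of 28, and the main() driver are illustrative assumptions rather than code from the repository; the norm accumulation mirrors the tests' compute_norm helper, and the final check restates the alignment arithmetic that motivated dropping alignment for int8 cosine.

// Illustrative sketch only: shows the "dim int8 values + trailing float norm" layout
// that INT8_Cosine reads, and when (dim + sizeof(float)) fills whole 32-byte blocks.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const std::size_t dim = 28; // assumption: 28 + sizeof(float) == 32, exactly one 32-byte block
    std::vector<int8_t> blob(dim + sizeof(float));
    for (std::size_t i = 0; i < dim; i++) {
        blob[i] = static_cast<int8_t>(static_cast<int>(i) - 14); // arbitrary demo values
    }

    // Accumulate the squared norm in int arithmetic (similar to the tests' compute_norm
    // helper), then append the norm right after the int8 payload, where the cosine
    // distance functions expect to find it.
    int sum_sq = 0;
    for (std::size_t i = 0; i < dim; i++) {
        sum_sq += static_cast<int>(blob[i]) * static_cast<int>(blob[i]);
    }
    float norm = std::sqrt(static_cast<float>(sum_sq));
    std::memcpy(blob.data() + dim, &norm, sizeof(float));

    // A 256-bit aligned load only lines up when the stored size is a multiple of 32 bytes,
    // and such dimensions still leave a residual in the SIMD kernel, which is why the last
    // patch skips alignment for int8 cosine altogether.
    std::printf("stored bytes = %zu, whole 32-byte blocks: %s\n", dim + sizeof(float),
                ((dim + sizeof(float)) % 32 == 0) ? "yes" : "no");
    return 0;
}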