From 2f34c1532b713d058fd242a204166675265393d5 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 3 Dec 2024 23:56:01 +0200 Subject: [PATCH 01/26] naive implementation of L2 --- src/VecSim/spaces/L2/L2.cpp | 24 +++++++++++++++++++++++ src/VecSim/types/int8.h | 39 +++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 src/VecSim/types/int8.h diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 5fba0555e..379e21c52 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -7,6 +7,7 @@ #include "L2.h" #include "VecSim/types/bfloat16.h" #include "VecSim/types/float16.h" +#include "VecSim/types/int8.h" #include using bfloat16 = vecsim_types::bfloat16; @@ -70,3 +71,26 @@ float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension) { } return res; } + +template +float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + int16_t a = vecsim_types::int8_to_int16(pVect1[i]); + int16_t b = vecsim_types::int8_to_int16(pVect2[i]); + int16_t diff = a - b; + res += diff * diff; + } + return float(res); +} + +float INT8_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { + return INT8_L2Sqr(pVect1v, pVect2v, dimension); +} + +float INT8_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { + return INT8_L2Sqr(pVect1v, pVect2v, dimension); +} diff --git a/src/VecSim/types/int8.h b/src/VecSim/types/int8.h new file mode 100644 index 000000000..73d6b188a --- /dev/null +++ b/src/VecSim/types/int8.h @@ -0,0 +1,39 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#pragma once + +#include +#include +#include + +namespace vecsim_types { +struct bfloat16 { + uint16_t val; + bfloat16() = default; + explicit constexpr bfloat16(uint16_t val) : val(val) {} + operator uint16_t() const { return val; } +}; + +static inline bfloat16 float_to_bf16(const float ff) { + uint32_t *p_f32 = (uint32_t *)&ff; + uint32_t f32 = *p_f32; + uint32_t lsb = (f32 >> 16) & 1; + uint32_t round = lsb + 0x7FFF; + f32 += round; + return bfloat16(f32 >> 16); +} + +template +inline float bfloat16_to_float32(bfloat16 val) { + size_t constexpr bytes_offset = is_little ? 
1 : 0; + float result = 0; + bfloat16 *p_result = (bfloat16 *)&result + bytes_offset; + *p_result = val; + return result; +} + +} // namespace vecsim_types From c641d2364975fe27413ec7cac5cb6db7e90488d8 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Wed, 4 Dec 2024 00:03:37 +0200 Subject: [PATCH 02/26] update --- src/VecSim/types/int8.h | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/VecSim/types/int8.h b/src/VecSim/types/int8.h index 73d6b188a..158f1622b 100644 --- a/src/VecSim/types/int8.h +++ b/src/VecSim/types/int8.h @@ -11,27 +11,12 @@ #include namespace vecsim_types { -struct bfloat16 { - uint16_t val; - bfloat16() = default; - explicit constexpr bfloat16(uint16_t val) : val(val) {} - operator uint16_t() const { return val; } -}; - -static inline bfloat16 float_to_bf16(const float ff) { - uint32_t *p_f32 = (uint32_t *)&ff; - uint32_t f32 = *p_f32; - uint32_t lsb = (f32 >> 16) & 1; - uint32_t round = lsb + 0x7FFF; - f32 += round; - return bfloat16(f32 >> 16); -} template -inline float bfloat16_to_float32(bfloat16 val) { +inline int16_t int8_to_int16(int8_t val) { size_t constexpr bytes_offset = is_little ? 1 : 0; - float result = 0; - bfloat16 *p_result = (bfloat16 *)&result + bytes_offset; + int result = 0; + int16_t *p_result = (int16_t *)&result + bytes_offset; *p_result = val; return result; } From 1c5eb9066083a990ddac89b7bcb3ce961d155159 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Fri, 6 Dec 2024 16:38:38 +0000 Subject: [PATCH 03/26] implment naive disatnce for int8 add cosine to spaces fix typos in calculator --- src/VecSim/spaces/CMakeLists.txt | 2 + src/VecSim/spaces/Cosine/Cosine.cpp | 23 +++++++++ src/VecSim/spaces/Cosine/Cosine.h | 11 ++++ src/VecSim/spaces/Cosine_space.cpp | 27 ++++++++++ src/VecSim/spaces/Cosine_space.h | 13 +++++ src/VecSim/spaces/IP/IP.cpp | 13 +++++ src/VecSim/spaces/IP/IP.h | 2 + src/VecSim/spaces/IP_space.cpp | 13 +++++ src/VecSim/spaces/IP_space.h | 2 + src/VecSim/spaces/L2/L2.cpp | 14 +----- src/VecSim/spaces/L2/L2.h | 2 + src/VecSim/spaces/L2_space.cpp | 14 ++++++ src/VecSim/spaces/L2_space.h | 2 + src/VecSim/spaces/computer/calculator.h | 4 +- .../spaces/functions/implementation_chooser.h | 4 +- src/VecSim/types/int8.h | 24 --------- tests/unit/test_spaces.cpp | 50 +++++++++++++++++++ 17 files changed, 180 insertions(+), 40 deletions(-) create mode 100644 src/VecSim/spaces/Cosine/Cosine.cpp create mode 100644 src/VecSim/spaces/Cosine/Cosine.h create mode 100644 src/VecSim/spaces/Cosine_space.cpp create mode 100644 src/VecSim/spaces/Cosine_space.h delete mode 100644 src/VecSim/types/int8.h diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index 9cc0baaaf..ad22e8187 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -3,6 +3,7 @@ project(VectorSimilaritySpaces_no_optimization) add_library(VectorSimilaritySpaces_no_optimization L2/L2.cpp IP/IP.cpp + Cosine/Cosine.cpp ) include(${root}/cmake/cpu_features.cmake) @@ -79,6 +80,7 @@ endif() add_library(VectorSimilaritySpaces L2_space.cpp IP_space.cpp + Cosine_space.cpp spaces.cpp ${OPTIMIZATIONS} computer/preprocessor_container.cpp diff --git a/src/VecSim/spaces/Cosine/Cosine.cpp b/src/VecSim/spaces/Cosine/Cosine.cpp new file mode 100644 index 000000000..1cbc9a191 --- /dev/null +++ b/src/VecSim/spaces/Cosine/Cosine.cpp @@ -0,0 +1,23 @@ +/* + *Copyright Redis Ltd. 
2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "Cosine.h" + +float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + int16_t a = pVect1[i]; + int16_t b = pVect2[i]; + res += a * b; + } + + float norm_v1 = *(float *)pVect1v; + float norm_v2 = *(float *)pVect2v; + return 1.0f - float(res) / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/Cosine/Cosine.h b/src/VecSim/spaces/Cosine/Cosine.h new file mode 100644 index 000000000..c42f6c14f --- /dev/null +++ b/src/VecSim/spaces/Cosine/Cosine.h @@ -0,0 +1,11 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#pragma once + +#include + +float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/Cosine_space.cpp b/src/VecSim/spaces/Cosine_space.cpp new file mode 100644 index 000000000..7cace4c32 --- /dev/null +++ b/src/VecSim/spaces/Cosine_space.cpp @@ -0,0 +1,27 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/Cosine_space.h" +#include "VecSim/spaces/Cosine/Cosine.h" + +namespace spaces { +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_Cosine; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. + if (dim < 32) { + return ret_dist_func; + } + return ret_dist_func; +} + +} // namespace spaces diff --git a/src/VecSim/spaces/Cosine_space.h b/src/VecSim/spaces/Cosine_space.h new file mode 100644 index 000000000..e139a5521 --- /dev/null +++ b/src/VecSim/spaces/Cosine_space.h @@ -0,0 +1,13 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#pragma once +#include "VecSim/spaces/spaces.h" + +namespace spaces { +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); +} // namespace spaces diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 98ad07676..1562e5b1a 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -66,3 +66,16 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension } return 1.0f - res; } + +float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + int16_t a = pVect1[i]; + int16_t b = pVect2[i]; + res += a * b; + } + return 1.0f - float(res); +} diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 50fecef33..64e11b52f 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -16,3 +16,5 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension float BF16_InnerProduct_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); + +float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index e6da26947..699919dc2 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -196,4 +196,17 @@ dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con return ret_dist_func; } +dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_InnerProduct; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. 
+ if (dim < 32) { + return ret_dist_func; + } + return ret_dist_func; +} } // namespace spaces diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index a3ab0f4f6..87407c1a3 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -16,4 +16,6 @@ dist_func_t IP_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 379e21c52..ef310418b 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -7,7 +7,6 @@ #include "L2.h" #include "VecSim/types/bfloat16.h" #include "VecSim/types/float16.h" -#include "VecSim/types/int8.h" #include using bfloat16 = vecsim_types::bfloat16; @@ -72,25 +71,16 @@ float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension) { return res; } -template float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; int res = 0; for (size_t i = 0; i < dimension; i++) { - int16_t a = vecsim_types::int8_to_int16(pVect1[i]); - int16_t b = vecsim_types::int8_to_int16(pVect2[i]); + int16_t a = pVect1[i]; + int16_t b = pVect2[i]; int16_t diff = a - b; res += diff * diff; } return float(res); } - -float INT8_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { - return INT8_L2Sqr(pVect1v, pVect2v, dimension); -} - -float INT8_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension) { - return INT8_L2Sqr(pVect1v, pVect2v, dimension); -} diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h index c367f2ee1..65649d4eb 100644 --- a/src/VecSim/spaces/L2/L2.h +++ b/src/VecSim/spaces/L2/L2.h @@ -16,3 +16,5 @@ float BF16_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t d float BF16_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension); + +float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 1c2b2b59f..3f9d83f03 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -189,4 +189,18 @@ dist_func_t L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con return ret_dist_func; } +dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_L2Sqr; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. 
+ if (dim < 32) { + return ret_dist_func; + } + return ret_dist_func; +} + } // namespace spaces diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h index 4a2ea801a..48e50a8c2 100644 --- a/src/VecSim/spaces/L2_space.h +++ b/src/VecSim/spaces/L2_space.h @@ -16,4 +16,6 @@ dist_func_t L2_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/computer/calculator.h b/src/VecSim/spaces/computer/calculator.h index 36e76deed..64e0d8dae 100644 --- a/src/VecSim/spaces/computer/calculator.h +++ b/src/VecSim/spaces/computer/calculator.h @@ -26,10 +26,10 @@ class IndexCalculatorInterface : public VecsimBaseObject { /** * This object purpose is to calculate the distance between two vectors. * It extends the IndexCalculatorInterface class' type to hold the distance function. - * Every specific implmentation of the distance claculater should hold by refrence or by value the + * Every specific implementation of the distance calculator should hold by reference or by value the * parameters required for the calculation. The distance calculation API of all DistanceCalculator * classes is: calc_dist(v1,v2,dim). Internally it calls the distance function according the - * template signature, allowing fexability in the distance function arguments. + * template signature, allowing flexibility in the distance function arguments. */ template class DistanceCalculatorInterface : public IndexCalculatorInterface { diff --git a/src/VecSim/spaces/functions/implementation_chooser.h b/src/VecSim/spaces/functions/implementation_chooser.h index 2903b8cc4..6bb61815e 100644 --- a/src/VecSim/spaces/functions/implementation_chooser.h +++ b/src/VecSim/spaces/functions/implementation_chooser.h @@ -40,8 +40,8 @@ // out: The output variable that will be set to the chosen implementation. // dim: The dimension. // chunk: The chunk size. Can be 32, 16 or 8. 32 for 16-bit elements, 16 for 32-bit elements, 8 -// for 64-bit elements. func: The templated function that we want to choose the implementation -// for. +// for 64-bit elements. +// func: The templated function that we want to choose the implementation for. #define CHOOSE_IMPLEMENTATION(out, dim, chunk, func) \ do { \ decltype(out) __ret_dist_func; \ diff --git a/src/VecSim/types/int8.h b/src/VecSim/types/int8.h deleted file mode 100644 index 158f1622b..000000000 --- a/src/VecSim/types/int8.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). - */ - -#pragma once - -#include -#include -#include - -namespace vecsim_types { - -template -inline int16_t int8_to_int16(int8_t val) { - size_t constexpr bytes_offset = is_little ? 
1 : 0; - int result = 0; - int16_t *p_result = (int16_t *)&result + bytes_offset; - *p_result = val; - return result; -} - -} // namespace vecsim_types diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 7cf7de92b..2a58d6072 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -12,10 +12,12 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/IP/IP.h" #include "VecSim/spaces/L2/L2.h" +#include "VecSim/spaces/Cosine/Cosine.h" #include "VecSim/utils/vec_utils.h" #include "VecSim/types/bfloat16.h" #include "VecSim/spaces/IP_space.h" #include "VecSim/spaces/L2_space.h" +#include "VecSim/spaces/Cosine_space.h" #include "VecSim/types/float16.h" #include "VecSim/spaces/functions/AVX512F.h" #include "VecSim/spaces/functions/AVX.h" @@ -102,6 +104,21 @@ TEST_F(SpacesTest, fp16_l2_no_optimization_func_test) { ASSERT_EQ(dist, FP32_L2Sqr((const void *)sanity_a, (const void *)sanity_b, dim)); } +TEST_F(SpacesTest, int8_l2_no_optimization_func_test) { + size_t dim = 5; + + int8_t a[dim], b[dim]; + for (size_t i = 0; i < dim; i++) { + a[i] = (i + 1); + b[i] = (i + 2); + } + + float dist = INT8_L2Sqr((const void *)a, (const void *)b, dim); + ASSERT_EQ(dist, 5.0); +} + +/* ======================== IP NO OPT ======================== */ + TEST_F(SpacesTest, float_ip_no_optimization_func_test) { size_t dim = 5; @@ -211,6 +228,34 @@ TEST_F(SpacesTest, fp16_ip_no_optimization_func_test) { ASSERT_EQ(dist, FP32_InnerProduct((const void *)sanity_a, (const void *)sanity_b, dim)); } +TEST_F(SpacesTest, int8_ip_no_optimization_func_test) { + size_t dim = 4; + int8_t a[] = {1, 0, 0, 0}; + int8_t b[] = {1, 0, 0, 0}; + + float dist = INT8_InnerProduct((const void *)a, (const void *)b, dim); + ASSERT_EQ(dist, 0.0); +} + +/* ======================== Cosine NO OPT ======================== */ + +TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) { + size_t dim = 4; + // create normalized vector with extra space for the norm + std::vector vec1(dim + sizeof(float), 0); + std::vector vec2(dim + sizeof(float), 0); + + vec1[0] = 1; // {1, 0, 0, 0} + vec2[1] = 1; // {1, 0, 0, 0} + + // write the norm at the end of the vector + *(float *)(vec1.data() + dim) = 1.0; + *(float *)(vec2.data() + dim) = 1.0; + + float dist = INT8_InnerProduct((const void *)vec1.data(), (const void *)vec2.data(), dim); + ASSERT_EQ(dist, 1.0); +} + TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) { EXPECT_THROW( (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), @@ -231,6 +276,11 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricFP16) { (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), std::invalid_argument); } +// TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) { +// EXPECT_THROW( +// (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, +// nullptr)), std::invalid_argument); +// } using namespace spaces; From fa8e9ff6856072b6d72fa4b4c0c84473b0268017 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 05:58:56 +0000 Subject: [PATCH 04/26] imp choose L2 int8 with 256bit loop add spaces unit tests for int8 L2 add compilation flags introduce tests/utils for general utils --- cmake/x86_64InstructionFlags.cmake | 5 ++ src/VecSim/spaces/CMakeLists.txt | 6 ++ .../spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 58 +++++++++++++++++++ src/VecSim/spaces/L2_space.cpp | 13 +++++ .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 23 ++++++++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 15 +++++ tests/unit/test_spaces.cpp | 49 
++++++++++++++++ tests/utils/tests_utils.h | 24 ++++++++ 8 files changed, 193 insertions(+) create mode 100644 src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h create mode 100644 src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp create mode 100644 src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h create mode 100644 tests/utils/tests_utils.h diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake index 1fedda7fe..1ff8f48f2 100644 --- a/cmake/x86_64InstructionFlags.cmake +++ b/cmake/x86_64InstructionFlags.cmake @@ -13,6 +13,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2) CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16) CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F) + CHECK_CXX_COMPILER_FLAG(-mavx512vnni CXX_AVX512VNNI) CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2) CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX) CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C) @@ -48,6 +49,10 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") add_compile_definitions(OPT_AVX512_BW_VBMI2) endif() + if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI) + add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI) + endif() + if(CXX_F16C AND CXX_FMA AND CXX_AVX) add_compile_definitions(OPT_F16C) endif() diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index ad22e8187..fc23adc18 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -45,6 +45,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") list(APPEND OPTIMIZATIONS functions/AVX512F.cpp) endif() + if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI) + message("Building with AVX512F, AVX512BW, AVX512VL and AVX512VNNI") + set_source_files_properties(functions/AVX512F_BW_VL_VNNI.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512vl -mavx512vnni") + list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp) + endif() + if(CXX_AVX2) message("Building with AVX2") set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2) diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h new file mode 100644 index 000000000..9130d6414 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -0,0 +1,58 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" + +static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { + __m256i temp_a = _mm256_loadu_epi8(pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += 32; + + __m256i temp_b = _mm256_loadu_epi8(pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += 32; + + __m512i diff = _mm512_sub_epi16(va, vb); + // _mm512_dpwssd_epi32(src, a, b) + // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding + // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results + // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. 
+ sum = _mm512_dpwssd_epi32(sum, diff, diff); +} + +template // 0..32 +float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + const int8_t *pEnd1 = pVect1 + dimension; + + __m512i sum = _mm512_setzero_epi32(); + + // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, + // so mask loading is guaranteed to be safe + if constexpr (residual) { + __mmask32 mask = (1LU << residual) - 1; + __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += residual; + + __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += residual; + + __m512i diff = _mm512_sub_epi16(va, vb); + sum = _mm512_dpwssd_epi32(sum, diff, diff); + } + + // We dealt with the residual part. We are left with some multiple of 32-int_8. + do { + L2SqrStep(pVect1, pVect2, sum); + } while (pVect1 < pEnd1); + + return _mm512_reduce_add_epi32(sum); +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 3f9d83f03..3ae927224 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -15,6 +15,7 @@ #include "VecSim/spaces/functions/SSE.h" #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" @@ -200,6 +201,18 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con if (dim < 32) { return ret_dist_func; } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. + return Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim); + } +#endif +#endif // __x86_64__ return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp new file mode 100644 index 000000000..d906a5775 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -0,0 +1,23 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "AVX512BW_VBMI2.h" + +#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h new file mode 100644 index 000000000..c1f32ff10 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -0,0 +1,15 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim); + +} // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 2a58d6072..c9e8d68b8 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -25,9 +25,11 @@ #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/F16C.h" +#include "../utils/tests_utils.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; @@ -256,6 +258,8 @@ TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) { ASSERT_EQ(dist, 1.0); } +/* ======================== Test Getters ======================== */ + TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) { EXPECT_THROW( (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), @@ -291,6 +295,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(L2_FP64_GetDistFunc(dim), FP64_L2Sqr); ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_FP64_GetDistFunc(dim), FP64_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); @@ -300,6 +305,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr); ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); @@ -307,11 +313,14 @@ TEST_F(SpacesTest, smallDimChooser) { for (size_t dim = 16; dim < 32; dim++) { ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); } } +/* ======================== Test SIMD Functions ======================== */ + // In this following tests we assume that compiler supports all X86 optimizations, so if we have // some hardware flag enabled, we check that the corresponding optimization function was chosen. #ifdef CPU_FEATURES_ARCH_X86_64 @@ -899,4 +908,44 @@ INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced, #endif +class INT8SpacesOptimizationTest : public testing::TestWithParam {}; + +TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + auto v1 = test_utils::create_int8_vec(dim); + auto v2 = test_utils::create_int8_vec(dim); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? 
elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_L2Sqr(v1.data(), v2.data(), dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, + testing::Range(32UL, 32 * 2UL + 1)); + #endif // CPU_FEATURES_ARCH_X86_64 diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h new file mode 100644 index 000000000..568fc1a49 --- /dev/null +++ b/tests/utils/tests_utils.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + +namespace test_utils { + +std::vector create_int8_vec(size_t dim) { + + std::mt19937 gen(1234); // Mersenne Twister engine initialized with the fixed seed + + // uniform_int_distribution doesn't support int8, + // Define a distribution range for int8_t + std::uniform_int_distribution dis(-128, 127); + + std::vector vec(dim); + for (auto &num : vec) { + num = static_cast(dis(gen)); + } + + return vec; +} + +} // namespace test_utils From a7a556f43430ebba03121c7cf753464472150ed3 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 11:23:30 +0200 Subject: [PATCH 05/26] imp space bm for int8 change INITIALIZE_BENCHMARKS_SET to INITIALIZE_BENCHMARKS_SET_L2_IP introduce INITIALIZE_BENCHMARKS_SET_COSINE fix typos in Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI name --- src/VecSim/spaces/L2_space.cpp | 2 +- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 2 +- .../spaces/functions/AVX512F_BW_VL_VNNI.h | 2 +- tests/benchmark/spaces_benchmarks/bm_spaces.h | 6 ++ .../spaces_benchmarks/bm_spaces_bf16.cpp | 8 +-- .../spaces_benchmarks/bm_spaces_fp16.cpp | 8 +-- .../spaces_benchmarks/bm_spaces_fp32.cpp | 6 +- .../spaces_benchmarks/bm_spaces_fp64.cpp | 6 +- .../spaces_benchmarks/bm_spaces_int8.cpp | 56 +++++++++++++++++++ tests/unit/test_spaces.cpp | 2 +- 10 files changed, 80 insertions(+), 18 deletions(-) create mode 100644 tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 3ae927224..c0bec428f 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -209,7 +209,7 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual *alignment = 32 * sizeof(int8_t); // align to 256 bits. 
- return Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim); + return Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim); } #endif #endif // __x86_64__ diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index d906a5775..d82d4141d 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -12,7 +12,7 @@ namespace spaces { #include "implementation_chooser.h" -dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim) { +dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index c1f32ff10..818b9529f 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -10,6 +10,6 @@ namespace spaces { -dist_func_t Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(size_t dim); +dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // namespace spaces diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 3b55a9032..8b42ac030 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -123,6 +123,12 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); +#define INITIALIZE_BENCHMARKS_SET_COSINE(bm_class, type_prefix, arch, dim_opt, arch_supported) \ + INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ + INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ + INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ + INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); + #define INITIALIZE_BENCHMARKS_SET(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp index 8022c712a..27fe82a3d 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp @@ -26,20 +26,20 @@ INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_BF16, BF16, AVX512BF16_VL, 32, // AVX512 functions #ifdef OPT_AVX512_BW_VBMI2 bool avx512_bw_vbmi2_supported = opt.avx512bw && opt.avx512vbmi2; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32, - avx512_bw_vbmi2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32, + avx512_bw_vbmi2_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX2 bool avx2_supported = opt.avx2; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported); #endif // AVX // SSE functions #ifdef OPT_SSE3 bool sse3_supported = opt.sse3; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported); 
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp index c9bc42b0b..9457bc77d 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp @@ -22,8 +22,8 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; class BM_VecSimSpaces_FP16_adv : public BM_VecSimSpaces<_Float16> {}; bool avx512fp16_vl_supported = opt.avx512_fp16 && opt.avx512vl; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32, - avx512fp16_vl_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32, + avx512fp16_vl_supported); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, InnerProduct, 32); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32); @@ -32,12 +32,12 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32); // OPT_AVX512F functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported); #endif // OPT_AVX512F // AVX functions #ifdef OPT_F16C bool avx512_bw_f16c_supported = opt.f16c && opt.fma3 && opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported); #endif // OPT_F16C #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp index 289e42405..106b2abc8 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp @@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512 functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX bool avx_supported = opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported); #endif // AVX // SSE functions #ifdef OPT_SSE bool sse_supported = opt.sse; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp index 19157f03f..01052cebc 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp @@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512 functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX bool avx_supported = opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, 
AVX, 8, avx_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX, 8, avx_supported); #endif // AVX // SSE functions #ifdef OPT_SSE bool sse_supported = opt.sse; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp new file mode 100644 index 000000000..def14a8bd --- /dev/null +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -0,0 +1,56 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ +#include +#include +#include "../../utils/tests_utils.h" +#include "bm_spaces.h" + +class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { +protected: + std::mt19937 rng; + size_t dim; + int8_t *v1, *v2; + +public: + BM_VecSimSpaces_Integers_INT8() { rng.seed(47); } + ~BM_VecSimSpaces_Integers_INT8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + v1 = new int8_t[dim]; + v2 = new int8_t[dim]; + + // random for int8_t and uint8_t is not provided by the standard library + memcpy(v1, test_utils::create_int8_vec(dim).data(), dim); + memcpy(v2, test_utils::create_int8_vec(dim).data(), dim); + } + void TearDown(const ::benchmark::State &state) { + delete v1; + delete v2; + } +}; + + +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; + +// AVX512_BF16 functions +#ifdef OPT_AVX512_F_BW_VL_VNNI +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, + avx512_f_bw_vl_vnni_supported); +// INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, +// avx512_f_bw_vl_vnni_supported); +// INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, +// avx512_f_bw_vl_vnni_supported) +#endif // AVX512_BF16 + + +#endif // x86_64 + +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32); +BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index c9e8d68b8..2968b1c9f 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -928,7 +928,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { optimization.avx512vnni) { unsigned char alignment = 0; arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_VW_CL_VNNI(dim)) + ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; From 43064e865041ef58ffb3c272d5c0998c8ea9559c Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 09:26:01 +0000 Subject: [PATCH 06/26] fix INITIALIZE_BENCHMARKS_SET_L2_IP and add include to F_BW_VL_VNNI --- tests/benchmark/spaces_benchmarks/bm_spaces.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 8b42ac030..86cb45553 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -19,6 +19,7 @@ #include "VecSim/spaces/functions/AVX.h" #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/SSE3.h" @@ -129,6 +130,6 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); -#define INITIALIZE_BENCHMARKS_SET(bm_class, type_prefix, arch, dim_opt, arch_supported) \ +#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) From fb9f1ccf7064702b4e0b08b6633336223bbe4524 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 09:47:20 +0000 Subject: [PATCH 07/26] rename unit/test_utuils to unit_test_utils --- tests/benchmark/CMakeLists.txt | 2 +- tests/benchmark/benchmarks.sh | 10 ++++++++++ .../benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 3 ++- tests/unit/CMakeLists.txt | 14 +++++++------- tests/unit/test_allocator.cpp | 4 ++-- tests/unit/test_bf16.cpp | 2 +- tests/unit/test_bruteforce.cpp | 4 ++-- tests/unit/test_bruteforce_multi.cpp | 4 ++-- tests/unit/test_common.cpp | 4 ++-- tests/unit/test_fp16.cpp | 2 +- tests/unit/test_hnsw.cpp | 4 ++-- tests/unit/test_hnsw_multi.cpp | 4 ++-- tests/unit/test_hnsw_parallel.cpp | 4 ++-- tests/unit/test_hnsw_tiered.cpp | 2 +- tests/unit/test_spaces.cpp | 2 +- tests/unit/{test_utils.cpp => unit_test_utils.cpp} | 2 +- tests/unit/{test_utils.h => unit_test_utils.h} | 0 17 files changed, 39 insertions(+), 28 deletions(-) rename tests/unit/{test_utils.cpp => unit_test_utils.cpp} (99%) rename tests/unit/{test_utils.h => unit_test_utils.h} (100%) diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index 4d25a5499..2fa066e82 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -31,7 +31,7 @@ endforeach() include(${root}/cmake/x86_64InstructionFlags.cmake) -set(DATA_TYPE fp32 fp64 bf16 fp16) +set(DATA_TYPE fp32 fp64 bf16 fp16 int8) foreach(data_type IN LISTS DATA_TYPE) add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp) target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 11872e869..2e6664424 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -66,4 +66,14 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_fp16 echo spaces_fp64 echo spaces_bf16 +elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then + echo spaces_fp32 +elif [ "$BM_TYPE" = "bm-spaces-fp64" ] ; then + echo spaces_fp64 +elif [ "$BM_TYPE" = "bm-spaces-bf16" ] ; then + echo spaces_bf16 +elif [ "$BM_TYPE" = "bm-spaces-fp16" ] ; then + echo spaces_fp16 +elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then + echo spaces_int8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp 
b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index def14a8bd..8cb323043 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -5,7 +5,8 @@ */ #include #include -#include "../../utils/tests_utils.h" +#include +#include "utils/tests_utils.h" #include "bm_spaces.h" class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b16bddac6..caa3fc522 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -30,15 +30,15 @@ endif() include(${root}/cmake/x86_64InstructionFlags.cmake) -add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp test_utils.cpp) -add_executable(test_hnsw_parallel test_hnsw_parallel.cpp test_utils.cpp) -add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp test_utils.cpp) -add_executable(test_allocator test_allocator.cpp test_utils.cpp) +add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp) +add_executable(test_hnsw_parallel test_hnsw_parallel.cpp unit_test_utils.cpp) +add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp unit_test_utils.cpp) +add_executable(test_allocator test_allocator.cpp unit_test_utils.cpp) add_executable(test_spaces test_spaces.cpp) add_executable(test_types test_types.cpp) -add_executable(test_common ../utils/mock_thread_pool.cpp test_utils.cpp test_common.cpp) -add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp test_utils.cpp) -add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp test_utils.cpp) +add_executable(test_common ../utils/mock_thread_pool.cpp unit_test_utils.cpp test_common.cpp) +add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp unit_test_utils.cpp) +add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp unit_test_utils.cpp) target_link_libraries(test_hnsw PUBLIC gtest_main VectorSimilarity) target_link_libraries(test_hnsw_parallel PUBLIC gtest_main VectorSimilarity) diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp index 4eb389260..03f0b4e52 100644 --- a/tests/unit/test_allocator.cpp +++ b/tests/unit/test_allocator.cpp @@ -10,7 +10,7 @@ #include "VecSim/memory/vecsim_base.h" #include "VecSim/algorithms/brute_force/brute_force_single.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "VecSim/index_factories/hnsw_factory.h" @@ -83,7 +83,7 @@ TEST_F(AllocatorTest, test_nested_object) { template class IndexAllocatorTest : public ::testing::Test {}; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(IndexAllocatorTest, DataTypeSet); diff --git a/tests/unit/test_bf16.cpp b/tests/unit/test_bf16.cpp index 921c80c35..95e12c98b 100644 --- a/tests/unit/test_bf16.cpp +++ b/tests/unit/test_bf16.cpp @@ -2,7 +2,7 @@ #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "mock_thread_pool.h" #include "VecSim/query_result_definitions.h" diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 
c56415e3d..b3d5b1192 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/algorithms/brute_force/brute_force.h" #include "VecSim/algorithms/brute_force/brute_force_single.h" #include "cpu_features_macros.h" @@ -32,7 +32,7 @@ class BruteForceTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(BruteForceTest, DataTypeSet); diff --git a/tests/unit/test_bruteforce_multi.cpp b/tests/unit/test_bruteforce_multi.cpp index ef9cfc636..55aadedd4 100644 --- a/tests/unit/test_bruteforce_multi.cpp +++ b/tests/unit/test_bruteforce_multi.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/algorithms/brute_force/brute_force_multi.h" #include @@ -27,7 +27,7 @@ class BruteForceMultiTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(BruteForceMultiTest, DataTypeSet); diff --git a/tests/unit/test_common.cpp b/tests/unit/test_common.cpp index 58df46fba..bdfd6d9f2 100644 --- a/tests/unit/test_common.cpp +++ b/tests/unit/test_common.cpp @@ -10,7 +10,7 @@ #include "VecSim/query_result_definitions.h" #include "VecSim/utils/updatable_heap.h" #include "VecSim/utils/vec_utils.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/containers/vecsim_results_container.h" #include "VecSim/algorithms/hnsw/hnsw.h" #include "VecSim/index_factories/hnsw_factory.h" @@ -32,7 +32,7 @@ using float16 = vecsim_types::float16; template class CommonIndexTest : public ::testing::Test {}; -// DataTypeSet are defined in test_utils.h +// DataTypeSet are defined in unit_test_utils.h TYPED_TEST_SUITE(CommonIndexTest, DataTypeSet); diff --git a/tests/unit/test_fp16.cpp b/tests/unit/test_fp16.cpp index 377ef8f32..244bb9d0c 100644 --- a/tests/unit/test_fp16.cpp +++ b/tests/unit/test_fp16.cpp @@ -2,7 +2,7 @@ #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "mock_thread_pool.h" #include "VecSim/query_result_definitions.h" diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp index f57a6d3e3..cc400d48a 100644 --- a/tests/unit/test_hnsw.cpp +++ b/tests/unit/test_hnsw.cpp @@ -9,7 +9,7 @@ #include "VecSim/vec_sim_debug.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "VecSim/query_result_definitions.h" #include @@ -36,7 +36,7 @@ class HNSWTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWTest, DataTypeSet); diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp index ba87f1759..026f96e62 100644 --- a/tests/unit/test_hnsw_multi.cpp +++ b/tests/unit/test_hnsw_multi.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" 
+#include "unit_test_utils.h" #include "VecSim/algorithms/hnsw/hnsw_multi.h" #include #include @@ -31,7 +31,7 @@ class HNSWMultiTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWMultiTest, DataTypeSet); diff --git a/tests/unit/test_hnsw_parallel.cpp b/tests/unit/test_hnsw_parallel.cpp index a2d4827ca..0354a6af1 100644 --- a/tests/unit/test_hnsw_parallel.cpp +++ b/tests/unit/test_hnsw_parallel.cpp @@ -7,7 +7,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/query_result_definitions.h" #include "VecSim/vec_sim_debug.h" #include @@ -124,7 +124,7 @@ class HNSWTestParallel : public ::testing::Test { void parallelInsertSearch(bool is_multi); }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWTestParallel, DataTypeSet); diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp index 4b1df107a..db751b792 100644 --- a/tests/unit/test_hnsw_tiered.cpp +++ b/tests/unit/test_hnsw_tiered.cpp @@ -6,7 +6,7 @@ #include #include -#include "test_utils.h" +#include "unit_test_utils.h" #include "mock_thread_pool.h" #include diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 2968b1c9f..4a546b6ca 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -29,7 +29,7 @@ #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/F16C.h" -#include "../utils/tests_utils.h" +#include "tests_utils.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; diff --git a/tests/unit/test_utils.cpp b/tests/unit/unit_test_utils.cpp similarity index 99% rename from tests/unit/test_utils.cpp rename to tests/unit/unit_test_utils.cpp index 7b99eba22..89973d19d 100644 --- a/tests/unit/test_utils.cpp +++ b/tests/unit/unit_test_utils.cpp @@ -4,7 +4,7 @@ *the Server Side Public License v1 (SSPLv1). 
*/ -#include "test_utils.h" +#include "unit_test_utils.h" #include "gtest/gtest.h" #include "VecSim/utils/vec_utils.h" #include "VecSim/memory/vecsim_malloc.h" diff --git a/tests/unit/test_utils.h b/tests/unit/unit_test_utils.h similarity index 100% rename from tests/unit/test_utils.h rename to tests/unit/unit_test_utils.h From 602f8e94362d966c1f9f54de11618c94f7222363 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 11:22:32 +0000 Subject: [PATCH 08/26] seed create vec --- tests/benchmark/spaces_benchmarks/bm_spaces.h | 2 +- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 5 ++--- tests/unit/test_spaces.cpp | 8 ++++---- tests/utils/tests_utils.h | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 86cb45553..909f4d6dd 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -130,6 +130,6 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); -#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ +#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 8cb323043..234550202 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -25,8 +25,8 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { v2 = new int8_t[dim]; // random for int8_t and uint8_t is not provided by the standard library - memcpy(v1, test_utils::create_int8_vec(dim).data(), dim); - memcpy(v2, test_utils::create_int8_vec(dim).data(), dim); + memcpy(v1, test_utils::create_int8_vec(dim, 123).data(), dim); + memcpy(v2, test_utils::create_int8_vec(dim, 1234).data(), dim); } void TearDown(const ::benchmark::State &state) { delete v1; @@ -34,7 +34,6 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { } }; - #ifdef CPU_FEATURES_ARCH_X86_64 cpu_features::X86Features opt = cpu_features::GetX86Info().features; diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 4a546b6ca..00f2a2d2f 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -913,8 +913,8 @@ class INT8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { auto optimization = cpu_features::GetX86Info().features; size_t dim = GetParam(); - auto v1 = test_utils::create_int8_vec(dim); - auto v2 = test_utils::create_int8_vec(dim); + auto v1 = test_utils::create_int8_vec(dim, 123); + auto v2 = test_utils::create_int8_vec(dim, 1234); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; @@ -931,7 +931,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << 
dim; - ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; @@ -946,6 +946,6 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { } INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, - testing::Range(32UL, 32 * 2UL + 1)); + testing::Range(64UL, 64 * 2UL + 1)); #endif // CPU_FEATURES_ARCH_X86_64 diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 568fc1a49..0231a9838 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -5,9 +5,9 @@ namespace test_utils { -std::vector create_int8_vec(size_t dim) { +static std::vector create_int8_vec(size_t dim, int seed = 1234) { - std::mt19937 gen(1234); // Mersenne Twister engine initialized with the fixed seed + std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed // uniform_int_distribution doesn't support int8, // Define a distribution range for int8_t From cde5e2d020910ce15e03fff6990bf2a70048acc6 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:22:22 +0000 Subject: [PATCH 09/26] format --- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 234550202..96c02a44c 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -37,7 +37,7 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { #ifdef CPU_FEATURES_ARCH_X86_64 cpu_features::X86Features opt = cpu_features::GetX86Info().features; -// AVX512_BF16 functions +// AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, @@ -46,8 +46,7 @@ INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_ // avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported) -#endif // AVX512_BF16 - +#endif // AVX512_F_BW_VL_VNNI #endif // x86_64 From cdb4d7f621513c70c5bdd00fafda2eb566ec014f Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:46:29 +0000 Subject: [PATCH 10/26] implmenet IP + unit test --- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 56 +++++++++++++++++++ src/VecSim/spaces/IP_space.cpp | 13 +++++ .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 6 ++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 1 + tests/unit/test_spaces.cpp | 43 +++++++++++++- 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h new file mode 100644 index 000000000..a7b99fcb8 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -0,0 +1,56 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#include "VecSim/spaces/space_includes.h" + +static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { + __m256i temp_a = _mm256_loadu_epi8(pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += 32; + + __m256i temp_b = _mm256_loadu_epi8(pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += 32; + + // _mm512_dpwssd_epi32(src, a, b) + // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding + // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results + // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. + sum = _mm512_dpwssd_epi32(sum, va, vb); +} + +template // 0..32 +float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + const int8_t *pEnd1 = pVect1 + dimension; + + __m512i sum = _mm512_setzero_epi32(); + + // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, + // so mask loading is guaranteed to be safe + if constexpr (residual) { + __mmask32 mask = (1LU << residual) - 1; + __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += residual; + + __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += residual; + + sum = _mm512_dpwssd_epi32(sum, va, vb); + } + + // We dealt with the residual part. We are left with some multiple of 32-int_8. + do { + InnerProductStep(pVect1, pVect2, sum); + } while (pVect1 < pEnd1); + + return 1.0f - float(_mm512_reduce_add_epi32(sum)); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 699919dc2..0cebb3bfb 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -16,6 +16,7 @@ #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" @@ -207,6 +208,18 @@ dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con if (dim < 32) { return ret_dist_func; } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. 
+ return Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ return ret_dist_func; } } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index d82d4141d..3d3da5546 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -7,6 +7,7 @@ #include "AVX512BW_VBMI2.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h" namespace spaces { @@ -17,6 +18,11 @@ dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } +dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} #include "implementation_chooser_cleanup.h" diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 818b9529f..c1ef5d6b8 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -11,5 +11,6 @@ namespace spaces { dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // namespace spaces diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 00f2a2d2f..61e6f167c 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -300,6 +300,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(IP_FP64_GetDistFunc(dim), FP64_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); } for (size_t dim = 8; dim < 16; dim++) { ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr); @@ -309,6 +310,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); } for (size_t dim = 16; dim < 32; dim++) { ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); @@ -316,6 +318,7 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); } } @@ -931,7 +934,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; - ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. 
optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; @@ -945,7 +948,43 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } +TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + auto v1 = test_utils::create_int8_vec(dim, 123); + auto v2 = test_utils::create_int8_vec(dim, 1234); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_InnerProduct(v1.data(), v2.data(), dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, - testing::Range(64UL, 64 * 2UL + 1)); + testing::Range(32UL, 32 * 2UL + 1)); #endif // CPU_FEATURES_ARCH_X86_64 From 5f018903fd7b6ca1da042ffd24571839d363d1f0 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:50:45 +0000 Subject: [PATCH 11/26] ip bm --- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 96c02a44c..5ab3ebf7f 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -40,7 +40,7 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512_F_BW_VL_VNNI functions #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; -INITIALIZE_BENCHMARKS_SET_L2(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported); From 2dce6f0d3da611587f87001a9c7be9d7ea61e384 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 12:54:20 +0000 Subject: [PATCH 12/26] format --- tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index 5ab3ebf7f..cb12f10cb 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -41,7 +41,7 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; #ifdef OPT_AVX512_F_BW_VL_VNNI bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, - avx512_f_bw_vl_vnni_supported); + avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, // avx512_f_bw_vl_vnni_supported); // INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, From 3d3b3758e3e2f701917cd680cc20a267641ecad0 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 8 Dec 2024 15:34:34 +0000 Subject: [PATCH 13/26] implement cosine in ip API change create_int8_vec to populate_int8_vec add compute norm --- src/VecSim/spaces/CMakeLists.txt | 2 - src/VecSim/spaces/Cosine/Cosine.cpp | 23 ----- src/VecSim/spaces/Cosine/Cosine.h | 11 --- src/VecSim/spaces/Cosine_space.cpp | 27 ------ src/VecSim/spaces/Cosine_space.h | 13 --- src/VecSim/spaces/IP/IP.cpp | 14 ++- src/VecSim/spaces/IP/IP.h | 1 + .../spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 20 +++- src/VecSim/spaces/IP_space.cpp | 27 ++++++ src/VecSim/spaces/IP_space.h | 2 + .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 7 ++ .../spaces/functions/AVX512F_BW_VL_VNNI.h | 1 + src/VecSim/spaces/spaces.cpp | 13 +++ tests/benchmark/spaces_benchmarks/bm_spaces.h | 10 +- .../spaces_benchmarks/bm_spaces_int8.cpp | 23 ++--- tests/unit/test_spaces.cpp | 95 +++++++++++++------ tests/utils/tests_utils.h | 17 +++- 17 files changed, 176 insertions(+), 130 deletions(-) delete mode 100644 src/VecSim/spaces/Cosine/Cosine.cpp delete mode 100644 src/VecSim/spaces/Cosine/Cosine.h delete mode 100644 src/VecSim/spaces/Cosine_space.cpp delete mode 100644 src/VecSim/spaces/Cosine_space.h diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index fc23adc18..1fc9473b2 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -3,7 +3,6 @@ project(VectorSimilaritySpaces_no_optimization) add_library(VectorSimilaritySpaces_no_optimization L2/L2.cpp IP/IP.cpp - Cosine/Cosine.cpp ) include(${root}/cmake/cpu_features.cmake) @@ -86,7 +85,6 @@ endif() add_library(VectorSimilaritySpaces L2_space.cpp IP_space.cpp - Cosine_space.cpp spaces.cpp ${OPTIMIZATIONS} computer/preprocessor_container.cpp diff --git a/src/VecSim/spaces/Cosine/Cosine.cpp b/src/VecSim/spaces/Cosine/Cosine.cpp deleted file mode 100644 index 1cbc9a191..000000000 --- a/src/VecSim/spaces/Cosine/Cosine.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). 
- */ - -#include "Cosine.h" - -float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { - int8_t *pVect1 = (int8_t *)pVect1v; - int8_t *pVect2 = (int8_t *)pVect2v; - - int res = 0; - for (size_t i = 0; i < dimension; i++) { - int16_t a = pVect1[i]; - int16_t b = pVect2[i]; - res += a * b; - } - - float norm_v1 = *(float *)pVect1v; - float norm_v2 = *(float *)pVect2v; - return 1.0f - float(res) / (norm_v1 * norm_v2); -} diff --git a/src/VecSim/spaces/Cosine/Cosine.h b/src/VecSim/spaces/Cosine/Cosine.h deleted file mode 100644 index c42f6c14f..000000000 --- a/src/VecSim/spaces/Cosine/Cosine.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). - */ - -#pragma once - -#include - -float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/Cosine_space.cpp b/src/VecSim/spaces/Cosine_space.cpp deleted file mode 100644 index 7cace4c32..000000000 --- a/src/VecSim/spaces/Cosine_space.cpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). - */ - -#include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/Cosine_space.h" -#include "VecSim/spaces/Cosine/Cosine.h" - -namespace spaces { -dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, - const void *arch_opt) { - unsigned char dummy_alignment; - if (alignment == nullptr) { - alignment = &dummy_alignment; - } - - dist_func_t ret_dist_func = INT8_Cosine; - // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. - if (dim < 32) { - return ret_dist_func; - } - return ret_dist_func; -} - -} // namespace spaces diff --git a/src/VecSim/spaces/Cosine_space.h b/src/VecSim/spaces/Cosine_space.h deleted file mode 100644 index e139a5521..000000000 --- a/src/VecSim/spaces/Cosine_space.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - *Copyright Redis Ltd. 2021 - present - *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or - *the Server Side Public License v1 (SSPLv1). 
- */ - -#pragma once -#include "VecSim/spaces/spaces.h" - -namespace spaces { -dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, - const void *arch_opt = nullptr); -} // namespace spaces diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 1562e5b1a..c3856abda 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -67,7 +67,7 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension return 1.0f - res; } -float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { +static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; @@ -77,5 +77,15 @@ float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensi int16_t b = pVect2[i]; res += a * b; } - return 1.0f - float(res); + return res; +} + +float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); +} + +float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); + float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); + return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2); } diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 64e11b52f..d712499ed 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -18,3 +18,4 @@ float BF16_InnerProduct_LittleEndian(const void *pVect1v, const void *pVect2v, s float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); +float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h index a7b99fcb8..fcd33c00c 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -23,8 +23,7 @@ static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &s } template // 0..32 -float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, - size_t dimension) { +static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; @@ -52,5 +51,20 @@ float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void InnerProductStep(pVect1, pVect2, sum); } while (pVect1 < pEnd1); - return 1.0f - float(_mm512_reduce_add_epi32(sum)); + return _mm512_reduce_add_epi32(sum); +} + +template // 0..32 +float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + + return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); +} +template // 0..32 +float INT8_CosineSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); + float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); + return 1.0f - + float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2); } diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 0cebb3bfb..b168b2d7d 100644 --- 
a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -222,4 +222,31 @@ dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con #endif // __x86_64__ return ret_dist_func; } + +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_Cosine; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. + if (dim < 32) { + return ret_dist_func; + } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. + return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} } // namespace spaces diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index 87407c1a3..0d8c3a836 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -18,4 +18,6 @@ dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 3d3da5546..cd66a6096 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -18,12 +18,19 @@ dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } + dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } +dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index c1ef5d6b8..532a33c76 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -12,5 +12,6 @@ namespace spaces { dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/spaces.cpp b/src/VecSim/spaces/spaces.cpp index 84f71b463..b512c9750 100644 --- a/src/VecSim/spaces/spaces.cpp +++ b/src/VecSim/spaces/spaces.cpp @@ -69,6 +69,19 @@ dist_func_t GetDistFunc(VecSimMetric metric, size_t dim, } throw std::invalid_argument("Invalid metric"); } +template <> +dist_func_t 
GetDistFunc(VecSimMetric metric, size_t dim, + unsigned char *alignment) { + switch (metric) { + case VecSimMetric_Cosine: + return Cosine_INT8_GetDistFunc(dim, alignment); + case VecSimMetric_IP: + return IP_INT8_GetDistFunc(dim, alignment); + case VecSimMetric_L2: + return L2_INT8_GetDistFunc(dim, alignment); + } + throw std::invalid_argument("Invalid metric"); +} template <> normalizeVector_f GetNormalizeFunc(void) { diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 909f4d6dd..b7431c43c 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -124,11 +124,11 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); -#define INITIALIZE_BENCHMARKS_SET_COSINE(bm_class, type_prefix, arch, dim_opt, arch_supported) \ - INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ - INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, COSINE, arch_supported); \ - INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); \ - INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, COSINE, dim_opt, arch_supported); +#define INITIALIZE_BENCHMARKS_SET_Cosine(bm_class, type_prefix, arch, dim_opt, arch_supported) \ + INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, Cosine, arch_supported); \ + INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, Cosine, arch_supported); \ + INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported); \ + INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported); #define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp index cb12f10cb..0adde8972 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -21,12 +21,15 @@ class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { void SetUp(const ::benchmark::State &state) { dim = state.range(0); - v1 = new int8_t[dim]; - v2 = new int8_t[dim]; - - // random for int8_t and uint8_t is not provided by the standard library - memcpy(v1, test_utils::create_int8_vec(dim, 123).data(), dim); - memcpy(v2, test_utils::create_int8_vec(dim, 1234).data(), dim); + // Allocate vector with extra space for cosine calculations + v1 = new int8_t[dim + sizeof(float)]; + v2 = new int8_t[dim + sizeof(float)]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + // Store the norm in the extra space for cosine calculations + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); } void TearDown(const ::benchmark::State &state) { delete v1; @@ -42,14 +45,12 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, avx512_f_bw_vl_vnni_supported); -// INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, -// 
avx512_f_bw_vl_vnni_supported); -// INITIALIZE_BENCHMARKS_SET_COSINE(BM_VecSimSpaces_Integers_INT8, INT8, AVX512_F_BW_VL_VNNI, 32, -// avx512_f_bw_vl_vnni_supported) +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, + avx512_f_bw_vl_vnni_supported) #endif // AVX512_F_BW_VL_VNNI #endif // x86_64 -INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32); + INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32); BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 61e6f167c..f76069c46 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -12,12 +12,10 @@ #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/IP/IP.h" #include "VecSim/spaces/L2/L2.h" -#include "VecSim/spaces/Cosine/Cosine.h" #include "VecSim/utils/vec_utils.h" #include "VecSim/types/bfloat16.h" #include "VecSim/spaces/IP_space.h" #include "VecSim/spaces/L2_space.h" -#include "VecSim/spaces/Cosine_space.h" #include "VecSim/types/float16.h" #include "VecSim/spaces/functions/AVX512F.h" #include "VecSim/spaces/functions/AVX.h" @@ -243,19 +241,19 @@ TEST_F(SpacesTest, int8_ip_no_optimization_func_test) { TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) { size_t dim = 4; - // create normalized vector with extra space for the norm - std::vector vec1(dim + sizeof(float), 0); - std::vector vec2(dim + sizeof(float), 0); + // create a vector with extra space for the norm + int8_t *v1 = new int8_t[dim + sizeof(float)]; + int8_t *v2 = new int8_t[dim + sizeof(float)]; - vec1[0] = 1; // {1, 0, 0, 0} - vec2[1] = 1; // {1, 0, 0, 0} + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 123); // write the norm at the end of the vector - *(float *)(vec1.data() + dim) = 1.0; - *(float *)(vec2.data() + dim) = 1.0; + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); - float dist = INT8_InnerProduct((const void *)vec1.data(), (const void *)vec2.data(), dim); - ASSERT_EQ(dist, 1.0); + float dist = INT8_Cosine((const void *)v1, (const void *)v2, dim); + ASSERT_NEAR(dist, 0.0, 0.000001); } /* ======================== Test Getters ======================== */ @@ -280,11 +278,11 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricFP16) { (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), std::invalid_argument); } -// TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) { -// EXPECT_THROW( -// (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, -// nullptr)), std::invalid_argument); -// } +TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) { + EXPECT_THROW( + (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), + std::invalid_argument); +} using namespace spaces; @@ -916,8 +914,10 @@ class INT8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { auto optimization = cpu_features::GetX86Info().features; size_t dim = GetParam(); - auto v1 = test_utils::create_int8_vec(dim, 123); - auto v2 = test_utils::create_int8_vec(dim, 1234); + int8_t *v1 = new int8_t[dim]; + int8_t *v2 = new int8_t[dim]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; @@ -925,7 
+925,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { }; dist_func_t arch_opt_func; - float baseline = INT8_L2Sqr(v1.data(), v2.data(), dim); + float baseline = INT8_L2Sqr(v1, v2, dim); #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { @@ -933,7 +933,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. optimization.avx512f = optimization.avx512bw = optimization.avx512vl = @@ -943,16 +943,17 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { unsigned char alignment = 0; arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, INT8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) - << "No optimization with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { auto optimization = cpu_features::GetX86Info().features; size_t dim = GetParam(); - auto v1 = test_utils::create_int8_vec(dim, 123); - auto v2 = test_utils::create_int8_vec(dim, 1234); + int8_t *v1 = new int8_t[dim]; + int8_t *v2 = new int8_t[dim]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; @@ -960,7 +961,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { }; dist_func_t arch_opt_func; - float baseline = INT8_InnerProduct(v1.data(), v2.data(), dim); + float baseline = INT8_InnerProduct(v1, v2, dim); #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { @@ -968,7 +969,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; // Unset optimizations flag, so we'll choose the next optimization. 
optimization.avx512f = optimization.avx512bw = optimization.avx512vl = @@ -979,8 +980,46 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, INT8_InnerProduct) << "Unexpected distance function chosen for dim " << dim; - ASSERT_EQ(baseline, arch_opt_func(v1.data(), v2.data(), dim)) - << "No optimization with dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + int8_t *v1 = new int8_t[dim + sizeof(float)]; + int8_t *v2 = new int8_t[dim + sizeof(float)]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + // write the norm at the end of the vector + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_Cosine(v1, v2, dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. 
+ optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_Cosine) << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 0231a9838..31dc3d9ef 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -5,7 +5,8 @@ namespace test_utils { -static std::vector create_int8_vec(size_t dim, int seed = 1234) { +// Assuming v is a memory allocation of size dim * sizeof(float) +static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) { std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed @@ -13,12 +14,18 @@ static std::vector create_int8_vec(size_t dim, int seed = 1234) { // Define a distribution range for int8_t std::uniform_int_distribution dis(-128, 127); - std::vector vec(dim); - for (auto &num : vec) { - num = static_cast(dis(gen)); + for (size_t i = 0; i < dim; i++) { + v[i] = static_cast(dis(gen)); } +} - return vec; +// TODO: replace with normalize function from VecSim +float compute_norm(const int8_t *vec, size_t dim) { + int norm = 0; + for (size_t i = 0; i < dim; i++) { + norm += vec[i] * vec[i]; + } + return sqrt(norm); } } // namespace test_utils From 6f211b32159cd391e9ec4ec5b806f6109dcd899b Mon Sep 17 00:00:00 2001 From: meiravgri Date: Mon, 9 Dec 2024 11:43:18 +0000 Subject: [PATCH 14/26] use mask sub instead of msk load --- src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h index 9130d6414..edeb37b4b 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -37,15 +37,15 @@ float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect // so mask loading is guaranteed to be safe if constexpr (residual) { __mmask32 mask = (1LU << residual) - 1; - __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); + __m256i temp_a = _mm256_loadu_epi8(pVect1); __m512i va = _mm512_cvtepi8_epi16(temp_a); pVect1 += residual; - __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); + __m256i temp_b = _mm256_loadu_epi8(pVect2); __m512i vb = _mm512_cvtepi8_epi16(temp_b); pVect2 += residual; - __m512i diff = _mm512_sub_epi16(va, vb); + __m512i diff = _mm512_maskz_sub_epi16(mask, va, vb); sum = _mm512_dpwssd_epi32(sum, diff, diff); } From 6ac65a3476d3de2c261fb41ae581a785baf85623 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 06:14:36 +0000 Subject: [PATCH 15/26] loop size = 512 minimal dim = 32 --- .../spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 21 ++++++++++++------- .../spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 21 ++++++++++++------- .../spaces/functions/AVX512F_BW_VL_VNNI.cpp | 6 +++--- .../spaces/functions/implementation_chooser.h | 15 ++++++++----- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h index fcd33c00c..28187bf31 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -22,7 
+22,7 @@ static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &s sum = _mm512_dpwssd_epi32(sum, va, vb); } -template // 0..32 +template // 0..64 static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; @@ -33,23 +33,28 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, // so mask loading is guaranteed to be safe - if constexpr (residual) { - __mmask32 mask = (1LU << residual) - 1; + if constexpr (residual % 32) { + __mmask32 mask = (1LU << (residual % 32)) - 1; __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); __m512i va = _mm512_cvtepi8_epi16(temp_a); - pVect1 += residual; + pVect1 += residual % 32; __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); __m512i vb = _mm512_cvtepi8_epi16(temp_b); - pVect2 += residual; + pVect2 += residual % 32; sum = _mm512_dpwssd_epi32(sum, va, vb); } - // We dealt with the residual part. We are left with some multiple of 32-int_8. - do { + if constexpr (residual >= 32) { + InnerProductStep(pVect1, pVect2, sum); + } + + // We dealt with the residual part. We are left with some multiple of 64-int_8. + while (pVect1 < pEnd1) { + InnerProductStep(pVect1, pVect2, sum); InnerProductStep(pVect1, pVect2, sum); - } while (pVect1 < pEnd1); + } return _mm512_reduce_add_epi32(sum); } diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h index edeb37b4b..d47964ca2 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -23,7 +23,7 @@ static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { sum = _mm512_dpwssd_epi32(sum, diff, diff); } -template // 0..32 +template // 0..64 float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; @@ -35,24 +35,29 @@ float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, // so mask loading is guaranteed to be safe - if constexpr (residual) { - __mmask32 mask = (1LU << residual) - 1; + if constexpr (residual % 32) { + __mmask32 mask = (1LU << (residual % 32)) - 1; __m256i temp_a = _mm256_loadu_epi8(pVect1); __m512i va = _mm512_cvtepi8_epi16(temp_a); - pVect1 += residual; + pVect1 += residual % 32; __m256i temp_b = _mm256_loadu_epi8(pVect2); __m512i vb = _mm512_cvtepi8_epi16(temp_b); - pVect2 += residual; + pVect2 += residual % 32; __m512i diff = _mm512_maskz_sub_epi16(mask, va, vb); sum = _mm512_dpwssd_epi32(sum, diff, diff); } - // We dealt with the residual part. We are left with some multiple of 32-int_8. - do { + if constexpr (residual >= 32) { L2SqrStep(pVect1, pVect2, sum); - } while (pVect1 < pEnd1); + } + + // We dealt with the residual part. We are left with some multiple of 64-int_8. 
+ while (pVect1 < pEnd1) { + L2SqrStep(pVect1, pVect2, sum); + L2SqrStep(pVect1, pVect2, sum); + } return _mm512_reduce_add_epi32(sum); } diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index cd66a6096..599984954 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -15,19 +15,19 @@ namespace spaces { dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); return ret_dist_func; } diff --git a/src/VecSim/spaces/functions/implementation_chooser.h b/src/VecSim/spaces/functions/implementation_chooser.h index 6bb61815e..b32ad56c6 100644 --- a/src/VecSim/spaces/functions/implementation_chooser.h +++ b/src/VecSim/spaces/functions/implementation_chooser.h @@ -25,23 +25,28 @@ // of 4N, 4N+1, 4N+2, 4N+3. #define C4(X, func, N) X(4 * N, func) X(4 * N + 1, func) X(4 * N + 2, func) X(4 * N + 3, func) -// Macros for 8, 16 and 32 cases. Used to collapse the switch statement. Expands into 0-31, 0-15 or -// 0-7 cases. +// Macros for 8, 16, 32 and 64 cases. Used to collapse the switch statement. Expands into 0-63, +// 0-31, 0-15 or 0-7 cases. #define CASES32(X, func) \ C4(X, func, 0) \ C4(X, func, 1) \ C4(X, func, 2) C4(X, func, 3) C4(X, func, 4) C4(X, func, 5) C4(X, func, 6) C4(X, func, 7) #define CASES16(X, func) C4(X, func, 0) C4(X, func, 1) C4(X, func, 2) C4(X, func, 3) #define CASES8(X, func) C4(X, func, 0) C4(X, func, 1) +#define CASES64(X, func) \ + CASES32(X, func) \ + C4(X, func, 8) \ + C4(X, func, 9) \ + C4(X, func, 10) C4(X, func, 11) C4(X, func, 12) C4(X, func, 13) C4(X, func, 14) C4(X, func, 15) // Main macro. Expands into a switch statement that chooses the implementation based on the // dimension's remainder. // @params: // out: The output variable that will be set to the chosen implementation. // dim: The dimension. -// chunk: The chunk size. Can be 32, 16 or 8. 32 for 16-bit elements, 16 for 32-bit elements, 8 -// for 64-bit elements. -// func: The templated function that we want to choose the implementation for. +// chunk: The chunk size. Can be 64, 32, 16 or 8. 64 for 8-bit elements, 32 for 16-bit elements, +// 16 for 32-bit elements, 8 for 64-bit elements. func: The templated function that we want to +// choose the implementation for. 
#define CHOOSE_IMPLEMENTATION(out, dim, chunk, func) \ do { \ decltype(out) __ret_dist_func; \ From 0d07c5d672b1355258a5deac68faafe0a200610f Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 10:38:09 +0000 Subject: [PATCH 16/26] add int8 to bm --- tests/benchmark/benchmarks.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 2e6664424..867077ede 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -13,6 +13,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo spaces_fp64 echo spaces_bf16 echo spaces_fp16 + echo spaces_int8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo basics_single_fp32 echo basics_multi_fp32 @@ -20,6 +21,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_fp64 echo spaces_bf16 echo spaces_fp16 + echo spaces_int8 # Basic benchmarks elif [ "$BM_TYPE" = "bm-basics-fp32-single" ] ; then echo basics_single_fp32 @@ -66,6 +68,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_fp16 echo spaces_fp64 echo spaces_bf16 + echo spaces_int8 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then echo spaces_fp32 elif [ "$BM_TYPE" = "bm-spaces-fp64" ] ; then From 3586a76f8d7137404a0fa3370bb89a2241aa9e24 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 10:40:58 +0000 Subject: [PATCH 17/26] reanme to simd64 --- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 4 ++-- src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 2 +- src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h index 28187bf31..ffda357ff 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -60,13 +60,13 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, } template // 0..32 -float INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, +float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); } template // 0..32 -float INT8_CosineSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, +float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h index d47964ca2..3f4ba33a6 100644 --- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -24,7 +24,7 @@ static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { } template // 0..64 -float INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, +float INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, size_t dimension) { int8_t *pVect1 = (int8_t *)pVect1v; int8_t *pVect2 = (int8_t *)pVect2v; diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 599984954..661c2c945 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -15,19 +15,19 @@ namespace spaces { 
dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD32_AVX512F_BW_VL_VNNI); + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD64_AVX512F_BW_VL_VNNI); return ret_dist_func; } From adbc4d7e1be2e6b6efeb6d82b7e342e52a174c5e Mon Sep 17 00:00:00 2001 From: meiravgri Date: Tue, 10 Dec 2024 16:32:56 +0000 Subject: [PATCH 18/26] convert to int before multiplication --- tests/utils/tests_utils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h index 31dc3d9ef..01461a78d 100644 --- a/tests/utils/tests_utils.h +++ b/tests/utils/tests_utils.h @@ -23,7 +23,8 @@ static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) { float compute_norm(const int8_t *vec, size_t dim) { int norm = 0; for (size_t i = 0; i < dim; i++) { - norm += vec[i] * vec[i]; + int val = static_cast(vec[i]); + norm += val * val; } return sqrt(norm); } From cb2c88727f86a1fa6063d035ee0d933d4e2b3da9 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Mon, 16 Dec 2024 13:00:39 +0000 Subject: [PATCH 19/26] review comments: align to vector size ncluding the norm in cosine dist unit test cover small dim in cosine chooser --- src/VecSim/spaces/IP/IP.cpp | 11 ++++++----- src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h | 14 ++++++++------ src/VecSim/spaces/IP_space.cpp | 3 ++- src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h | 2 +- src/VecSim/spaces/spaces.cpp | 1 + tests/unit/test_spaces.cpp | 3 +++ 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index c3856abda..0884df3bb 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -73,9 +73,7 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, int res = 0; for (size_t i = 0; i < dimension; i++) { - int16_t a = pVect1[i]; - int16_t b = pVect2[i]; - res += a * b; + res += pVect1[i] * pVect2[i]; } return res; } @@ -85,7 +83,10 @@ float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensi } float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { - float norm_v1 = *(float *)((int8_t *)pVect1v + dimension); - float norm_v2 = *(float *)((int8_t *)pVect2v + dimension); + // We expect the vectors' norm to be stored at the end of the vector. 
+    float norm_v1 =
+        *reinterpret_cast(static_cast(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast(static_cast(pVect2v) + dimension);
     return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2);
 }
diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
index ffda357ff..7716d8ad7 100644
--- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
+++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h
@@ -59,17 +59,19 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v,
     return _mm512_reduce_add_epi32(sum);
 }
 
-template // 0..32
+template // 0..64
 float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
                                                  size_t dimension) {
     return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension);
 }
 
-template // 0..32
+template // 0..64
 float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
                                            size_t dimension) {
-    float norm_v1 = *(float *)((int8_t *)pVect1v + dimension);
-    float norm_v2 = *(float *)((int8_t *)pVect2v + dimension);
-    return 1.0f -
-           float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2);
+    float ip = INT8_InnerProductImp(pVect1v, pVect2v, dimension);
+    float norm_v1 =
+        *reinterpret_cast(static_cast(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast(static_cast(pVect2v) + dimension);
+    return 1.0f - ip / (norm_v1 * norm_v2);
 }
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index b168b2d7d..27e915e71 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -241,7 +241,8 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
                                  : *static_cast(arch_opt);
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
-        if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
+        // Align to vector memory size, including the norm at the end of the vector.
+        if (dim % 32 + 4 == 0) // no point in aligning if we have an offsetting residual
             *alignment = 32 * sizeof(int8_t); // align to 256 bits.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
index 3f4ba33a6..2c8b846af 100644
--- a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
+++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h
@@ -36,7 +36,7 @@ float INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect
     // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block,
     // so mask loading is guaranteed to be safe
     if constexpr (residual % 32) {
-        __mmask32 mask = (1LU << (residual % 32)) - 1;
+        constexpr __mmask32 mask = (1LU << (residual % 32)) - 1;
         __m256i temp_a = _mm256_loadu_epi8(pVect1);
         __m512i va = _mm512_cvtepi8_epi16(temp_a);
         pVect1 += residual % 32;
diff --git a/src/VecSim/spaces/spaces.cpp b/src/VecSim/spaces/spaces.cpp
index b512c9750..4385b5e94 100644
--- a/src/VecSim/spaces/spaces.cpp
+++ b/src/VecSim/spaces/spaces.cpp
@@ -69,6 +69,7 @@ dist_func_t GetDistFunc(VecSimMetric metric, size_t dim,
     }
     throw std::invalid_argument("Invalid metric");
 }
+
 template <>
 dist_func_t GetDistFunc(VecSimMetric metric, size_t dim,
                         unsigned char *alignment) {
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index f76069c46..552e7464a 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -299,6 +299,7 @@ TEST_F(SpacesTest, smallDimChooser) {
         ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
         ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
     for (size_t dim = 8; dim < 16; dim++) {
         ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr);
@@ -309,6 +310,7 @@ TEST_F(SpacesTest, smallDimChooser) {
        ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
         ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
     for (size_t dim = 16; dim < 32; dim++) {
         ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian);
@@ -317,6 +319,7 @@ TEST_F(SpacesTest, smallDimChooser) {
         ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian);
         ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct);
         ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct);
+        ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine);
     }
 }

From 880dd332e5284aa96a46c6a4e25e341f585e9104 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 13:02:16 +0000
Subject: [PATCH 20/26] use sizeof(float) instead of 4

---
 src/VecSim/spaces/IP_space.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 27e915e71..d97d53d96 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -242,7 +242,7 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         // Align to vector memory size, including the norm at the end of the vector.
-        if (dim % 32 + 4 == 0) // no point in aligning if we have an offsetting residual
+        if (dim % 32 + sizeof(float) == 0) // no point in aligning if we have an offsetting residual
             *alignment = 32 * sizeof(int8_t); // align to 256 bits.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }

From b79777fe4daa5e84812d3bd6adae2afcd79dd48d Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 13:10:20 +0000
Subject: [PATCH 21/26] remove int conversion in test_utils::compute_norm

---
 tests/utils/tests_utils.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index 01461a78d..31dc3d9ef 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -23,8 +23,7 @@ static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) {
 float compute_norm(const int8_t *vec, size_t dim) {
     int norm = 0;
     for (size_t i = 0; i < dim; i++) {
-        int val = static_cast(vec[i]);
-        norm += val * val;
+        norm += vec[i] * vec[i];
     }
     return sqrt(norm);
 }

From ab159bce10987ec372a3be77ed9aff234f70e565 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 13:53:30 +0000
Subject: [PATCH 22/26] REVERT!!! malicious test to see if we get to the code

---
 tests/unit/test_spaces.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 552e7464a..3b3360765 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1018,6 +1018,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
     }
+    ASSERT_EQ(1, 0);
 #endif
     unsigned char alignment = 0;
     arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization);

From 397ac3fb17d714caf2b507bf44963b4c5aadf643 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 14:05:24 +0000
Subject: [PATCH 23/26] assert dummy

---
 tests/unit/test_spaces.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 3b3360765..b59e86532 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1017,8 +1017,9 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
+
+        ASSERT_EQ(alignment, 2141);
     }
-    ASSERT_EQ(1, 0);
 #endif
     unsigned char alignment = 0;
     arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization);

From f9b7b87e851c1b99c3b8beb94bfbbe4f2c9625f7 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 14:24:07 +0000
Subject: [PATCH 24/26] fix alignment test

---
 src/VecSim/spaces/IP_space.cpp | 3 ++-
 tests/unit/test_spaces.cpp | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index d97d53d96..f4d1409e4 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -242,7 +242,8 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         // Align to vector memory size, including the norm at the end of the vector.
-        if (dim % 32 + sizeof(float) == 0) // no point in aligning if we have an offsetting residual
+        if ((dim + sizeof(float)) % 32 ==
+            0) // no point in aligning if we have an offsetting residual
             *alignment = 32 * sizeof(int8_t); // align to 256 bits.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index b59e86532..50dc270c2 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1000,7 +1000,8 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
     *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim);
     auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
         size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8;
-        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0;
+        return ((dim + sizeof(float)) % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t)
+                                                              : 0;
     };
     dist_func_t arch_opt_func;
     float baseline = INT8_Cosine(v1, v2, dim);
@@ -1018,7 +1019,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
 
-        ASSERT_EQ(alignment, 2141);
+        // ASSERT_EQ(alignment, 2141)<< "alignemt for dim = " << dim;
     }
 #endif
     unsigned char alignment = 0;

From c4439f3b42fbd162d8bc26250cfef7c70727cfd0 Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Mon, 16 Dec 2024 14:25:25 +0000
Subject: [PATCH 25/26] remove assert

---
 tests/unit/test_spaces.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 50dc270c2..91bc02943 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1018,8 +1018,6 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
-
-        // ASSERT_EQ(alignment, 2141)<< "alignemt for dim = " << dim;
     }
 #endif
     unsigned char alignment = 0;

From e526d02e87b91048abc13af9ddbf9d1ba0b4fbac Mon Sep 17 00:00:00 2001
From: meiravgri
Date: Tue, 17 Dec 2024 05:42:00 +0000
Subject: [PATCH 26/26] remove cosine alignment

---
 src/VecSim/spaces/IP_space.cpp | 9 +++++----
 tests/unit/test_spaces.cpp | 8 ++------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index f4d1409e4..e7129b2e8 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -241,10 +241,11 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
                                  : *static_cast(arch_opt);
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
-        // Align to vector memory size, including the norm at the end of the vector.
-        if ((dim + sizeof(float)) % 32 ==
-            0) // no point in aligning if we have an offsetting residual
-            *alignment = 32 * sizeof(int8_t); // align to 256 bits.
+        // For int8 vectors with cosine distance, the extra float for the norm shifts alignment to
+        // `(dim + sizeof(float)) % 32`.
+        // Vectors satisfying this have a residual, causing offset loads during calculation.
+        // To avoid complexity, we skip alignment here, assuming the performance impact is
+        // negligible.
         return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
     }
 #endif
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 91bc02943..9931d318a 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -998,11 +998,6 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
     // write the norm at the end of the vector
     *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim);
     *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim);
-    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
-        size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8;
-        return ((dim + sizeof(float)) % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t)
-                                                              : 0;
-    };
     dist_func_t arch_opt_func;
     float baseline = INT8_Cosine(v1, v2, dim);
@@ -1014,7 +1009,8 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
         ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim))
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim;
-        ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim;
+        // We don't align int8 vectors with cosine distance
+        ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim;
         // Unset optimizations flag, so we'll choose the next optimization.
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
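A minimal, self-contained sketch (not part of the patches above) of the memory layout the int8 cosine path assumes: dim int8 elements followed by the vector's L2 norm stored as a float, so a stored vector occupies dim + sizeof(float) bytes. The buffer name, the dim value of 28, and the main() driver are illustrative assumptions rather than code from the repository; the norm accumulation mirrors the tests' compute_norm helper, and the final check restates the alignment arithmetic that motivated dropping alignment for int8 cosine.

// Illustrative sketch only: shows the "dim int8 values + trailing float norm" layout
// that INT8_Cosine reads, and when (dim + sizeof(float)) fills whole 32-byte blocks.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const std::size_t dim = 28; // assumption: 28 + sizeof(float) == 32, exactly one 32-byte block
    std::vector<int8_t> blob(dim + sizeof(float));
    for (std::size_t i = 0; i < dim; i++) {
        blob[i] = static_cast<int8_t>(static_cast<int>(i) - 14); // arbitrary demo values
    }

    // Accumulate the squared norm in int arithmetic (similar to the tests' compute_norm
    // helper), then append the norm right after the int8 payload, where the cosine
    // distance functions expect to find it.
    int sum_sq = 0;
    for (std::size_t i = 0; i < dim; i++) {
        sum_sq += static_cast<int>(blob[i]) * static_cast<int>(blob[i]);
    }
    float norm = std::sqrt(static_cast<float>(sum_sq));
    std::memcpy(blob.data() + dim, &norm, sizeof(float));

    // A 256-bit aligned load only lines up when the stored size is a multiple of 32 bytes,
    // and such dimensions still leave a residual in the SIMD kernel, which is why the last
    // patch skips alignment for int8 cosine altogether.
    std::printf("stored bytes = %zu, whole 32-byte blocks: %s\n", dim + sizeof(float),
                ((dim + sizeof(float)) % 32 == 0) ? "yes" : "no");
    return 0;
}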