diff --git a/cmake/x86_64InstructionFlags.cmake b/cmake/x86_64InstructionFlags.cmake index 1fedda7fe..1ff8f48f2 100644 --- a/cmake/x86_64InstructionFlags.cmake +++ b/cmake/x86_64InstructionFlags.cmake @@ -13,6 +13,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") CHECK_CXX_COMPILER_FLAG(-mavx512vbmi2 CXX_AVX512VBMI2) CHECK_CXX_COMPILER_FLAG(-mavx512fp16 CXX_AVX512FP16) CHECK_CXX_COMPILER_FLAG(-mavx512f CXX_AVX512F) + CHECK_CXX_COMPILER_FLAG(-mavx512vnni CXX_AVX512VNNI) CHECK_CXX_COMPILER_FLAG(-mavx2 CXX_AVX2) CHECK_CXX_COMPILER_FLAG(-mavx CXX_AVX) CHECK_CXX_COMPILER_FLAG(-mf16c CXX_F16C) @@ -48,6 +49,10 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") add_compile_definitions(OPT_AVX512_BW_VBMI2) endif() + if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI) + add_compile_definitions(OPT_AVX512_F_BW_VL_VNNI) + endif() + if(CXX_F16C AND CXX_FMA AND CXX_AVX) add_compile_definitions(OPT_F16C) endif() diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index 9cc0baaaf..1fc9473b2 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -44,6 +44,12 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") list(APPEND OPTIMIZATIONS functions/AVX512F.cpp) endif() + if(CXX_AVX512F AND CXX_AVX512BW AND CXX_AVX512VL AND CXX_AVX512VNNI) + message("Building with AVX512F, AVX512BW, AVX512VL and AVX512VNNI") + set_source_files_properties(functions/AVX512F_BW_VL_VNNI.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512vl -mavx512vnni") + list(APPEND OPTIMIZATIONS functions/AVX512F_BW_VL_VNNI.cpp) + endif() + if(CXX_AVX2) message("Building with AVX2") set_source_files_properties(functions/AVX2.cpp PROPERTIES COMPILE_FLAGS -mavx2) diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp index 98ad07676..0884df3bb 100644 --- a/src/VecSim/spaces/IP/IP.cpp +++ b/src/VecSim/spaces/IP/IP.cpp @@ -66,3 +66,27 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension } return 1.0f - res; } + +static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + res += pVect1[i] * pVect2[i]; + } + return res; +} + +float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); +} + +float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) { + // We expect the vectors' norm to be stored at the end of the vector. + float norm_v1 = + *reinterpret_cast(static_cast(pVect1v) + dimension); + float norm_v2 = + *reinterpret_cast(static_cast(pVect2v) + dimension); + return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/IP/IP.h b/src/VecSim/spaces/IP/IP.h index 50fecef33..d712499ed 100644 --- a/src/VecSim/spaces/IP/IP.h +++ b/src/VecSim/spaces/IP/IP.h @@ -16,3 +16,6 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension float BF16_InnerProduct_LittleEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); + +float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension); +float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension); diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h new file mode 100644 index 000000000..7716d8ad7 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h @@ -0,0 +1,77 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" + +static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { + __m256i temp_a = _mm256_loadu_epi8(pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += 32; + + __m256i temp_b = _mm256_loadu_epi8(pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += 32; + + // _mm512_dpwssd_epi32(src, a, b) + // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding + // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results + // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. + sum = _mm512_dpwssd_epi32(sum, va, vb); +} + +template // 0..64 +static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + const int8_t *pEnd1 = pVect1 + dimension; + + __m512i sum = _mm512_setzero_epi32(); + + // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, + // so mask loading is guaranteed to be safe + if constexpr (residual % 32) { + __mmask32 mask = (1LU << (residual % 32)) - 1; + __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += residual % 32; + + __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += residual % 32; + + sum = _mm512_dpwssd_epi32(sum, va, vb); + } + + if constexpr (residual >= 32) { + InnerProductStep(pVect1, pVect2, sum); + } + + // We dealt with the residual part. We are left with some multiple of 64-int_8. + while (pVect1 < pEnd1) { + InnerProductStep(pVect1, pVect2, sum); + InnerProductStep(pVect1, pVect2, sum); + } + + return _mm512_reduce_add_epi32(sum); +} + +template // 0..64 +float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + + return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension); +} +template // 0..64 +float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + float ip = INT8_InnerProductImp(pVect1v, pVect2v, dimension); + float norm_v1 = + *reinterpret_cast(static_cast(pVect1v) + dimension); + float norm_v2 = + *reinterpret_cast(static_cast(pVect2v) + dimension); + return 1.0f - ip / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index e6da26947..e7129b2e8 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -16,6 +16,7 @@ #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" @@ -196,4 +197,59 @@ dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con return ret_dist_func; } +dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_InnerProduct; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. + if (dim < 32) { + return ret_dist_func; + } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. + return Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_Cosine; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. + if (dim < 32) { + return ret_dist_func; + } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + // For int8 vectors with cosine distance, the extra float for the norm shifts alignment to + // `(dim + sizeof(float)) % 32`. + // Vectors satisfying this have a residual, causing offset loads during calculation. + // To avoid complexity, we skip alignment here, assuming the performance impact is + // negligible. + return Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} } // namespace spaces diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index a3ab0f4f6..0d8c3a836 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -16,4 +16,8 @@ dist_func_t IP_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); +dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/L2/L2.cpp b/src/VecSim/spaces/L2/L2.cpp index 5fba0555e..ef310418b 100644 --- a/src/VecSim/spaces/L2/L2.cpp +++ b/src/VecSim/spaces/L2/L2.cpp @@ -70,3 +70,17 @@ float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension) { } return res; } + +float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + int res = 0; + for (size_t i = 0; i < dimension; i++) { + int16_t a = pVect1[i]; + int16_t b = pVect2[i]; + int16_t diff = a - b; + res += diff * diff; + } + return float(res); +} diff --git a/src/VecSim/spaces/L2/L2.h b/src/VecSim/spaces/L2/L2.h index c367f2ee1..65649d4eb 100644 --- a/src/VecSim/spaces/L2/L2.h +++ b/src/VecSim/spaces/L2/L2.h @@ -16,3 +16,5 @@ float BF16_L2Sqr_LittleEndian(const void *pVect1v, const void *pVect2v, size_t d float BF16_L2Sqr_BigEndian(const void *pVect1v, const void *pVect2v, size_t dimension); float FP16_L2Sqr(const void *pVect1, const void *pVect2, size_t dimension); + +float INT8_L2Sqr(const void *pVect1v, const void *pVect2v, size_t dimension); diff --git a/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h new file mode 100644 index 000000000..2c8b846af --- /dev/null +++ b/src/VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h @@ -0,0 +1,63 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" + +static inline void L2SqrStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &sum) { + __m256i temp_a = _mm256_loadu_epi8(pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += 32; + + __m256i temp_b = _mm256_loadu_epi8(pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += 32; + + __m512i diff = _mm512_sub_epi16(va, vb); + // _mm512_dpwssd_epi32(src, a, b) + // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding + // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results + // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. + sum = _mm512_dpwssd_epi32(sum, diff, diff); +} + +template // 0..64 +float INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v, + size_t dimension) { + int8_t *pVect1 = (int8_t *)pVect1v; + int8_t *pVect2 = (int8_t *)pVect2v; + + const int8_t *pEnd1 = pVect1 + dimension; + + __m512i sum = _mm512_setzero_epi32(); + + // Deal with remainder first. `dim` is more than 32, so we have at least one 32-int_8 block, + // so mask loading is guaranteed to be safe + if constexpr (residual % 32) { + constexpr __mmask32 mask = (1LU << (residual % 32)) - 1; + __m256i temp_a = _mm256_loadu_epi8(pVect1); + __m512i va = _mm512_cvtepi8_epi16(temp_a); + pVect1 += residual % 32; + + __m256i temp_b = _mm256_loadu_epi8(pVect2); + __m512i vb = _mm512_cvtepi8_epi16(temp_b); + pVect2 += residual % 32; + + __m512i diff = _mm512_maskz_sub_epi16(mask, va, vb); + sum = _mm512_dpwssd_epi32(sum, diff, diff); + } + + if constexpr (residual >= 32) { + L2SqrStep(pVect1, pVect2, sum); + } + + // We dealt with the residual part. We are left with some multiple of 64-int_8. + while (pVect1 < pEnd1) { + L2SqrStep(pVect1, pVect2, sum); + L2SqrStep(pVect1, pVect2, sum); + } + + return _mm512_reduce_add_epi32(sum); +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 1c2b2b59f..c0bec428f 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -15,6 +15,7 @@ #include "VecSim/spaces/functions/SSE.h" #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" @@ -189,4 +190,30 @@ dist_func_t L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con return ret_dist_func; } +dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = INT8_L2Sqr; + // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. + if (dim < 32) { + return ret_dist_func; + } +#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetX86Info().features + : *static_cast(arch_opt); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { + if (dim % 32 == 0) // no point in aligning if we have an offsetting residual + *alignment = 32 * sizeof(int8_t); // align to 256 bits. + return Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + } // namespace spaces diff --git a/src/VecSim/spaces/L2_space.h b/src/VecSim/spaces/L2_space.h index 4a2ea801a..48e50a8c2 100644 --- a/src/VecSim/spaces/L2_space.h +++ b/src/VecSim/spaces/L2_space.h @@ -16,4 +16,6 @@ dist_func_t L2_BF16_GetDistFunc(size_t dim, unsigned char *alignment = nu const void *arch_opt = nullptr); dist_func_t L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/computer/calculator.h b/src/VecSim/spaces/computer/calculator.h index 36e76deed..64e0d8dae 100644 --- a/src/VecSim/spaces/computer/calculator.h +++ b/src/VecSim/spaces/computer/calculator.h @@ -26,10 +26,10 @@ class IndexCalculatorInterface : public VecsimBaseObject { /** * This object purpose is to calculate the distance between two vectors. * It extends the IndexCalculatorInterface class' type to hold the distance function. - * Every specific implmentation of the distance claculater should hold by refrence or by value the + * Every specific implementation of the distance calculator should hold by reference or by value the * parameters required for the calculation. The distance calculation API of all DistanceCalculator * classes is: calc_dist(v1,v2,dim). Internally it calls the distance function according the - * template signature, allowing fexability in the distance function arguments. + * template signature, allowing flexibility in the distance function arguments. */ template class DistanceCalculatorInterface : public IndexCalculatorInterface { diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp new file mode 100644 index 000000000..661c2c945 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -0,0 +1,36 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "AVX512BW_VBMI2.h" + +#include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_INT8.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_L2SqrSIMD64_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, INT8_CosineSIMD64_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h new file mode 100644 index 000000000..532a33c76 --- /dev/null +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -0,0 +1,17 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/implementation_chooser.h b/src/VecSim/spaces/functions/implementation_chooser.h index 2903b8cc4..b32ad56c6 100644 --- a/src/VecSim/spaces/functions/implementation_chooser.h +++ b/src/VecSim/spaces/functions/implementation_chooser.h @@ -25,23 +25,28 @@ // of 4N, 4N+1, 4N+2, 4N+3. #define C4(X, func, N) X(4 * N, func) X(4 * N + 1, func) X(4 * N + 2, func) X(4 * N + 3, func) -// Macros for 8, 16 and 32 cases. Used to collapse the switch statement. Expands into 0-31, 0-15 or -// 0-7 cases. +// Macros for 8, 16, 32 and 64 cases. Used to collapse the switch statement. Expands into 0-63, +// 0-31, 0-15 or 0-7 cases. #define CASES32(X, func) \ C4(X, func, 0) \ C4(X, func, 1) \ C4(X, func, 2) C4(X, func, 3) C4(X, func, 4) C4(X, func, 5) C4(X, func, 6) C4(X, func, 7) #define CASES16(X, func) C4(X, func, 0) C4(X, func, 1) C4(X, func, 2) C4(X, func, 3) #define CASES8(X, func) C4(X, func, 0) C4(X, func, 1) +#define CASES64(X, func) \ + CASES32(X, func) \ + C4(X, func, 8) \ + C4(X, func, 9) \ + C4(X, func, 10) C4(X, func, 11) C4(X, func, 12) C4(X, func, 13) C4(X, func, 14) C4(X, func, 15) // Main macro. Expands into a switch statement that chooses the implementation based on the // dimension's remainder. // @params: // out: The output variable that will be set to the chosen implementation. // dim: The dimension. -// chunk: The chunk size. Can be 32, 16 or 8. 32 for 16-bit elements, 16 for 32-bit elements, 8 -// for 64-bit elements. func: The templated function that we want to choose the implementation -// for. +// chunk: The chunk size. Can be 64, 32, 16 or 8. 64 for 8-bit elements, 32 for 16-bit elements, +// 16 for 32-bit elements, 8 for 64-bit elements. func: The templated function that we want to +// choose the implementation for. #define CHOOSE_IMPLEMENTATION(out, dim, chunk, func) \ do { \ decltype(out) __ret_dist_func; \ diff --git a/src/VecSim/spaces/spaces.cpp b/src/VecSim/spaces/spaces.cpp index 84f71b463..4385b5e94 100644 --- a/src/VecSim/spaces/spaces.cpp +++ b/src/VecSim/spaces/spaces.cpp @@ -70,6 +70,20 @@ dist_func_t GetDistFunc(VecSimMetric metric, size_t dim, throw std::invalid_argument("Invalid metric"); } +template <> +dist_func_t GetDistFunc(VecSimMetric metric, size_t dim, + unsigned char *alignment) { + switch (metric) { + case VecSimMetric_Cosine: + return Cosine_INT8_GetDistFunc(dim, alignment); + case VecSimMetric_IP: + return IP_INT8_GetDistFunc(dim, alignment); + case VecSimMetric_L2: + return L2_INT8_GetDistFunc(dim, alignment); + } + throw std::invalid_argument("Invalid metric"); +} + template <> normalizeVector_f GetNormalizeFunc(void) { return normalizeVector_imp; diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index 4d25a5499..2fa066e82 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -31,7 +31,7 @@ endforeach() include(${root}/cmake/x86_64InstructionFlags.cmake) -set(DATA_TYPE fp32 fp64 bf16 fp16) +set(DATA_TYPE fp32 fp64 bf16 fp16 int8) foreach(data_type IN LISTS DATA_TYPE) add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp) target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index 11872e869..867077ede 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -13,6 +13,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo spaces_fp64 echo spaces_bf16 echo spaces_fp16 + echo spaces_int8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo basics_single_fp32 echo basics_multi_fp32 @@ -20,6 +21,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_fp64 echo spaces_bf16 echo spaces_fp16 + echo spaces_int8 # Basic benchmarks elif [ "$BM_TYPE" = "bm-basics-fp32-single" ] ; then echo basics_single_fp32 @@ -66,4 +68,15 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then echo spaces_fp16 echo spaces_fp64 echo spaces_bf16 + echo spaces_int8 +elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then + echo spaces_fp32 +elif [ "$BM_TYPE" = "bm-spaces-fp64" ] ; then + echo spaces_fp64 +elif [ "$BM_TYPE" = "bm-spaces-bf16" ] ; then + echo spaces_bf16 +elif [ "$BM_TYPE" = "bm-spaces-fp16" ] ; then + echo spaces_fp16 +elif [ "$BM_TYPE" = "bm-spaces-int8" ] ; then + echo spaces_int8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 3b55a9032..b7431c43c 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -19,6 +19,7 @@ #include "VecSim/spaces/functions/AVX.h" #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/SSE3.h" @@ -123,6 +124,12 @@ static constexpr size_t start = min_no_res_th_dim; INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); \ INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); -#define INITIALIZE_BENCHMARKS_SET(bm_class, type_prefix, arch, dim_opt, arch_supported) \ +#define INITIALIZE_BENCHMARKS_SET_Cosine(bm_class, type_prefix, arch, dim_opt, arch_supported) \ + INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, Cosine, arch_supported); \ + INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, Cosine, arch_supported); \ + INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported); \ + INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, Cosine, dim_opt, arch_supported); + +#define INITIALIZE_BENCHMARKS_SET_L2_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp index 8022c712a..27fe82a3d 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp @@ -26,20 +26,20 @@ INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_BF16, BF16, AVX512BF16_VL, 32, // AVX512 functions #ifdef OPT_AVX512_BW_VBMI2 bool avx512_bw_vbmi2_supported = opt.avx512bw && opt.avx512vbmi2; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32, - avx512_bw_vbmi2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX512BW_VBMI2, 32, + avx512_bw_vbmi2_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX2 bool avx2_supported = opt.avx2; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, AVX2, 32, avx2_supported); #endif // AVX // SSE functions #ifdef OPT_SSE3 bool sse3_supported = opt.sse3; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, SSE3, 32, sse3_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp index c9bc42b0b..9457bc77d 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp @@ -22,8 +22,8 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; class BM_VecSimSpaces_FP16_adv : public BM_VecSimSpaces<_Float16> {}; bool avx512fp16_vl_supported = opt.avx512_fp16 && opt.avx512vl; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32, - avx512fp16_vl_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16_adv, FP16, AVX512FP16_VL, 32, + avx512fp16_vl_supported); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, InnerProduct, 32); INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32); @@ -32,12 +32,12 @@ INITIALIZE_NAIVE_BM(BM_VecSimSpaces_FP16_adv, FP16, L2Sqr, 32); // OPT_AVX512F functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, AVX512F, 32, avx512f_supported); #endif // OPT_AVX512F // AVX functions #ifdef OPT_F16C bool avx512_bw_f16c_supported = opt.f16c && opt.fma3 && opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, F16C, 32, avx512_bw_f16c_supported); #endif // OPT_F16C #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp index 289e42405..106b2abc8 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp @@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512 functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX512F, 16, avx512f_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX bool avx_supported = opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, AVX, 16, avx_supported); #endif // AVX // SSE functions #ifdef OPT_SSE bool sse_supported = opt.sse; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, SSE, 16, sse_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp index 19157f03f..01052cebc 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp64.cpp @@ -13,19 +13,19 @@ cpu_features::X86Features opt = cpu_features::GetX86Info().features; // AVX512 functions #ifdef OPT_AVX512F bool avx512f_supported = opt.avx512f; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX512F, 8, avx512f_supported); #endif // AVX512F // AVX functions #ifdef OPT_AVX bool avx_supported = opt.avx; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, AVX, 8, avx_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, AVX, 8, avx_supported); #endif // AVX // SSE functions #ifdef OPT_SSE bool sse_supported = opt.sse; -INITIALIZE_BENCHMARKS_SET(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported); +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP64, FP64, SSE, 8, sse_supported); #endif // SSE #endif // x86_64 diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp new file mode 100644 index 000000000..0adde8972 --- /dev/null +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp @@ -0,0 +1,56 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ +#include +#include +#include +#include "utils/tests_utils.h" +#include "bm_spaces.h" + +class BM_VecSimSpaces_Integers_INT8 : public benchmark::Fixture { +protected: + std::mt19937 rng; + size_t dim; + int8_t *v1, *v2; + +public: + BM_VecSimSpaces_Integers_INT8() { rng.seed(47); } + ~BM_VecSimSpaces_Integers_INT8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + // Allocate vector with extra space for cosine calculations + v1 = new int8_t[dim + sizeof(float)]; + v2 = new int8_t[dim + sizeof(float)]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + // Store the norm in the extra space for cosine calculations + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); + } + void TearDown(const ::benchmark::State &state) { + delete v1; + delete v2; + } +}; + +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; + +// AVX512_F_BW_VL_VNNI functions +#ifdef OPT_AVX512_F_BW_VL_VNNI +bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, + avx512_f_bw_vl_vnni_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW_VL_VNNI, 32, + avx512_f_bw_vl_vnni_supported) +#endif // AVX512_F_BW_VL_VNNI + +#endif // x86_64 + + INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32); +BENCHMARK_MAIN(); diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b16bddac6..caa3fc522 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -30,15 +30,15 @@ endif() include(${root}/cmake/x86_64InstructionFlags.cmake) -add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp test_utils.cpp) -add_executable(test_hnsw_parallel test_hnsw_parallel.cpp test_utils.cpp) -add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp test_utils.cpp) -add_executable(test_allocator test_allocator.cpp test_utils.cpp) +add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp) +add_executable(test_hnsw_parallel test_hnsw_parallel.cpp unit_test_utils.cpp) +add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp unit_test_utils.cpp) +add_executable(test_allocator test_allocator.cpp unit_test_utils.cpp) add_executable(test_spaces test_spaces.cpp) add_executable(test_types test_types.cpp) -add_executable(test_common ../utils/mock_thread_pool.cpp test_utils.cpp test_common.cpp) -add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp test_utils.cpp) -add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp test_utils.cpp) +add_executable(test_common ../utils/mock_thread_pool.cpp unit_test_utils.cpp test_common.cpp) +add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp unit_test_utils.cpp) +add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp unit_test_utils.cpp) target_link_libraries(test_hnsw PUBLIC gtest_main VectorSimilarity) target_link_libraries(test_hnsw_parallel PUBLIC gtest_main VectorSimilarity) diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp index 4eb389260..03f0b4e52 100644 --- a/tests/unit/test_allocator.cpp +++ b/tests/unit/test_allocator.cpp @@ -10,7 +10,7 @@ #include "VecSim/memory/vecsim_base.h" #include "VecSim/algorithms/brute_force/brute_force_single.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "VecSim/index_factories/hnsw_factory.h" @@ -83,7 +83,7 @@ TEST_F(AllocatorTest, test_nested_object) { template class IndexAllocatorTest : public ::testing::Test {}; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(IndexAllocatorTest, DataTypeSet); diff --git a/tests/unit/test_bf16.cpp b/tests/unit/test_bf16.cpp index 921c80c35..95e12c98b 100644 --- a/tests/unit/test_bf16.cpp +++ b/tests/unit/test_bf16.cpp @@ -2,7 +2,7 @@ #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "mock_thread_pool.h" #include "VecSim/query_result_definitions.h" diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index c56415e3d..b3d5b1192 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/algorithms/brute_force/brute_force.h" #include "VecSim/algorithms/brute_force/brute_force_single.h" #include "cpu_features_macros.h" @@ -32,7 +32,7 @@ class BruteForceTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(BruteForceTest, DataTypeSet); diff --git a/tests/unit/test_bruteforce_multi.cpp b/tests/unit/test_bruteforce_multi.cpp index ef9cfc636..55aadedd4 100644 --- a/tests/unit/test_bruteforce_multi.cpp +++ b/tests/unit/test_bruteforce_multi.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/algorithms/brute_force/brute_force_multi.h" #include @@ -27,7 +27,7 @@ class BruteForceMultiTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(BruteForceMultiTest, DataTypeSet); diff --git a/tests/unit/test_common.cpp b/tests/unit/test_common.cpp index 58df46fba..bdfd6d9f2 100644 --- a/tests/unit/test_common.cpp +++ b/tests/unit/test_common.cpp @@ -10,7 +10,7 @@ #include "VecSim/query_result_definitions.h" #include "VecSim/utils/updatable_heap.h" #include "VecSim/utils/vec_utils.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/containers/vecsim_results_container.h" #include "VecSim/algorithms/hnsw/hnsw.h" #include "VecSim/index_factories/hnsw_factory.h" @@ -32,7 +32,7 @@ using float16 = vecsim_types::float16; template class CommonIndexTest : public ::testing::Test {}; -// DataTypeSet are defined in test_utils.h +// DataTypeSet are defined in unit_test_utils.h TYPED_TEST_SUITE(CommonIndexTest, DataTypeSet); diff --git a/tests/unit/test_fp16.cpp b/tests/unit/test_fp16.cpp index 377ef8f32..244bb9d0c 100644 --- a/tests/unit/test_fp16.cpp +++ b/tests/unit/test_fp16.cpp @@ -2,7 +2,7 @@ #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "mock_thread_pool.h" #include "VecSim/query_result_definitions.h" diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp index f57a6d3e3..cc400d48a 100644 --- a/tests/unit/test_hnsw.cpp +++ b/tests/unit/test_hnsw.cpp @@ -9,7 +9,7 @@ #include "VecSim/vec_sim_debug.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" #include "VecSim/index_factories/hnsw_factory.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/utils/serializer.h" #include "VecSim/query_result_definitions.h" #include @@ -36,7 +36,7 @@ class HNSWTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWTest, DataTypeSet); diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp index ba87f1759..026f96e62 100644 --- a/tests/unit/test_hnsw_multi.cpp +++ b/tests/unit/test_hnsw_multi.cpp @@ -6,7 +6,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/algorithms/hnsw/hnsw_multi.h" #include #include @@ -31,7 +31,7 @@ class HNSWMultiTest : public ::testing::Test { } }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWMultiTest, DataTypeSet); diff --git a/tests/unit/test_hnsw_parallel.cpp b/tests/unit/test_hnsw_parallel.cpp index a2d4827ca..0354a6af1 100644 --- a/tests/unit/test_hnsw_parallel.cpp +++ b/tests/unit/test_hnsw_parallel.cpp @@ -7,7 +7,7 @@ #include "gtest/gtest.h" #include "VecSim/vec_sim.h" #include "VecSim/algorithms/hnsw/hnsw_single.h" -#include "test_utils.h" +#include "unit_test_utils.h" #include "VecSim/query_result_definitions.h" #include "VecSim/vec_sim_debug.h" #include @@ -124,7 +124,7 @@ class HNSWTestParallel : public ::testing::Test { void parallelInsertSearch(bool is_multi); }; -// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in test_utils.h +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h TYPED_TEST_SUITE(HNSWTestParallel, DataTypeSet); diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp index ee5fef7d1..676424fd7 100644 --- a/tests/unit/test_hnsw_tiered.cpp +++ b/tests/unit/test_hnsw_tiered.cpp @@ -6,7 +6,7 @@ #include #include -#include "test_utils.h" +#include "unit_test_utils.h" #include "mock_thread_pool.h" #include diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 7cf7de92b..9931d318a 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -23,9 +23,11 @@ #include "VecSim/spaces/functions/AVX512BW_VBMI2.h" #include "VecSim/spaces/functions/AVX512BF16_VL.h" #include "VecSim/spaces/functions/AVX512FP16_VL.h" +#include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/F16C.h" +#include "tests_utils.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; @@ -102,6 +104,21 @@ TEST_F(SpacesTest, fp16_l2_no_optimization_func_test) { ASSERT_EQ(dist, FP32_L2Sqr((const void *)sanity_a, (const void *)sanity_b, dim)); } +TEST_F(SpacesTest, int8_l2_no_optimization_func_test) { + size_t dim = 5; + + int8_t a[dim], b[dim]; + for (size_t i = 0; i < dim; i++) { + a[i] = (i + 1); + b[i] = (i + 2); + } + + float dist = INT8_L2Sqr((const void *)a, (const void *)b, dim); + ASSERT_EQ(dist, 5.0); +} + +/* ======================== IP NO OPT ======================== */ + TEST_F(SpacesTest, float_ip_no_optimization_func_test) { size_t dim = 5; @@ -211,6 +228,36 @@ TEST_F(SpacesTest, fp16_ip_no_optimization_func_test) { ASSERT_EQ(dist, FP32_InnerProduct((const void *)sanity_a, (const void *)sanity_b, dim)); } +TEST_F(SpacesTest, int8_ip_no_optimization_func_test) { + size_t dim = 4; + int8_t a[] = {1, 0, 0, 0}; + int8_t b[] = {1, 0, 0, 0}; + + float dist = INT8_InnerProduct((const void *)a, (const void *)b, dim); + ASSERT_EQ(dist, 0.0); +} + +/* ======================== Cosine NO OPT ======================== */ + +TEST_F(SpacesTest, int8_Cosine_no_optimization_func_test) { + size_t dim = 4; + // create a vector with extra space for the norm + int8_t *v1 = new int8_t[dim + sizeof(float)]; + int8_t *v2 = new int8_t[dim + sizeof(float)]; + + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 123); + + // write the norm at the end of the vector + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); + + float dist = INT8_Cosine((const void *)v1, (const void *)v2, dim); + ASSERT_NEAR(dist, 0.0, 0.000001); +} + +/* ======================== Test Getters ======================== */ + TEST_F(SpacesTest, GetDistFuncInvalidMetricFP32) { EXPECT_THROW( (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), @@ -231,6 +278,11 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricFP16) { (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), std::invalid_argument); } +TEST_F(SpacesTest, GetDistFuncInvalidMetricINT8) { + EXPECT_THROW( + (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), + std::invalid_argument); +} using namespace spaces; @@ -241,27 +293,38 @@ TEST_F(SpacesTest, smallDimChooser) { ASSERT_EQ(L2_FP64_GetDistFunc(dim), FP64_L2Sqr); ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_FP64_GetDistFunc(dim), FP64_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); + ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine); } for (size_t dim = 8; dim < 16; dim++) { ASSERT_EQ(L2_FP32_GetDistFunc(dim), FP32_L2Sqr); ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_FP32_GetDistFunc(dim), FP32_InnerProduct); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); + ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine); } for (size_t dim = 16; dim < 32; dim++) { ASSERT_EQ(L2_BF16_GetDistFunc(dim), BF16_L2Sqr_LittleEndian); ASSERT_EQ(L2_FP16_GetDistFunc(dim), FP16_L2Sqr); + ASSERT_EQ(L2_INT8_GetDistFunc(dim), INT8_L2Sqr); ASSERT_EQ(IP_BF16_GetDistFunc(dim), BF16_InnerProduct_LittleEndian); ASSERT_EQ(IP_FP16_GetDistFunc(dim), FP16_InnerProduct); + ASSERT_EQ(IP_INT8_GetDistFunc(dim), INT8_InnerProduct); + ASSERT_EQ(Cosine_INT8_GetDistFunc(dim), INT8_Cosine); } } +/* ======================== Test SIMD Functions ======================== */ + // In this following tests we assume that compiler supports all X86 optimizations, so if we have // some hardware flag enabled, we check that the corresponding optimization function was chosen. #ifdef CPU_FEATURES_ARCH_X86_64 @@ -849,4 +912,118 @@ INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced, #endif +class INT8SpacesOptimizationTest : public testing::TestWithParam {}; + +TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + int8_t *v1 = new int8_t[dim]; + int8_t *v2 = new int8_t[dim]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_L2Sqr(v1, v2, dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_L2Sqr) << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + int8_t *v1 = new int8_t[dim]; + int8_t *v2 = new int8_t[dim]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + auto expected_alignment = [](size_t reg_bit_size, size_t dim) { + size_t elements_in_reg = reg_bit_size / sizeof(int8_t) / 8; + return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(int8_t) : 0; + }; + + dist_func_t arch_opt_func; + float baseline = INT8_InnerProduct(v1, v2, dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; + ASSERT_EQ(alignment, expected_alignment(256, dim)) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) { + auto optimization = cpu_features::GetX86Info().features; + size_t dim = GetParam(); + int8_t *v1 = new int8_t[dim + sizeof(float)]; + int8_t *v2 = new int8_t[dim + sizeof(float)]; + test_utils::populate_int8_vec(v1, dim, 123); + test_utils::populate_int8_vec(v2, dim, 1234); + + // write the norm at the end of the vector + *(float *)(v1 + dim) = test_utils::compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::compute_norm(v2, dim); + + dist_func_t arch_opt_func; + float baseline = INT8_Cosine(v1, v2, dim); +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "AVX512 with dim " << dim; + // We don't align int8 vectors with cosine distance + ASSERT_EQ(alignment, 0) << "AVX512 with dim " << dim; + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, INT8_Cosine) << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, + testing::Range(32UL, 32 * 2UL + 1)); + #endif // CPU_FEATURES_ARCH_X86_64 diff --git a/tests/unit/test_utils.cpp b/tests/unit/unit_test_utils.cpp similarity index 99% rename from tests/unit/test_utils.cpp rename to tests/unit/unit_test_utils.cpp index 7b99eba22..89973d19d 100644 --- a/tests/unit/test_utils.cpp +++ b/tests/unit/unit_test_utils.cpp @@ -4,7 +4,7 @@ *the Server Side Public License v1 (SSPLv1). */ -#include "test_utils.h" +#include "unit_test_utils.h" #include "gtest/gtest.h" #include "VecSim/utils/vec_utils.h" #include "VecSim/memory/vecsim_malloc.h" diff --git a/tests/unit/test_utils.h b/tests/unit/unit_test_utils.h similarity index 100% rename from tests/unit/test_utils.h rename to tests/unit/unit_test_utils.h diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h new file mode 100644 index 000000000..31dc3d9ef --- /dev/null +++ b/tests/utils/tests_utils.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include + +namespace test_utils { + +// Assuming v is a memory allocation of size dim * sizeof(float) +static void populate_int8_vec(int8_t *v, size_t dim, int seed = 1234) { + + std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed + + // uniform_int_distribution doesn't support int8, + // Define a distribution range for int8_t + std::uniform_int_distribution dis(-128, 127); + + for (size_t i = 0; i < dim; i++) { + v[i] = static_cast(dis(gen)); + } +} + +// TODO: replace with normalize function from VecSim +float compute_norm(const int8_t *vec, size_t dim) { + int norm = 0; + for (size_t i = 0; i < dim; i++) { + norm += vec[i] * vec[i]; + } + return sqrt(norm); +} + +} // namespace test_utils