Skip to content

Add SVE/SVE2 support for uint8 and int8 data type [MOD-9080] #619

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 218 commits into from
Apr 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
218 commits
Select commit Hold shift + click to select a range
1171d17
Add arm support
dor-forer Feb 19, 2025
8102ad1
Changed the arm cpu info
dor-forer Feb 20, 2025
0504e08
Add ip test
dor-forer Feb 20, 2025
ba931d0
Add to tests
dor-forer Feb 20, 2025
e0642c8
Added tests andbm
dor-forer Feb 20, 2025
4b8c347
fix tests
dor-forer Feb 20, 2025
3039eb8
Add github benchmakrs
dor-forer Feb 25, 2025
9a67ee8
Check 1
dor-forer Feb 25, 2025
a9b87d4
only arm
dor-forer Feb 25, 2025
da3c880
change ami
dor-forer Feb 25, 2025
1fdb6d5
Try ireland
dor-forer Feb 25, 2025
b4302e1
Try different image
dor-forer Feb 25, 2025
a83947a
try image
dor-forer Feb 25, 2025
a698070
back to old image
dor-forer Feb 25, 2025
730d8ac
larger image
dor-forer Feb 25, 2025
38371c5
Add option to change env
dor-forer Feb 25, 2025
202a89d
back to default region
dor-forer Feb 25, 2025
185703d
Created new image
dor-forer Feb 25, 2025
90e885c
Try to add the x86 to check
dor-forer Feb 25, 2025
d61c358
Try different machine
dor-forer Feb 25, 2025
4a88b1f
added include
dor-forer Feb 25, 2025
3ceadaa
Try without opti on arm
dor-forer Feb 25, 2025
e89762c
Change to c6g
dor-forer Feb 25, 2025
ba1ea86
added matrix region
dor-forer Feb 25, 2025
76b7132
change to west
dor-forer Feb 25, 2025
55bb40f
try the i8
dor-forer Feb 25, 2025
1b84ced
Try oregon
dor-forer Feb 25, 2025
3f98c27
Change subnet id
dor-forer Feb 25, 2025
66d96a1
Now subnet
dor-forer Feb 25, 2025
0c5f16c
Change subnet
dor-forer Feb 25, 2025
b2af693
add subnet
dor-forer Feb 25, 2025
20e596c
Try group id
dor-forer Feb 25, 2025
0682472
Change to vpc id
dor-forer Feb 26, 2025
9be3846
change subnet
dor-forer Feb 26, 2025
125e30b
Change ami
dor-forer Feb 26, 2025
6758753
Try without subnet
dor-forer Feb 26, 2025
2a37fb3
add security group again
dor-forer Feb 26, 2025
7d97821
Change the subnets
dor-forer Feb 26, 2025
97e7249
Change to ids
dor-forer Feb 26, 2025
4545554
Change sg
dor-forer Feb 26, 2025
3a443d3
psubnet
dor-forer Feb 26, 2025
a472150
Try different
dor-forer Feb 26, 2025
bee1c27
different
dor-forer Feb 26, 2025
4a891da
to a file
dor-forer Feb 26, 2025
0341dd7
print
dor-forer Feb 26, 2025
f8f424a
p
dor-forer Feb 26, 2025
ee0458a
leave empty
dor-forer Feb 26, 2025
26ff2cc
empty
dor-forer Feb 26, 2025
d3eaeeb
Try different account
dor-forer Feb 26, 2025
55bc653
Run 2 arm machines
dor-forer Feb 26, 2025
21de162
Move both to us-west-2
dor-forer Feb 26, 2025
6f8e4d4
Try workflow
dor-forer Feb 26, 2025
eedc25c
Change name
dor-forer Feb 26, 2025
578b88d
Changes
dor-forer Feb 26, 2025
41e920f
Change the secrets
dor-forer Feb 27, 2025
6218a9c
Add supprted arch
dor-forer Feb 27, 2025
1533ba7
Add defaults
dor-forer Feb 27, 2025
a86d7ac
Support all
dor-forer Feb 27, 2025
7652c9e
Change the jq
dor-forer Feb 27, 2025
c369125
Change machine to t4g
dor-forer Feb 27, 2025
9d9a047
Change the name
dor-forer Feb 27, 2025
14f8739
Change the machine
dor-forer Feb 27, 2025
2f119ec
fix the stop
dor-forer Feb 27, 2025
96d63af
only benchamrk
dor-forer Mar 2, 2025
305aa0b
add the secrets
dor-forer Mar 2, 2025
4e45109
region secret
dor-forer Mar 2, 2025
1b4649a
benchmark region
dor-forer Mar 2, 2025
797d1d6
Change timeout
dor-forer Mar 3, 2025
db9c63e
Added support for arch name in benchamrks
dor-forer Mar 9, 2025
106fc5e
change th json
dor-forer Mar 9, 2025
a0d62fb
changed to v9.0
dor-forer Mar 9, 2025
b8075b1
Change the check
dor-forer Mar 9, 2025
2007e33
add v9
dor-forer Mar 9, 2025
606cea7
Check alt version of armv9
dor-forer Mar 9, 2025
12bead0
added check
dor-forer Mar 9, 2025
976c366
add arc_arch
dor-forer Mar 9, 2025
8e23a2f
changed to CONCAT_WITH_UNDERSCORE_ARCH
dor-forer Mar 9, 2025
e81ce18
change the check
dor-forer Mar 9, 2025
f8f3d9e
Add full check
dor-forer Mar 9, 2025
f408017
fix the instruct
dor-forer Mar 10, 2025
0af63d8
Added the cmake
dor-forer Mar 10, 2025
38d563a
fix the support
dor-forer Mar 10, 2025
87ac845
put it back to cmake
dor-forer Mar 10, 2025
14bcd59
back
dor-forer Mar 10, 2025
b48d9c4
change the condition
dor-forer Mar 10, 2025
47b9724
No armpl for now
dor-forer Mar 10, 2025
1b35e30
cland format
dor-forer Mar 11, 2025
cafb30c
remove the opt
dor-forer Mar 11, 2025
bde60e4
Changed to one machine
dor-forer Mar 11, 2025
421715c
Added BENCHMARK_ARCH
dor-forer Mar 11, 2025
3c07da6
fix endif
dor-forer Mar 11, 2025
eabe27c
Remove secrets call
dor-forer Mar 12, 2025
7beb70b
pr changes
dor-forer Mar 12, 2025
66b37a6
Changes
dor-forer Mar 12, 2025
768636d
change to compile
dor-forer Mar 12, 2025
01a4f60
add sve
dor-forer Mar 13, 2025
287490f
add #endif
dor-forer Mar 13, 2025
570ab69
add armpl
dor-forer Mar 13, 2025
9ad8c1e
add to cmake
dor-forer Mar 13, 2025
0334e43
remove armpl
dor-forer Mar 13, 2025
15e7963
add install
dor-forer Mar 13, 2025
3750241
Add ARCH=$(uname -m)
dor-forer Mar 13, 2025
22596de
change the path to armpl
dor-forer Mar 13, 2025
69a2f24
suuport check for armv7
dor-forer Mar 13, 2025
f31c2a3
change the armpl
dor-forer Mar 13, 2025
fd6291e
Change or OR
dor-forer Mar 13, 2025
154b2a8
Merge branch 'dorer-add-arm-support' of https://github.com/RedisAI/Ve…
dor-forer Mar 13, 2025
877a70e
add neon supported for spaces
dor-forer Mar 16, 2025
c32ef14
add sve
dor-forer Mar 16, 2025
655a474
add support
dor-forer Mar 16, 2025
4cc47c3
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Mar 16, 2025
6ae4deb
align
dor-forer Mar 16, 2025
9b09210
format
dor-forer Mar 16, 2025
d3cb7ae
change error
dor-forer Mar 16, 2025
405931d
change
dor-forer Mar 16, 2025
ef9563f
Removed the ifdef
dor-forer Mar 16, 2025
220616b
Add comments
dor-forer Mar 16, 2025
52c5382
clang
dor-forer Mar 16, 2025
e31aa8a
Change names
dor-forer Mar 17, 2025
63ce083
format
dor-forer Mar 17, 2025
d19975a
Try fp32 neon simd
dor-forer Mar 18, 2025
552ce53
add l2
dor-forer Mar 18, 2025
0b12d21
add cmake
dor-forer Mar 18, 2025
0b7d92f
add SVE
dor-forer Mar 18, 2025
74b4285
fix sve l2
dor-forer Mar 19, 2025
c6317bc
PR changes
dor-forer Mar 19, 2025
45e8fdd
Change to 1
dor-forer Mar 19, 2025
f1487b8
fix the l2
dor-forer Mar 19, 2025
3d9be6b
Merge branch 'dorer-add-arm-opt' of https://github.com/RedisAI/Vector…
dor-forer Mar 19, 2025
8e097c8
fix format
dor-forer Mar 19, 2025
4aeca15
add desciriopn for chunk == 1
dor-forer Mar 19, 2025
76154bf
Change functions
dor-forer Mar 20, 2025
8f61a5c
Add include
dor-forer Mar 20, 2025
4a5c8ee
Change the cast
dor-forer Mar 23, 2025
0c42378
add resudual
dor-forer Mar 23, 2025
2cf10f1
formatting
dor-forer Mar 23, 2025
901cf36
Move th consexpt
dor-forer Mar 23, 2025
4489bf3
remove template armpl
dor-forer Mar 23, 2025
6fa2474
Back to armpl
dor-forer Mar 23, 2025
f2305dc
back to armpl_neon
dor-forer Mar 23, 2025
d567ab2
include
dor-forer Mar 23, 2025
192f8e6
armnpl
dor-forer Mar 23, 2025
1e7584f
Merge branch 'dorer-add-arm-opt' of https://github.com/RedisAI/Vector…
dor-forer Mar 23, 2025
601c968
add choose
dor-forer Mar 23, 2025
2cd084e
fix the residual div
dor-forer Mar 24, 2025
94ac845
raise the residuals values
dor-forer Mar 24, 2025
0229ac0
back to char
dor-forer Mar 24, 2025
1c8124e
Remove prefetch
dor-forer Mar 24, 2025
87feb67
Revert implemetion chooser
dor-forer Mar 24, 2025
5a1608c
Merge branch 'dorer-add-arm-opt' of https://github.com/RedisAI/Vector…
dor-forer Mar 24, 2025
1c233c3
Remove armpl
dor-forer Mar 24, 2025
67aa3fc
Revert remove error
dor-forer Mar 24, 2025
5ec219d
Remove comment
dor-forer Mar 24, 2025
57b580a
Merge branch 'dorer-add-arm-opt' of https://github.com/RedisAI/Vector…
dor-forer Mar 24, 2025
8882492
Remove empty line
dor-forer Mar 24, 2025
2dc295e
Merge branch 'dorer-add-arm-opt' of https://github.com/RedisAI/Vector…
dor-forer Mar 24, 2025
48e4417
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Mar 25, 2025
872c10e
format
dor-forer Mar 25, 2025
0310d16
Add support macos
dor-forer Mar 25, 2025
f8fb4d2
add sudo
dor-forer Mar 25, 2025
b72986a
Add absolute path
dor-forer Mar 25, 2025
eacebb2
find all libs
dor-forer Mar 25, 2025
ead05e9
Change folder
dor-forer Mar 25, 2025
e9d0d64
Now set for real
dor-forer Mar 25, 2025
4dbd798
Merge branch 'dorer-add-arm-opt' of https://github.com/RedisAI/Vector…
dor-forer Mar 25, 2025
2a21eba
Remove armpl from pull
dor-forer Mar 25, 2025
753fdcd
change the templates
dor-forer Mar 25, 2025
965651c
change chunk size to 1
dor-forer Mar 25, 2025
98fd7b0
Back to 4
dor-forer Mar 25, 2025
29688d9
Removed the for
dor-forer Mar 26, 2025
417a95c
Change to 2 sums
dor-forer Mar 26, 2025
d8b0913
SVE L2
lerman25 Mar 26, 2025
38ac404
Changed
dor-forer Mar 26, 2025
0db8794
Add get opt func
dor-forer Mar 27, 2025
c49e76f
Change the var name
dor-forer Mar 27, 2025
566f2ae
format
dor-forer Mar 27, 2025
bbd3d3c
Pr fixes
dor-forer Mar 27, 2025
ecd431f
PR
dor-forer Mar 27, 2025
3cbac35
SVE IP , SVE2 IP & L2
lerman25 Mar 27, 2025
9b33f08
UINT8 support, remove int8_ip_sve
lerman25 Mar 27, 2025
c93172a
format
lerman25 Mar 27, 2025
20b564a
pr
dor-forer Mar 30, 2025
f9bf3d8
pr fix
dor-forer Mar 30, 2025
b9b6ee5
bm_spaces
lerman25 Mar 30, 2025
26b51fc
PR
dor-forer Mar 31, 2025
eab2bed
added conversion
dor-forer Mar 31, 2025
acda3b0
small dim for intel only
dor-forer Mar 31, 2025
da65f88
Merge remote-tracking branch 'origin/dorer-add-arm-opt-fp32' into Ome…
lerman25 Mar 31, 2025
0d1cad1
Test smallDimChooser only for intel
dor-forer Mar 31, 2025
cbae32b
Merge remote-tracking branch 'origin/dorer-add-arm-opt-fp32' into Ome…
lerman25 Mar 31, 2025
8b9f369
align offset
lerman25 Mar 31, 2025
4da3c28
align const expression
lerman25 Mar 31, 2025
8403f99
align cpu features function
lerman25 Mar 31, 2025
de491a3
format
lerman25 Mar 31, 2025
9f7bc81
change to svadd_f32_x where possible
lerman25 Apr 3, 2025
95b7e72
change to _x where possible
lerman25 Apr 3, 2025
7feabe7
move low dim check to intel only
lerman25 Apr 3, 2025
8348275
Merge remote-tracking branch 'origin/main' into Omer_arm_int8_sve_sve2
lerman25 Apr 3, 2025
35a20e0
format
lerman25 Apr 3, 2025
17a7295
fix IP
lerman25 Apr 3, 2025
f795896
Optimize, convert on final step
lerman25 Apr 6, 2025
9cf34b2
format
lerman25 Apr 6, 2025
b8fb20b
chunking
lerman25 Apr 6, 2025
31e53af
Merge remote-tracking branch 'origin/main' into Omer_arm_int8_sve_sve2
lerman25 Apr 6, 2025
7abf634
change to inline
lerman25 Apr 6, 2025
1bfc615
format
lerman25 Apr 6, 2025
9ddd920
guy's comments
lerman25 Apr 8, 2025
988a19e
fix unit_test
lerman25 Apr 8, 2025
fe0cc0b
format
lerman25 Apr 8, 2025
69efdaa
reinterpet comment
lerman25 Apr 8, 2025
ec0b2b4
Merge remote-tracking branch 'origin/main' into Omer_arm_int8_sve_sve2
lerman25 Apr 8, 2025
edf7ce3
using dot
lerman25 Apr 8, 2025
59e2872
fix uint8
lerman25 Apr 9, 2025
a489bdb
SVE2 -> SVE
lerman25 Apr 9, 2025
98faaf8
for mat
lerman25 Apr 9, 2025
50e2011
fix comments
lerman25 Apr 9, 2025
db73b7b
format :(
lerman25 Apr 9, 2025
5d3631a
illegal
lerman25 Apr 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE_INT8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include <arm_sve.h>

// One dot-product step: multiply-accumulate one full SVE register's worth of
// int8 elements from each vector into `sum`, then advance `offset` by `chunk`.
inline void InnerProductStep(const int8_t *&pVect1, const int8_t *&pVect2, size_t &offset,
                             svint32_t &sum, const size_t chunk) {
    const svbool_t all_lanes = svptrue_b8();

    // Unpredicated full-vector loads of the next int8 chunk.
    const svint8_t lhs = svld1_s8(all_lanes, pVect1 + offset);
    const svint8_t rhs = svld1_s8(all_lanes, pVect2 + offset);

    // svdot widens groups of 4 int8 products into each int32 lane of `sum`.
    sum = svdot_s32(sum, lhs, rhs);

    // Advance to the next group of int8 elements.
    offset += chunk;
}

// Computes the raw int8 inner product of two vectors with SVE dot-product
// instructions, using four independent 32-bit accumulators to hide
// instruction latency.
//
// Template parameters (selected at setup time from the dimension):
//   partial_chunk    - true iff a predicated tail load is needed for a final
//                      partial vector (dimension not a multiple of vl).
//   additional_steps - number (0-3) of full SVE vectors left over after the
//                      main 4-vectors-per-iteration loop.
//
// pVect1v / pVect2v point to `dimension` int8 elements each.
// Returns the (un-normalized) inner product as a float.
template <bool partial_chunk, unsigned char additional_steps>
float INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const int8_t *pVect1 = reinterpret_cast<const int8_t *>(pVect1v);
    const int8_t *pVect2 = reinterpret_cast<const int8_t *>(pVect2v);

    size_t offset = 0;
    // svcntb() = number of 8-bit lanes in one SVE register (vector-length dependent).
    const size_t vl = svcntb();
    // The main loop consumes 4 SVE vectors per iteration (one per accumulator).
    const size_t chunk_size = 4 * vl;

    // Overflow analysis: svdot accumulates 4 int8*int8 products per int32 lane,
    // each of magnitude at most 2^7 * 2^7 = 2^14, so a single step adds at most
    // 2^16 per lane. An int32 accumulator therefore survives 2^15 steps, which
    // for 128-bit SVE means a dimension of (16 int8 per register) * (4
    // accumulators) * (2^15 steps) = 2^21.
    // We can safely assume that the dimension is smaller than that,
    // so using int32_t is safe.

    svint32_t sum0 = svdup_s32(0);
    svint32_t sum1 = svdup_s32(0);
    svint32_t sum2 = svdup_s32(0);
    svint32_t sum3 = svdup_s32(0);

    size_t num_chunks = dimension / chunk_size;

    // Main loop: one step per accumulator per iteration keeps the four
    // dot-product chains independent.
    for (size_t i = 0; i < num_chunks; ++i) {
        InnerProductStep(pVect1, pVect2, offset, sum0, vl);
        InnerProductStep(pVect1, pVect2, offset, sum1, vl);
        InnerProductStep(pVect1, pVect2, offset, sum2, vl);
        InnerProductStep(pVect1, pVect2, offset, sum3, vl);
    }

    // Process remaining complete SVE vectors that didn't fit into the main loop.
    // These are full vector operations (0-3 of them, known at compile time).
    if constexpr (additional_steps > 0) {
        if constexpr (additional_steps >= 1) {
            InnerProductStep(pVect1, pVect2, offset, sum0, vl);
        }
        if constexpr (additional_steps >= 2) {
            InnerProductStep(pVect1, pVect2, offset, sum1, vl);
        }
        if constexpr (additional_steps >= 3) {
            InnerProductStep(pVect1, pVect2, offset, sum2, vl);
        }
    }

    // Tail: predicated load of the final partial vector covering
    // elements [offset, dimension); inactive lanes are loaded as zero.
    if constexpr (partial_chunk) {
        svbool_t pg = svwhilelt_b8_u64(offset, dimension);

        svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load int8 vectors
        svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load int8 vectors

        sum3 = svdot_s32(sum3, v1_i8, v2_i8);

        // NOTE(review): these increments are dead — neither pointer is read
        // after this point.
        pVect1 += vl;
        pVect2 += vl;
    }

    // Pairwise-reduce the four accumulators...
    sum0 = svadd_s32_x(svptrue_b32(), sum0, sum1);
    sum2 = svadd_s32_x(svptrue_b32(), sum2, sum3);

    // ...then one more vector add and a horizontal sum across lanes.
    int32_t sum_all = svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sum0, sum2));

    return sum_all;
}

// Inner-product *distance*: 1 - <v1, v2>, following the library convention
// that a smaller value means more similar vectors.
template <bool partial_chunk, unsigned char additional_steps>
float INT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const float ip =
        INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
    return 1.0f - ip;
}

// Cosine distance: 1 - <v1, v2> / (||v1|| * ||v2||).
// Each vector's precomputed norm is stored as a float immediately after its
// `dimension` int8 elements.
template <bool partial_chunk, unsigned char additional_steps>
float INT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const int8_t *v1_bytes = static_cast<const int8_t *>(pVect1v);
    const int8_t *v2_bytes = static_cast<const int8_t *>(pVect2v);
    const float norm_v1 = *reinterpret_cast<const float *>(v1_bytes + dimension);
    const float norm_v2 = *reinterpret_cast<const float *>(v2_bytes + dimension);
    const float ip =
        INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
    return 1.0f - ip / (norm_v1 * norm_v2);
}
102 changes: 102 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE_UINT8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include <arm_sve.h>

// One dot-product step: multiply-accumulate one full SVE register's worth of
// uint8 elements from each vector into `sum`, then advance `offset` by `chunk`.
inline void InnerProductStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset,
                             svuint32_t &sum, const size_t chunk) {
    const svbool_t all_lanes = svptrue_b8();

    // Unpredicated full-vector loads of the next uint8 chunk.
    const svuint8_t lhs = svld1_u8(all_lanes, pVect1 + offset);
    const svuint8_t rhs = svld1_u8(all_lanes, pVect2 + offset);

    // svdot widens groups of 4 uint8 products into each uint32 lane of `sum`.
    sum = svdot_u32(sum, lhs, rhs);

    // Advance to the next group of uint8 elements.
    offset += chunk;
}

// Computes the raw uint8 inner product of two vectors with SVE dot-product
// instructions, using four independent 32-bit accumulators to hide
// instruction latency.
//
// Template parameters (selected at setup time from the dimension):
//   partial_chunk    - true iff a predicated tail load is needed for a final
//                      partial vector (dimension not a multiple of vl).
//   additional_steps - number (0-3) of full SVE vectors left over after the
//                      main 4-vectors-per-iteration loop.
//
// pVect1v / pVect2v point to `dimension` uint8 elements each.
// Returns the (un-normalized) inner product as a float.
template <bool partial_chunk, unsigned char additional_steps>
float UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const uint8_t *pVect1 = reinterpret_cast<const uint8_t *>(pVect1v);
    const uint8_t *pVect2 = reinterpret_cast<const uint8_t *>(pVect2v);

    size_t offset = 0;
    // svcntb() = number of 8-bit lanes in one SVE register (vector-length dependent).
    const size_t vl = svcntb();
    // The main loop consumes 4 SVE vectors per iteration (one per accumulator).
    const size_t chunk_size = 4 * vl;

    // Overflow analysis: svdot accumulates 4 uint8*uint8 products per uint32
    // lane, each at most 255 * 255 < 2^16, so a single step adds less than
    // 2^18 per lane. A uint32 accumulator therefore survives at least 2^14
    // steps, which for 128-bit SVE means a dimension of (16 uint8 per
    // register) * (4 accumulators) * (2^14 steps) = 2^20.
    // We can safely assume that the dimension is smaller than that,
    // so 32-bit unsigned accumulation is safe.

    svuint32_t sum0 = svdup_u32(0);
    svuint32_t sum1 = svdup_u32(0);
    svuint32_t sum2 = svdup_u32(0);
    svuint32_t sum3 = svdup_u32(0);

    size_t num_chunks = dimension / chunk_size;

    // Main loop: one step per accumulator per iteration keeps the four
    // dot-product chains independent.
    for (size_t i = 0; i < num_chunks; ++i) {
        InnerProductStep(pVect1, pVect2, offset, sum0, vl);
        InnerProductStep(pVect1, pVect2, offset, sum1, vl);
        InnerProductStep(pVect1, pVect2, offset, sum2, vl);
        InnerProductStep(pVect1, pVect2, offset, sum3, vl);
    }

    // Process remaining complete SVE vectors that didn't fit into the main loop.
    // These are full vector operations (0-3 of them, known at compile time).
    if constexpr (additional_steps > 0) {
        if constexpr (additional_steps >= 1) {
            InnerProductStep(pVect1, pVect2, offset, sum0, vl);
        }
        if constexpr (additional_steps >= 2) {
            InnerProductStep(pVect1, pVect2, offset, sum1, vl);
        }
        if constexpr (additional_steps >= 3) {
            InnerProductStep(pVect1, pVect2, offset, sum2, vl);
        }
    }

    // Tail: predicated load of the final partial vector covering
    // elements [offset, dimension); inactive lanes are loaded as zero.
    if constexpr (partial_chunk) {
        svbool_t pg = svwhilelt_b8_u64(offset, dimension);

        svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset);
        svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset);

        sum3 = svdot_u32(sum3, v1_ui8, v2_ui8);
        // (The original advanced pVect1/pVect2 here; those increments were
        // dead — neither pointer is read past this point — and were removed.)
    }

    // Pairwise-reduce the four accumulators...
    sum0 = svadd_u32_x(svptrue_b32(), sum0, sum1);
    sum2 = svadd_u32_x(svptrue_b32(), sum2, sum3);

    // ...then one more vector add and a horizontal sum across lanes.
    // svaddv_u32 returns uint32_t; keep the result unsigned (the original
    // stored it in int32_t, an out-of-range signed conversion for sums
    // above 2^31 - 1) and let it widen to float on return.
    uint32_t sum_all = svaddv_u32(svptrue_b32(), svadd_u32_x(svptrue_b32(), sum0, sum2));

    return sum_all;
}

// Inner-product *distance*: 1 - <v1, v2>, following the library convention
// that a smaller value means more similar vectors.
template <bool partial_chunk, unsigned char additional_steps>
float UINT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const float ip =
        UINT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
    return 1.0f - ip;
}

// Cosine distance: 1 - <v1, v2> / (||v1|| * ||v2||).
// Each vector's precomputed norm is stored as a float immediately after its
// `dimension` uint8 elements.
template <bool partial_chunk, unsigned char additional_steps>
float UINT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const uint8_t *v1_bytes = static_cast<const uint8_t *>(pVect1v);
    const uint8_t *v2_bytes = static_cast<const uint8_t *>(pVect2v);
    const float norm_v1 = *reinterpret_cast<const float *>(v1_bytes + dimension);
    const float norm_v2 = *reinterpret_cast<const float *>(v2_bytes + dimension);
    const float ip =
        UINT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
    return 1.0f - ip / (norm_v1 * norm_v2);
}
73 changes: 65 additions & 8 deletions src/VecSim/spaces/IP_space.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,12 +276,27 @@ dist_func_t<float> IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con
}

dist_func_t<float> ret_dist_func = INT8_InnerProduct;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_INT8_IP_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_INT8_IP_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}
#ifdef CPU_FEATURES_ARCH_X86_64
auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
Expand All @@ -301,12 +316,26 @@ dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
}

dist_func_t<float> ret_dist_func = INT8_Cosine;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_INT8_Cosine_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_INT8_Cosine_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}
#ifdef CPU_FEATURES_ARCH_X86_64
auto features = getCpuOptimizationFeatures(arch_opt);
#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
// For int8 vectors with cosine distance, the extra float for the norm shifts alignment to
Expand All @@ -329,12 +358,26 @@ dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment,
}

dist_func_t<float> ret_dist_func = UINT8_InnerProduct;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_UINT8_IP_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_UINT8_IP_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}
#ifdef CPU_FEATURES_ARCH_X86_64
auto features = getCpuOptimizationFeatures(arch_opt);
#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
Expand All @@ -354,12 +397,26 @@ dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment
}

dist_func_t<float> ret_dist_func = UINT8_Cosine;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_UINT8_Cosine_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_UINT8_Cosine_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}
#ifdef CPU_FEATURES_ARCH_X86_64
auto features = getCpuOptimizationFeatures(arch_opt);
#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
// For uint8 vectors with cosine distance, the extra float for the norm shifts alignment to
Expand Down
Loading
Loading