Skip to content

[MOD-8244] fix TieredFactory::EstimateInitialSize #567

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/VecSim/index_factories/brute_force_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,24 +98,24 @@ inline size_t EstimateInitialSize_ChooseMultiOrSingle(bool is_multi) {
return sizeof(BruteForceIndex_Single<DataType, DistType>);
}

size_t EstimateInitialSize(const BFParams *params) {
size_t EstimateInitialSize(const BFParams *params, bool is_normalized) {

size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();

// Constant part (not effected by parameters).
size_t est = sizeof(VecSimAllocator) + allocations_overhead;

if (params->type == VecSimType_FLOAT32) {
est += EstimateComponentsMemory<float, float>(params->metric);
est += EstimateComponentsMemory<float, float>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<float>(params->multi);
} else if (params->type == VecSimType_FLOAT64) {
est += EstimateComponentsMemory<double, double>(params->metric);
est += EstimateComponentsMemory<double, double>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<double>(params->multi);
} else if (params->type == VecSimType_BFLOAT16) {
est += EstimateComponentsMemory<bfloat16, float>(params->metric);
est += EstimateComponentsMemory<bfloat16, float>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<bfloat16, float>(params->multi);
} else if (params->type == VecSimType_FLOAT16) {
est += EstimateComponentsMemory<float16, float>(params->metric);
est += EstimateComponentsMemory<float16, float>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<float16, float>(params->multi);
}

Expand Down
2 changes: 1 addition & 1 deletion src/VecSim/index_factories/brute_force_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ VecSimIndex *NewIndex(const VecSimParams *params, bool is_normalized = false);
VecSimIndex *NewIndex(const BFParams *bfparams, bool is_normalized = false);
VecSimIndex *NewIndex(const BFParams *bfparams, const AbstractIndexInitParams &abstractInitParams,
bool is_normalized);
size_t EstimateInitialSize(const BFParams *params);
size_t EstimateInitialSize(const BFParams *params, bool is_normalized = false);
size_t EstimateElementSize(const BFParams *params);

}; // namespace BruteForceFactory
4 changes: 2 additions & 2 deletions src/VecSim/index_factories/components/components_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ CreateIndexComponents(std::shared_ptr<VecSimAllocator> allocator, VecSimMetric m
}

template <typename DataType, typename DistType>
size_t EstimateComponentsMemory(VecSimMetric metric) {
size_t EstimateComponentsMemory(VecSimMetric metric, bool is_normalized) {
size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();

// Currently we have only one distance calculator implementation
size_t est = allocations_overhead + sizeof(DistanceCalculatorCommon<DistType>);

est += EstimatePreprocessorsContainerMemory<DataType>(metric);
est += EstimatePreprocessorsContainerMemory<DataType>(metric, is_normalized);

return est;
}
10 changes: 8 additions & 2 deletions src/VecSim/index_factories/components/preprocessors_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,16 @@ CreatePreprocessorsContainer(std::shared_ptr<VecSimAllocator> allocator,
}

template <typename DataType>
size_t EstimatePreprocessorsContainerMemory(VecSimMetric metric) {
size_t EstimatePreprocessorsContainerMemory(VecSimMetric metric, bool is_normalized = false) {
size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();
VecSimMetric pp_metric;
if (is_normalized && metric == VecSimMetric_Cosine) {
pp_metric = VecSimMetric_IP;
} else {
pp_metric = metric;
}

if (metric == VecSimMetric_Cosine) {
if (pp_metric == VecSimMetric_Cosine) {
constexpr size_t n_preprocessors = 1;
// One entry in preprocessors array
size_t est =
Expand Down
10 changes: 5 additions & 5 deletions src/VecSim/index_factories/hnsw_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,21 +98,21 @@ inline size_t EstimateInitialSize_ChooseMultiOrSingle(bool is_multi) {
return sizeof(HNSWIndex_Single<DataType, DistType>);
}

size_t EstimateInitialSize(const HNSWParams *params) {
size_t EstimateInitialSize(const HNSWParams *params, bool is_normalized) {
size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();

size_t est = sizeof(VecSimAllocator) + allocations_overhead;
if (params->type == VecSimType_FLOAT32) {
est += EstimateComponentsMemory<float, float>(params->metric);
est += EstimateComponentsMemory<float, float>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<float>(params->multi);
} else if (params->type == VecSimType_FLOAT64) {
est += EstimateComponentsMemory<double, double>(params->metric);
est += EstimateComponentsMemory<double, double>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<double>(params->multi);
} else if (params->type == VecSimType_BFLOAT16) {
est += EstimateComponentsMemory<bfloat16, float>(params->metric);
est += EstimateComponentsMemory<bfloat16, float>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<bfloat16, float>(params->multi);
} else if (params->type == VecSimType_FLOAT16) {
est += EstimateComponentsMemory<float16, float>(params->metric);
est += EstimateComponentsMemory<float16, float>(params->metric, is_normalized);
est += EstimateInitialSize_ChooseMultiOrSingle<float16, float>(params->multi);
}
return est;
Expand Down
2 changes: 1 addition & 1 deletion src/VecSim/index_factories/hnsw_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace HNSWFactory {
*/
VecSimIndex *NewIndex(const VecSimParams *params, bool is_normalized = false);
VecSimIndex *NewIndex(const HNSWParams *params, bool is_normalized = false);
size_t EstimateInitialSize(const HNSWParams *params);
size_t EstimateInitialSize(const HNSWParams *params, bool is_normalized = false);
size_t EstimateElementSize(const HNSWParams *params);

#ifdef BUILD_TESTS
Expand Down
30 changes: 19 additions & 11 deletions src/VecSim/index_factories/tiered_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,18 @@ using float16 = vecsim_types::float16;
namespace TieredFactory {

namespace TieredHNSWFactory {

static inline BFParams NewBFParams(const TieredIndexParams *params) {
auto hnsw_params = params->primaryIndexParams->algoParams.hnswParams;
BFParams bf_params = {.type = hnsw_params.type,
.dim = hnsw_params.dim,
.metric = hnsw_params.metric,
.multi = hnsw_params.multi,
.blockSize = hnsw_params.blockSize};

return bf_params;
}

template <typename DataType, typename DistType = DataType>
inline VecSimIndex *NewIndex(const TieredIndexParams *params) {

Expand All @@ -27,11 +39,7 @@ inline VecSimIndex *NewIndex(const TieredIndexParams *params) {
HNSWFactory::NewIndex(params->primaryIndexParams, true));
// initialize brute force index

BFParams bf_params = {.type = params->primaryIndexParams->algoParams.hnswParams.type,
.dim = params->primaryIndexParams->algoParams.hnswParams.dim,
.metric = params->primaryIndexParams->algoParams.hnswParams.metric,
.multi = params->primaryIndexParams->algoParams.hnswParams.multi,
.blockSize = params->primaryIndexParams->algoParams.hnswParams.blockSize};
BFParams bf_params = NewBFParams(params);

std::shared_ptr<VecSimAllocator> flat_allocator = VecSimAllocator::newVecsimAllocator();
AbstractIndexInitParams abstractInitParams = {.allocator = flat_allocator,
Expand All @@ -52,11 +60,12 @@ inline VecSimIndex *NewIndex(const TieredIndexParams *params) {
hnsw_index, frontendIndex, *params, management_layer_allocator);
}

inline size_t EstimateInitialSize(const TieredIndexParams *params, BFParams &bf_params_output) {
inline size_t EstimateInitialSize(const TieredIndexParams *params) {
HNSWParams hnsw_params = params->primaryIndexParams->algoParams.hnswParams;

// Add size estimation of VecSimTieredIndex sub indexes.
size_t est = HNSWFactory::EstimateInitialSize(&hnsw_params);
// Normalization is done by the frontend index.
size_t est = HNSWFactory::EstimateInitialSize(&hnsw_params, true);

// Management layer allocator overhead.
size_t allocations_overhead = VecSimAllocator::getAllocationOverheadSize();
Expand All @@ -72,8 +81,6 @@ inline size_t EstimateInitialSize(const TieredIndexParams *params, BFParams &bf_
} else if (hnsw_params.type == VecSimType_FLOAT16) {
est += sizeof(TieredHNSWIndex<float16, float>);
}
bf_params_output.type = hnsw_params.type;
bf_params_output.multi = hnsw_params.multi;

return est;
}
Expand Down Expand Up @@ -107,10 +114,11 @@ size_t EstimateInitialSize(const TieredIndexParams *params) {

BFParams bf_params{};
if (params->primaryIndexParams->algo == VecSimAlgo_HNSWLIB) {
est += TieredHNSWFactory::EstimateInitialSize(params, bf_params);
est += TieredHNSWFactory::EstimateInitialSize(params);
bf_params = TieredHNSWFactory::NewBFParams(params);
}

est += BruteForceFactory::EstimateInitialSize(&bf_params);
est += BruteForceFactory::EstimateInitialSize(&bf_params, false);
return est;
}

Expand Down
6 changes: 6 additions & 0 deletions src/VecSim/spaces/computer/preprocessor_container.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ class MultiPreprocessorsContainer : public PreprocessorsContainerAbstract {

void preprocessQueryInPlace(void *blob, size_t processed_bytes_count) const override;

#ifdef BUILD_TESTS
std::array<PreprocessorInterface *, n_preprocessors> getPreprocessors() const {
return preprocessors;
}
#endif

private:
using Base = PreprocessorsContainerAbstract;
};
Expand Down
4 changes: 4 additions & 0 deletions src/VecSim/vec_sim_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,10 @@ struct VecSimIndexAbstract : public VecSimIndexInterface {
delete this->preprocessors;
this->preprocessors = newPPContainer;
}

IndexComponents<DataType, DistType> get_components() const {
return {.indexCalculator = this->indexCalculator, .preprocessors = this->preprocessors};
}
#endif

protected:
Expand Down
58 changes: 57 additions & 1 deletion tests/unit/test_hnsw_tiered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,62 @@ TYPED_TEST(HNSWTieredIndexTest, CreateIndexInstance) {
ASSERT_EQ(tiered_index->labelToInsertJobs.at(vector_label).size(), 0);
}

TYPED_TEST(HNSWTieredIndexTest, testIndexesAttributes) {
// Create TieredHNSW index instance with a mock queue.
HNSWParams params = {.type = TypeParam::get_index_type(),
.dim = 4,
.metric = VecSimMetric_Cosine,
.multi = TypeParam::isMulti()};
VecSimParams hnsw_params = CreateParams(params);
auto mock_thread_pool = tieredIndexMock();
auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, mock_thread_pool);

HNSWIndex<TEST_DATA_T, TEST_DIST_T> *hnsw_index = this->CastToHNSW(tiered_index);
BruteForceIndex<TEST_DATA_T, TEST_DIST_T> *bf_index = this->GetFlatIndex(tiered_index);

VecSimIndexBasicInfo hnsw_info = VecSimIndex_Info(hnsw_index).commonInfo.basicInfo;
VecSimIndexBasicInfo bf_info = VecSimIndex_Info(bf_index).commonInfo.basicInfo;

// assert metric
ASSERT_EQ(hnsw_info.metric, params.metric);
ASSERT_EQ(bf_info.metric, params.metric);
// assert type
ASSERT_EQ(hnsw_info.type, params.type);
ASSERT_EQ(bf_info.type, params.type);
// assert dim
ASSERT_EQ(hnsw_info.dim, params.dim);
ASSERT_EQ(bf_info.dim, params.dim);
// assert multi
ASSERT_EQ(hnsw_info.isMulti, params.multi);
ASSERT_EQ(bf_info.isMulti, params.multi);

// assert containers

// bf - multi with cosine
IndexComponents<TEST_DATA_T, TEST_DIST_T> bf_components = bf_index->get_components();
PreprocessorsContainerAbstract *bf_preprocessors = bf_components.preprocessors;
const std::type_info &bf_pp_container_expected_type =
typeid(MultiPreprocessorsContainer<TEST_DATA_T, 1>);
const std::type_info &bf_pp_container_actual_type = typeid(*bf_preprocessors);
ASSERT_EQ(bf_pp_container_actual_type, bf_pp_container_expected_type);

// CosinePreprocessor<TEST_DATA_T> cosine_pp =
// dynamic_cast<MultiPreprocessorsContainer<TEST_DATA_T, 1>
// *>(bf_preprocessors)->preprocessors[0];
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

std::array<PreprocessorInterface *, 1> pp_arr =
dynamic_cast<MultiPreprocessorsContainer<TEST_DATA_T, 1> *>(bf_preprocessors)
->getPreprocessors();
const std::type_info &bf_pp_expected_type = typeid(CosinePreprocessor<TEST_DATA_T>);
const std::type_info &bf_pp_actual_type = typeid(*pp_arr[0]);
ASSERT_EQ(bf_pp_actual_type, bf_pp_expected_type);

// hnsw - simple
IndexComponents<TEST_DATA_T, TEST_DIST_T> hnsw_components = hnsw_index->get_components();
const std::type_info &hnsw_pp_expected_type = typeid(PreprocessorsContainerAbstract);
const std::type_info &hnsw_pp_actual_type = typeid(*hnsw_components.preprocessors);
ASSERT_EQ(hnsw_pp_expected_type, hnsw_pp_actual_type);
}

TYPED_TEST(HNSWTieredIndexTest, testSizeEstimation) {
size_t dim = 128;
size_t n = DEFAULT_BLOCK_SIZE;
Expand All @@ -129,7 +185,7 @@ TYPED_TEST(HNSWTieredIndexTest, testSizeEstimation) {

HNSWParams hnsw_params = {.type = TypeParam::get_index_type(),
.dim = dim,
.metric = VecSimMetric_L2,
.metric = VecSimMetric_Cosine,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does changing the metric in this test enough to cover all the PR changes?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It covers the second fix.
I'll add a test that validates the attributes of the tiered index indexes to validate the first.

Copy link
Collaborator Author

@meiravgri meiravgri Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, it was not a problem in the actual indexes, only in the estimation function
I don't have a way to test it.
I already added a test for the components, and it won't harm to have one so I'm keeping it

.multi = isMulti,
.M = M};
VecSimParams vecsim_hnsw_params = CreateParams(hnsw_params);
Expand Down
Loading