Skip to content

Raw vectors data layer in HNSW + move to base class [MOD-7496] #523

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Dec 19, 2024
Merged
25 changes: 12 additions & 13 deletions src/VecSim/algorithms/brute_force/brute_force.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ template <typename DataType, typename DistType>
class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
protected:
vecsim_stl::vector<labelType> idToLabelMapping;
RawDataContainer *vectors;
idType count;

public:
Expand All @@ -41,7 +40,9 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
size_t indexSize() const override;
size_t indexCapacity() const override;
std::unique_ptr<RawDataContainer::Iterator> getVectorsIterator() const;
DataType *getDataByInternalId(idType id) const { return (DataType *)vectors->getElement(id); }
DataType *getDataByInternalId(idType id) const {
return (DataType *)this->vectors->getElement(id);
}
VecSimQueryReply *topKQuery(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const override;
VecSimQueryReply *rangeQuery(const void *queryBlob, double radius,
Expand All @@ -54,7 +55,7 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override;
labelType getVectorLabel(idType id) const { return idToLabelMapping.at(id); }

const RawDataContainer *getVectorsContainer() const { return vectors; }
const RawDataContainer *getVectorsContainer() const { return this->vectors; }

const labelType getLabelByInternalId(idType internal_id) const {
return idToLabelMapping.at(internal_id);
Expand All @@ -71,7 +72,7 @@ class BruteForceIndex : public VecSimIndexAbstract<DataType, DistType> {
// without duplicates in tiered index). Caller should hold the flat buffer lock for read.
virtual vecsim_stl::set<labelType> getLabelsSet() const = 0;

virtual ~BruteForceIndex() { delete vectors; }
virtual ~BruteForceIndex() = default;
#ifdef BUILD_TESTS
/**
* @brief Used for testing - store vector(s) data associated with a given label. This function
Expand Down Expand Up @@ -147,8 +148,6 @@ BruteForceIndex<DataType, DistType>::BruteForceIndex(
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams, components),
idToLabelMapping(this->allocator), count(0) {
assert(VecSimType_sizeof(this->vecType) == sizeof(DataType));
vectors = new (this->allocator)
DataBlocksContainer(this->blockSize, this->dataSize, this->allocator, this->alignment);
}

/******************** Implementation **************/
Expand All @@ -164,7 +163,7 @@ void BruteForceIndex<DataType, DistType>::appendVector(const void *vector_data,
growByBlock();
}
// add vector data to vector raw data container
vectors->addElement(processed_blob.get(), id);
this->vectors->addElement(processed_blob.get(), id);

// add label to idToLabelMapping
setVectorLabel(id, label);
Expand Down Expand Up @@ -193,10 +192,10 @@ void BruteForceIndex<DataType, DistType>::removeVector(idType id_to_delete) {
replaceIdOfLabel(last_idx_label, id_to_delete, last_idx);

// Put data of last vector inplace of the deleted vector.
const char *last_vector_data = vectors->getElement(last_idx);
vectors->updateElement(id_to_delete, last_vector_data);
const char *last_vector_data = this->vectors->getElement(last_idx);
this->vectors->updateElement(id_to_delete, last_vector_data);
}
vectors->removeElement(last_idx);
this->vectors->removeElement(last_idx);

// If we reached to a multiply of a block size, we can reduce meta data structures size.
if (this->count % this->blockSize == 0) {
Expand All @@ -217,7 +216,7 @@ size_t BruteForceIndex<DataType, DistType>::indexCapacity() const {
template <typename DataType, typename DistType>
std::unique_ptr<RawDataContainer::Iterator>
BruteForceIndex<DataType, DistType>::getVectorsIterator() const {
return vectors->getIterator();
return this->vectors->getIterator();
}

template <typename DataType, typename DistType>
Expand All @@ -240,7 +239,7 @@ BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
getNewMaxPriorityQueue();

// For vector, compute its scores and update the Top candidates max heap
auto vectors_it = vectors->getIterator();
auto vectors_it = this->vectors->getIterator();
idType curr_id = 0;
while (auto *vector = vectors_it->next()) {
if (VECSIM_TIMEOUT(timeoutCtx)) {
Expand Down Expand Up @@ -285,7 +284,7 @@ BruteForceIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double ra
getNewResultsContainer(10); // Use 10 as the initial capacity for the dynamic array.

DistType radius_ = DistType(radius);
auto vectors_it = vectors->getIterator();
auto vectors_it = this->vectors->getIterator();
idType curr_id = 0;
const void *processed_query = processed_query_ptr.get();
while (vectors_it->hasNext()) {
Expand Down
38 changes: 11 additions & 27 deletions src/VecSim/algorithms/hnsw/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include "VecSim/utils/vecsim_stl.h"
#include "VecSim/utils/vec_utils.h"
#include "VecSim/containers/data_block.h"
#include "VecSim/containers/raw_data_container_interface.h"
#include "VecSim/containers/data_blocks_container.h"
#include "VecSim/containers/vecsim_results_container.h"
#include "VecSim/query_result_definitions.h"
#include "VecSim/vec_sim_common.h"
Expand Down Expand Up @@ -110,7 +112,6 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
size_t maxLevel; // this is the top level of the entry point's element

// Index data
vecsim_stl::vector<DataBlock> vectorBlocks;
vecsim_stl::vector<DataBlock> graphDataBlocks;
vecsim_stl::vector<ElementMetaData> idToMetaData;

Expand Down Expand Up @@ -182,7 +183,7 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
void replaceEntryPoint();

void SwapLastIdWithDeletedId(idType element_internal_id, ElementGraphData *last_element,
void *last_element_data);
const void *last_element_data);

/** Add vector functions */
// Protected internal function that implements generic single vector insertion.
Expand Down Expand Up @@ -384,7 +385,7 @@ labelType HNSWIndex<DataType, DistType>::getEntryPointLabel() const {

template <typename DataType, typename DistType>
const char *HNSWIndex<DataType, DistType>::getDataByInternalId(idType internal_id) const {
return vectorBlocks[internal_id / this->blockSize].getElement(internal_id % this->blockSize);
return this->vectors->getElement(internal_id);
}

template <typename DataType, typename DistType>
Expand Down Expand Up @@ -1130,7 +1131,7 @@ void HNSWIndex<DataType, DistType>::replaceEntryPoint() {
template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_internal_id,
ElementGraphData *last_element,
void *last_element_data) {
const void *last_element_data) {
// Swap label - this is relevant when the last element's label exists (it is not marked as
// deleted).
if (!isMarkedDeleted(curElementCount)) {
Expand Down Expand Up @@ -1305,12 +1306,6 @@ void HNSWIndex<DataType, DistType>::resizeIndexCommon(size_t new_max_elements) {
template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::growByBlock() {
size_t new_max_elements = maxElements + this->blockSize;

// Validations
assert(vectorBlocks.size() == graphDataBlocks.size());
assert(vectorBlocks.empty() || vectorBlocks.back().getLength() == this->blockSize);

vectorBlocks.emplace_back(this->blockSize, this->dataSize, this->allocator, this->alignment);
graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize, this->allocator);

resizeIndexCommon(new_max_elements);
Expand All @@ -1320,13 +1315,6 @@ template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::shrinkByBlock() {
assert(maxElements >= this->blockSize);
size_t new_max_elements = maxElements - this->blockSize;

// Validations
assert(vectorBlocks.size() == graphDataBlocks.size());
assert(!vectorBlocks.empty());
assert(vectorBlocks.back().getLength() == 0);

vectorBlocks.pop_back();
graphDataBlocks.pop_back();

resizeIndexCommon(new_max_elements);
Expand Down Expand Up @@ -1599,9 +1587,8 @@ HNSWIndex<DataType, DistType>::HNSWIndex(const HNSWParams *params,
const IndexComponents<DataType, DistType> &components,
size_t random_seed)
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams, components),
VecSimIndexTombstone(), maxElements(0), vectorBlocks(this->allocator),
graphDataBlocks(this->allocator), idToMetaData(this->allocator),
visitedNodesHandlerPool(0, this->allocator) {
VecSimIndexTombstone(), maxElements(0), graphDataBlocks(this->allocator),
idToMetaData(this->allocator), visitedNodesHandlerPool(0, this->allocator) {

M = params->M ? params->M : HNSW_DEFAULT_M;
M0 = M * 2;
Expand Down Expand Up @@ -1673,8 +1660,7 @@ void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {

// Get the last element's metadata and data.
// If we are deleting the last element, we already destroyed it's metadata.
DataBlock &last_vector_block = vectorBlocks.back();
auto last_element_data = last_vector_block.removeAndFetchLastElement();
auto *last_element_data = this->vectors->getElement(curElementCount);
DataBlock &last_gd_block = graphDataBlocks.back();
auto last_element = (ElementGraphData *)last_gd_block.removeAndFetchLastElement();

Expand All @@ -1685,6 +1671,7 @@ void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {

// If we need to free a complete block and there is at least one block between the
// capacity and the size.
this->vectors->removeElement(curElementCount);
if (curElementCount % this->blockSize == 0) {
shrinkByBlock();
}
Expand Down Expand Up @@ -1793,16 +1780,13 @@ HNSWAddVectorState HNSWIndex<DataType, DistType>::storeNewElement(labelType labe
if (indexSize() > indexCapacity()) {
growByBlock();
} else if (state.newElementId % this->blockSize == 0) {
// If we had an initial capacity, we might have to allocate new blocks for the data and
// meta-data.
this->vectorBlocks.emplace_back(this->blockSize, this->dataSize, this->allocator,
this->alignment);
// If we had an initial capacity, we might have to allocate new blocks for the graph data.
this->graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize,
this->allocator);
}

// Insert the new element to the data block
this->vectorBlocks.back().addElement(vector_data);
this->vectors->addElement(vector_data, state.newElementId);
this->graphDataBlocks.back().addElement(cur_egd);
// We mark id as in process *before* we set it in the label lookup, so that IN_PROCESS flag is
// set when checking if label .
Expand Down
41 changes: 6 additions & 35 deletions src/VecSim/algorithms/hnsw/hnsw_serializer.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ HNSWIndex<DataType, DistType>::HNSWIndex(std::ifstream &input, const HNSWParams
const IndexComponents<DataType, DistType> &components,
Serializer::EncodingVersion version)
: VecSimIndexAbstract<DataType, DistType>(abstractInitParams, components), Serializer(version),
epsilon(params->epsilon), vectorBlocks(this->allocator), graphDataBlocks(this->allocator),
idToMetaData(this->allocator), visitedNodesHandlerPool(0, this->allocator) {
epsilon(params->epsilon), graphDataBlocks(this->allocator), idToMetaData(this->allocator),
visitedNodesHandlerPool(0, this->allocator) {

this->restoreIndexFields(input);
this->fieldsValidation();
Expand All @@ -23,7 +23,6 @@ HNSWIndex<DataType, DistType>::HNSWIndex(std::ifstream &input, const HNSWParams
this->visitedNodesHandlerPool.resize(maxElements);

size_t initial_vector_size = maxElements / this->blockSize;
vectorBlocks.reserve(initial_vector_size);
graphDataBlocks.reserve(initial_vector_size);
}

Expand Down Expand Up @@ -167,29 +166,13 @@ void HNSWIndex<DataType, DistType>::restoreGraph(std::ifstream &input, EncodingV
setVectorId(label, id);
}

// Get number of blocks
unsigned int num_blocks = 0;
readBinaryPOD(input, num_blocks);
this->vectorBlocks.reserve(num_blocks);
this->graphDataBlocks.reserve(num_blocks);

// Get data blocks
for (size_t i = 0; i < num_blocks; i++) {
this->vectorBlocks.emplace_back(this->blockSize, this->dataSize, this->allocator,
this->alignment);
unsigned int block_len = 0;
readBinaryPOD(input, block_len);
for (size_t j = 0; j < block_len; j++) {
auto cur_vec = this->getAllocator()->allocate_unique(this->dataSize);
input.read(static_cast<char *>(cur_vec.get()), this->dataSize);
this->vectorBlocks.back().addElement(cur_vec.get());
}
}
dynamic_cast<DataBlocksContainer *>(this->vectors)->restoreBlocks(input);

// Get graph data blocks
ElementGraphData *cur_egt;
auto tmpData = this->getAllocator()->allocate_unique(this->elementGraphDataSize);
size_t toplevel = 0;
size_t num_blocks = dynamic_cast<DataBlocksContainer *>(this->vectors)->numBlocks();
for (size_t i = 0; i < num_blocks; i++) {
this->graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize,
this->allocator);
Expand Down Expand Up @@ -283,22 +266,10 @@ void HNSWIndex<DataType, DistType>::saveGraph(std::ofstream &output) const {
writeBinaryPOD(output, flags);
}

// Save number of blocks
unsigned int num_blocks = this->vectorBlocks.size();
writeBinaryPOD(output, num_blocks);

// Save data blocks
for (size_t i = 0; i < num_blocks; i++) {
auto &block = this->vectorBlocks[i];
unsigned int block_len = block.getLength();
writeBinaryPOD(output, block_len);
for (size_t j = 0; j < block_len; j++) {
output.write(block.getElement(j), this->dataSize);
}
}
dynamic_cast<DataBlocksContainer *>(this->vectors)->saveBlocks(output);

// Save graph data blocks
for (size_t i = 0; i < num_blocks; i++) {
for (size_t i = 0; i < this->graphDataBlocks.size(); i++) {
auto &block = this->graphDataBlocks[i];
unsigned int block_len = block.getLength();
writeBinaryPOD(output, block_len);
Expand Down
48 changes: 48 additions & 0 deletions src/VecSim/containers/data_blocks_container.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "data_blocks_container.h"
#include "VecSim/utils/serializer.h"

DataBlocksContainer::DataBlocksContainer(size_t blockSize, size_t elementBytesCount,
std::shared_ptr<VecSimAllocator> allocator,
Expand All @@ -10,6 +11,8 @@ DataBlocksContainer::~DataBlocksContainer() = default;

size_t DataBlocksContainer::size() const { return element_count; }

size_t DataBlocksContainer::capacity() const { return blocks.capacity(); }

size_t DataBlocksContainer::blockSize() const { return block_size; }

size_t DataBlocksContainer::elementByteCount() const { return element_bytes_count; }
Expand Down Expand Up @@ -51,6 +54,51 @@ std::unique_ptr<RawDataContainer::Iterator> DataBlocksContainer::getIterator() c
return std::make_unique<DataBlocksContainer::Iterator>(*this);
}

#ifdef BUILD_TESTS
void DataBlocksContainer::saveBlocks(std::ostream &output) const {
// Save number of blocks
unsigned int num_blocks = this->numBlocks();
Serializer::writeBinaryPOD(output, num_blocks);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should consider only saving the vectors without the metadata about the number of blocks and their sizes, so we can load them into other containers (or to different block sizes)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also means we don't need to add serialization to the container class, keeping it on the algorithm level


// Save data blocks
for (size_t i = 0; i < num_blocks; i++) {
auto &block = this->blocks[i];
unsigned int block_len = block.getLength();
Serializer::writeBinaryPOD(output, block_len);
for (size_t j = 0; j < block_len; j++) {
output.write(block.getElement(j), this->element_bytes_count);
}
}
}

void DataBlocksContainer::restoreBlocks(std::istream &input) {

// Get number of blocks
unsigned int num_blocks = 0;
Serializer::readBinaryPOD(input, num_blocks);
this->blocks.reserve(num_blocks);

// Get data blocks
for (size_t i = 0; i < num_blocks; i++) {
this->blocks.emplace_back(this->block_size, this->element_bytes_count, this->allocator,
this->alignment);
unsigned int block_len = 0;
Serializer::readBinaryPOD(input, block_len);
for (size_t j = 0; j < block_len; j++) {
auto cur_vec = this->getAllocator()->allocate_unique(this->element_bytes_count);
input.read(static_cast<char *>(cur_vec.get()),
(std::streamsize)this->element_bytes_count);
this->blocks.back().addElement(cur_vec.get());
this->element_count++;
}
}
}

void DataBlocksContainer::shrinkToFit() { this->blocks.shrink_to_fit(); }

size_t DataBlocksContainer::numBlocks() const { return this->blocks.size(); }

#endif
/********************************** Iterator API ************************************************/

DataBlocksContainer::Iterator::Iterator(const DataBlocksContainer &container_)
Expand Down
9 changes: 9 additions & 0 deletions src/VecSim/containers/data_blocks_container.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ class DataBlocksContainer : public VecsimBaseObject, public RawDataContainer {

size_t size() const override;

size_t capacity() const;

size_t blockSize() const;

size_t elementByteCount() const;
Expand All @@ -34,6 +36,13 @@ class DataBlocksContainer : public VecsimBaseObject, public RawDataContainer {

std::unique_ptr<RawDataContainer::Iterator> getIterator() const override;

#ifdef BUILD_TESTS
void saveBlocks(std::ostream &output) const;
void restoreBlocks(std::istream &input);
void shrinkToFit();
size_t numBlocks() const;
#endif

class Iterator : public RawDataContainer::Iterator {
size_t cur_id;
const char *cur_element;
Expand Down
2 changes: 2 additions & 0 deletions src/VecSim/index_factories/hnsw_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ size_t EstimateInitialSize(const HNSWParams *params) {
est += EstimateComponentsMemory<float16, float>(params->metric);
est += EstimateInitialSize_ChooseMultiOrSingle<float16, float>(params->multi);
}
est += sizeof(DataBlocksContainer) + allocations_overhead;

return est;
}

Expand Down
Loading