Skip to content

reuse deleted ids on insert [MOD-4634] #663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 29 additions & 13 deletions src/VecSim/algorithms/hnsw/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,10 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
bool isMarkedDeleted(idType internalId) const;
bool isInProcess(idType internalId) const;
void unmarkInProcess(idType internalId);
HNSWAddVectorState storeNewElement(labelType label, const void *vector_data);
HNSWAddVectorState storeNewElement(labelType label, const void *vector_data,
idType newElementId);
void removeAndSwapMarkDeletedElement(idType internalId);
void removeIncomingEdgesAndDelete(idType deletedId);
void repairNodeConnections(idType node_id, size_t level);
// For prefetching only.
const ElementMetaData *getMetaDataAddress(idType internal_id) const {
Expand All @@ -283,6 +285,7 @@ class HNSWIndex : public VecSimIndexAbstract<DataType, DistType>,
void insertElementToGraph(idType element_id, size_t element_max_level, idType entry_point,
size_t global_max_level, const void *vector_data);
void removeVectorInPlace(idType id);
idType getNewElementId();

/*************************** Labels lookup API ***************************/
/* Virtual functions that access the label lookup which is implemented in the derived classes */
Expand Down Expand Up @@ -356,6 +359,11 @@ size_t HNSWIndex<DataType, DistType>::indexSize() const {
return this->curElementCount;
}

template <typename DataType, typename DistType>
idType HNSWIndex<DataType, DistType>::getNewElementId() {
return this->curElementCount++;
}

template <typename DataType, typename DistType>
size_t HNSWIndex<DataType, DistType>::indexCapacity() const {
return this->maxElements;
Expand Down Expand Up @@ -1628,11 +1636,11 @@ HNSWIndex<DataType, DistType>::~HNSWIndex() {
*/

template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {
void HNSWIndex<DataType, DistType>::removeIncomingEdgesAndDelete(idType deletedId) {
// Sanity check - the id to remove cannot be the entry point, as it should have been replaced
// upon marking it as deleted.
assert(entrypointNode != internalId);
auto element = getGraphDataByInternalId(internalId);
assert(entrypointNode != deletedId);
auto element = getGraphDataByInternalId(deletedId);

// Remove the deleted id form the relevant incoming edges sets in which it appears.
for (size_t level = 0; level <= element->toplevel; level++) {
Expand All @@ -1644,17 +1652,27 @@ void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {
// (we know we will get here and remove this deleted id permanently).
// However, upon asynchronous delete, this should always succeed since we do update
// the incoming edges in the mutual update even for deleted elements.
bool res = neighbour.removeIncomingUnidirectionalEdgeIfExists(internalId);

lockNodeLinks(cur_level.getLinkAtPos(i));
bool res = neighbour.removeIncomingUnidirectionalEdgeIfExists(deletedId);
unlockNodeLinks(cur_level.getLinkAtPos(i));

// Assert the logical condition of: is_marked_deleted(id) => res==True.
(void)res;
assert((!isMarkedDeleted(internalId) || res) && "The edge should be in the incoming "
"unidirectional edges");
assert((!isMarkedDeleted(deletedId) || res) && "The edge should be in the incoming "
"unidirectional edges");
}
}

// Free the element's resources
element->destroy(this->levelDataSize, this->allocator);
--numMarkedDeleted;
}

template <typename DataType, typename DistType>
void HNSWIndex<DataType, DistType>::removeAndSwap(idType internalId) {

removeIncomingEdgesAndDelete(internalId);
// We can say now that the element has removed completely from index.
--curElementCount;

Expand Down Expand Up @@ -1749,15 +1767,13 @@ void HNSWIndex<DataType, DistType>::removeVectorInPlace(const idType element_int
// scenario, the index data guard should be held by the caller (exclusive lock).
template <typename DataType, typename DistType>
HNSWAddVectorState HNSWIndex<DataType, DistType>::storeNewElement(labelType label,
const void *vector_data) {
HNSWAddVectorState state{};
const void *vector_data,
idType newElementId) {
HNSWAddVectorState state(newElementId = newElementId);

// Choose randomly the maximum level in which the new element will be in the index.
state.elementMaxLevel = getRandomLevel(mult);

// Access and update the index global data structures with the new element meta-data.
state.newElementId = curElementCount++;

// Create the new element's graph metadata.
// We must assign manually enough memory on the stack and not just declare an `ElementGraphData`
// variable, since it has a flexible array member.
Expand Down Expand Up @@ -1814,7 +1830,7 @@ HNSWAddVectorState HNSWIndex<DataType, DistType>::storeVector(const void *vector
HNSWAddVectorState state{};

this->lockIndexDataGuard();
state = storeNewElement(label, vector_data);
state = storeNewElement(label, vector_data, this->getNewElementId());
if (state.currMaxLevel >= state.elementMaxLevel) {
this->unlockIndexDataGuard();
}
Expand Down
23 changes: 21 additions & 2 deletions src/VecSim/algorithms/hnsw/hnsw_tiered.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
// Handle deletion of vector inplace considering that async deletion might occurred beforehand.
int deleteLabelFromHNSWInplace(labelType label);

idType getNewElementId();

#ifdef BUILD_TESTS
#include "VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h"
#endif
Expand Down Expand Up @@ -443,7 +445,8 @@ void TieredHNSWIndex<DataType, DistType>::insertVectorToHNSW(
// graph scans will not occur, as they will try access the entry point's neighbors.
// If an index resize is still needed, `storeNewElement` will perform it. This is OK since
// we hold the main index lock for exclusive access.
auto state = hnsw_index->storeNewElement(label, processed_storage_blob);
auto state =
hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId());
if constexpr (releaseFlatGuard) {
this->flatIndexGuard.unlock_shared();
}
Expand All @@ -468,7 +471,8 @@ void TieredHNSWIndex<DataType, DistType>::insertVectorToHNSW(
// graph scans will not occur, as they will try access the entry point's neighbors.
// At this point we are certain that the index has enough capacity for the new element, and
// this call will not resize the index.
auto state = hnsw_index->storeNewElement(label, processed_storage_blob);
auto state =
hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId());
if constexpr (releaseFlatGuard) {
this->flatIndexGuard.unlock_shared();
}
Expand Down Expand Up @@ -518,6 +522,21 @@ int TieredHNSWIndex<DataType, DistType>::deleteLabelFromHNSWInplace(labelType la
return ids.size();
}

template <typename DataType, typename DistType>
idType TieredHNSWIndex<DataType, DistType>::getNewElementId() {
auto *hnsw_index = this->getHNSWIndex();
if (idToSwapJob.size() == 0) {
// No pending swap jobs, so we can use a new id.
return hnsw_index->getNewElementId();
}
idType deletedId = idToSwapJob.begin()->first;
hnsw_index->removeIncomingEdgesAndDelete(deletedId);
idToSwapJob.erase(deletedId);

return deletedId;
// Note that this function is not thread-safe, and should be called while the main index lock
}

/******************** Job's callbacks **********************************/
template <typename DataType, typename DistType>
void TieredHNSWIndex<DataType, DistType>::executeInsertJob(HNSWInsertJob *job) {
Expand Down
Loading