From 15ec58b278ddd97def43c5e45d371b18ed9a537f Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Mon, 14 Apr 2025 16:48:24 +0300 Subject: [PATCH 1/3] add new element id functions --- src/VecSim/algorithms/hnsw/hnsw.h | 17 ++++++++++------- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 13 +++++++++++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 61ca1fa2d..714518d66 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -271,7 +271,7 @@ class HNSWIndex : public VecSimIndexAbstract, bool isMarkedDeleted(idType internalId) const; bool isInProcess(idType internalId) const; void unmarkInProcess(idType internalId); - HNSWAddVectorState storeNewElement(labelType label, const void *vector_data); + HNSWAddVectorState storeNewElement(labelType label, const void *vector_data, idType newElementId); void removeAndSwapMarkDeletedElement(idType internalId); void repairNodeConnections(idType node_id, size_t level); // For prefetching only. @@ -283,6 +283,7 @@ class HNSWIndex : public VecSimIndexAbstract, void insertElementToGraph(idType element_id, size_t element_max_level, idType entry_point, size_t global_max_level, const void *vector_data); void removeVectorInPlace(idType id); + idType getNewElementId(); /*************************** Labels lookup API ***************************/ /* Virtual functions that access the label lookup which is implemented in the derived classes */ @@ -356,6 +357,11 @@ size_t HNSWIndex::indexSize() const { return this->curElementCount; } +template +idType HNSWIndex::getNewElementId() { + return this->curElementCount++; +} + template size_t HNSWIndex::indexCapacity() const { return this->maxElements; @@ -1749,15 +1755,12 @@ void HNSWIndex::removeVectorInPlace(const idType element_int // scenario, the index data guard should be held by the caller (exclusive lock). template HNSWAddVectorState HNSWIndex::storeNewElement(labelType label, - const void *vector_data) { - HNSWAddVectorState state{}; + const void *vector_data, idType newElementId) { + HNSWAddVectorState state(newElementId=newElementId); // Choose randomly the maximum level in which the new element will be in the index. state.elementMaxLevel = getRandomLevel(mult); - // Access and update the index global data structures with the new element meta-data. - state.newElementId = curElementCount++; - // Create the new element's graph metadata. // We must assign manually enough memory on the stack and not just declare an `ElementGraphData` // variable, since it has a flexible array member. @@ -1814,7 +1817,7 @@ HNSWAddVectorState HNSWIndex::storeVector(const void *vector HNSWAddVectorState state{}; this->lockIndexDataGuard(); - state = storeNewElement(label, vector_data); + state = storeNewElement(label, vector_data, this->getNewElementId()); if (state.currMaxLevel >= state.elementMaxLevel) { this->unlockIndexDataGuard(); } diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index f868cd16d..fdbe3494f 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -129,6 +129,8 @@ class TieredHNSWIndex : public VecSimTieredIndex { // Handle deletion of vector inplace considering that async deletion might occurred beforehand. int deleteLabelFromHNSWInplace(labelType label); + idType getNewElementId(); + #ifdef BUILD_TESTS #include "VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h" #endif @@ -443,7 +445,7 @@ void TieredHNSWIndex::insertVectorToHNSW( // graph scans will not occur, as they will try access the entry point's neighbors. // If an index resize is still needed, `storeNewElement` will perform it. This is OK since // we hold the main index lock for exclusive access. - auto state = hnsw_index->storeNewElement(label, processed_storage_blob); + auto state = hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId()); if constexpr (releaseFlatGuard) { this->flatIndexGuard.unlock_shared(); } @@ -468,7 +470,7 @@ void TieredHNSWIndex::insertVectorToHNSW( // graph scans will not occur, as they will try access the entry point's neighbors. // At this point we are certain that the index has enough capacity for the new element, and // this call will not resize the index. - auto state = hnsw_index->storeNewElement(label, processed_storage_blob); + auto state = hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId()); if constexpr (releaseFlatGuard) { this->flatIndexGuard.unlock_shared(); } @@ -518,6 +520,13 @@ int TieredHNSWIndex::deleteLabelFromHNSWInplace(labelType la return ids.size(); } +template +idType TieredHNSWIndex::getNewElementId() { + auto *hnsw_index = this->getHNSWIndex(); + return hnsw_index->getNewElementId(); + // Note that this function is not thread-safe, and should be called while the main index lock +} + /******************** Job's callbacks **********************************/ template void TieredHNSWIndex::executeInsertJob(HNSWInsertJob *job) { From 761b546b6b561c9c5f0055ecda9aecde4b4c474a Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Tue, 15 Apr 2025 15:56:26 +0300 Subject: [PATCH 2/3] create remove edges and delete method --- src/VecSim/algorithms/hnsw/hnsw.h | 19 ++++++++++++++----- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 6 ++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 714518d66..97030a090 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -273,6 +273,7 @@ class HNSWIndex : public VecSimIndexAbstract, void unmarkInProcess(idType internalId); HNSWAddVectorState storeNewElement(labelType label, const void *vector_data, idType newElementId); void removeAndSwapMarkDeletedElement(idType internalId); + void removeIncomingEdgesAndDelete(idType deletedId); void repairNodeConnections(idType node_id, size_t level); // For prefetching only. const ElementMetaData *getMetaDataAddress(idType internal_id) const { @@ -1634,11 +1635,11 @@ HNSWIndex::~HNSWIndex() { */ template -void HNSWIndex::removeAndSwap(idType internalId) { +void HNSWIndex::removeIncomingEdgesAndDelete(idType deletedId) { // Sanity check - the id to remove cannot be the entry point, as it should have been replaced // upon marking it as deleted. - assert(entrypointNode != internalId); - auto element = getGraphDataByInternalId(internalId); + assert(entrypointNode != deletedId); + auto element = getGraphDataByInternalId(deletedId); // Remove the deleted id form the relevant incoming edges sets in which it appears. for (size_t level = 0; level <= element->toplevel; level++) { @@ -1650,17 +1651,25 @@ void HNSWIndex::removeAndSwap(idType internalId) { // (we know we will get here and remove this deleted id permanently). // However, upon asynchronous delete, this should always succeed since we do update // the incoming edges in the mutual update even for deleted elements. - bool res = neighbour.removeIncomingUnidirectionalEdgeIfExists(internalId); + bool res = neighbour.removeIncomingUnidirectionalEdgeIfExists(deletedId); // Assert the logical condition of: is_marked_deleted(id) => res==True. (void)res; - assert((!isMarkedDeleted(internalId) || res) && "The edge should be in the incoming " + assert((!isMarkedDeleted(deletedId) || res) && "The edge should be in the incoming " "unidirectional edges"); } } // Free the element's resources element->destroy(this->levelDataSize, this->allocator); + --numMarkedDeleted; + +} + +template +void HNSWIndex::removeAndSwap(idType internalId) { + + removeIncomingEdgesAndDelete(internalId); // We can say now that the element has removed completely from index. --curElementCount; diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index fdbe3494f..e09f19abf 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -523,6 +523,12 @@ int TieredHNSWIndex::deleteLabelFromHNSWInplace(labelType la template idType TieredHNSWIndex::getNewElementId() { auto *hnsw_index = this->getHNSWIndex(); + if (idToSwapJob.size() == 0) { + // No pending swap jobs, so we can use a new id. + return hnsw_index->getNewElementId(); + } + // idType deleted_id = idToSwapJob.begin()->first; + return hnsw_index->getNewElementId(); // Note that this function is not thread-safe, and should be called while the main index lock } From 7dbb8d42f61472e83dbb6cad1302ace5e2856edd Mon Sep 17 00:00:00 2001 From: BenGoldberger Date: Wed, 16 Apr 2025 13:53:44 +0300 Subject: [PATCH 3/3] change the getNewElementId to remove deleted node --- src/VecSim/algorithms/hnsw/hnsw.h | 18 +++++++++++------- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 12 ++++++++---- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 97030a090..173f1b086 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -271,7 +271,8 @@ class HNSWIndex : public VecSimIndexAbstract, bool isMarkedDeleted(idType internalId) const; bool isInProcess(idType internalId) const; void unmarkInProcess(idType internalId); - HNSWAddVectorState storeNewElement(labelType label, const void *vector_data, idType newElementId); + HNSWAddVectorState storeNewElement(labelType label, const void *vector_data, + idType newElementId); void removeAndSwapMarkDeletedElement(idType internalId); void removeIncomingEdgesAndDelete(idType deletedId); void repairNodeConnections(idType node_id, size_t level); @@ -1651,24 +1652,26 @@ void HNSWIndex::removeIncomingEdgesAndDelete(idType deletedI // (we know we will get here and remove this deleted id permanently). // However, upon asynchronous delete, this should always succeed since we do update // the incoming edges in the mutual update even for deleted elements. + + lockNodeLinks(cur_level.getLinkAtPos(i)); bool res = neighbour.removeIncomingUnidirectionalEdgeIfExists(deletedId); + unlockNodeLinks(cur_level.getLinkAtPos(i)); + // Assert the logical condition of: is_marked_deleted(id) => res==True. (void)res; assert((!isMarkedDeleted(deletedId) || res) && "The edge should be in the incoming " - "unidirectional edges"); + "unidirectional edges"); } } // Free the element's resources element->destroy(this->levelDataSize, this->allocator); --numMarkedDeleted; - } - template void HNSWIndex::removeAndSwap(idType internalId) { - + removeIncomingEdgesAndDelete(internalId); // We can say now that the element has removed completely from index. --curElementCount; @@ -1764,8 +1767,9 @@ void HNSWIndex::removeVectorInPlace(const idType element_int // scenario, the index data guard should be held by the caller (exclusive lock). template HNSWAddVectorState HNSWIndex::storeNewElement(labelType label, - const void *vector_data, idType newElementId) { - HNSWAddVectorState state(newElementId=newElementId); + const void *vector_data, + idType newElementId) { + HNSWAddVectorState state(newElementId = newElementId); // Choose randomly the maximum level in which the new element will be in the index. state.elementMaxLevel = getRandomLevel(mult); diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index e09f19abf..c6a76663b 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -445,7 +445,8 @@ void TieredHNSWIndex::insertVectorToHNSW( // graph scans will not occur, as they will try access the entry point's neighbors. // If an index resize is still needed, `storeNewElement` will perform it. This is OK since // we hold the main index lock for exclusive access. - auto state = hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId()); + auto state = + hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId()); if constexpr (releaseFlatGuard) { this->flatIndexGuard.unlock_shared(); } @@ -470,7 +471,8 @@ void TieredHNSWIndex::insertVectorToHNSW( // graph scans will not occur, as they will try access the entry point's neighbors. // At this point we are certain that the index has enough capacity for the new element, and // this call will not resize the index. - auto state = hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId()); + auto state = + hnsw_index->storeNewElement(label, processed_storage_blob, this->getNewElementId()); if constexpr (releaseFlatGuard) { this->flatIndexGuard.unlock_shared(); } @@ -527,9 +529,11 @@ idType TieredHNSWIndex::getNewElementId() { // No pending swap jobs, so we can use a new id. return hnsw_index->getNewElementId(); } - // idType deleted_id = idToSwapJob.begin()->first; + idType deletedId = idToSwapJob.begin()->first; + hnsw_index->removeIncomingEdgesAndDelete(deletedId); + idToSwapJob.erase(deletedId); - return hnsw_index->getNewElementId(); + return deletedId; // Note that this function is not thread-safe, and should be called while the main index lock }