From 5bd62ef0ead0abc48dbb5ee5690a7f0eb7b51871 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Thu, 19 Dec 2024 11:38:11 +0000 Subject: [PATCH 1/8] test_hnsw.py intiital --- tests/flow/test_hnsw.py | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/flow/test_hnsw.py b/tests/flow/test_hnsw.py index 9370a651d..762556901 100644 --- a/tests/flow/test_hnsw.py +++ b/tests/flow/test_hnsw.py @@ -860,3 +860,51 @@ def test_hnsw_float16_multi_value(): recall = float(correct) / (k * num_queries) print("\nrecall is: \n", recall) assert (recall > 0.9) + +class TestINT8(): + dim = 50 + num_elements = 10_000 + M = 32 + efConstruction = 200 + efRuntime = 50 + data_type = VecSimType_INT8 + + rng = np.random.default_rng(seed=42) + + #### Create vectors + data = rng.integers(low=-128, high=127, size=(num_elements, dim), dtype=np.int8) + + #### Create queries + num_queries = 10 + query_data = rng.integers(low=-128, high=127, size=(num_queries, dim), dtype=np.int8) + + def create_index(self, metric): + hnsw_index = create_hnsw_index(self.dim, 0, metric, VecSimType_INT8, self.efConstruction, self.M, self.efRuntime) + hnsw_index.set_ef(self.efRuntime) + return hnsw_index + + def test_L2(self): + hnsw_index = self.create_index(VecSimMetric_L2) + k = 10 + + vectors = [] + for i, vector in enumerate(self.data): + hnsw_index.add_vector(vector, i) + vectors.append((i, vector)) + + correct = 0 + for target_vector in self.query_data: + hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, 10) + + results, keys = get_ground_truth_results(spatial.distance.sqeuclidean, target_vector, vectors, k) + for i, label in enumerate(hnswlib_labels[0]): + for j, correct_label in enumerate(keys): + if label == correct_label: + correct += 1 + assert math.isclose(hnswlib_distances[0][i], results[j]["dist"], rel_tol=1e-5) + break + + # Measure recall + recall = float(correct) / (k * self.num_queries) + print("\nrecall is: \n", recall) + assert 
(recall > 0.9) From 1aeaa3738b5292a3e8bc64f017f305bbd59b8478 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 22 Dec 2024 05:37:07 +0000 Subject: [PATCH 2/8] int8 hnsw tests --- src/python_bindings/bindings.cpp | 7 + tests/flow/test_hnsw.py | 228 +++++++++++++++++++++++++++++-- 2 files changed, 222 insertions(+), 13 deletions(-) diff --git a/src/python_bindings/bindings.cpp b/src/python_bindings/bindings.cpp index b72670fb8..e5d7733eb 100644 --- a/src/python_bindings/bindings.cpp +++ b/src/python_bindings/bindings.cpp @@ -300,6 +300,9 @@ class PyHNSWLibIndex : public PyVecSimIndex { } else if (type == VecSimType_FLOAT16) { auto *hnsw = dynamic_cast *>(index.get()); hnsw->saveIndex(location); + } else if (type == VecSimType_INT8) { + auto *hnsw = dynamic_cast *>(index.get()); + hnsw->saveIndex(location); } else { throw std::runtime_error("Invalid index data type"); } @@ -432,6 +435,10 @@ class PyHNSWLibIndex : public PyVecSimIndex { return dynamic_cast *>(this->index.get()) ->checkIntegrity() .valid_state; + } else if (type == VecSimType_INT8) { + return dynamic_cast *>(this->index.get()) + ->checkIntegrity() + .valid_state; } else { throw std::runtime_error("Invalid index data type"); } diff --git a/tests/flow/test_hnsw.py b/tests/flow/test_hnsw.py index 762556901..2f9251d9e 100644 --- a/tests/flow/test_hnsw.py +++ b/tests/flow/test_hnsw.py @@ -866,7 +866,7 @@ class TestINT8(): num_elements = 10_000 M = 32 efConstruction = 200 - efRuntime = 50 + efRuntime = 100 data_type = VecSimType_INT8 rng = np.random.default_rng(seed=42) @@ -878,33 +878,235 @@ class TestINT8(): num_queries = 10 query_data = rng.integers(low=-128, high=127, size=(num_queries, dim), dtype=np.int8) - def create_index(self, metric): - hnsw_index = create_hnsw_index(self.dim, 0, metric, VecSimType_INT8, self.efConstruction, self.M, self.efRuntime) - hnsw_index.set_ef(self.efRuntime) + # single HNSW index with L2 metric populated with INT8 vectors + cache_hnsw_index_L2_single = None + 
cached_label_to_vec_list = None + + @classmethod + def create_index(cls, metric = VecSimMetric_L2, is_multi=False): + hnsw_index = create_hnsw_index(cls.dim, 0, metric, VecSimType_INT8, cls.efConstruction, cls.M, cls.efRuntime, is_multi=is_multi) return hnsw_index + @classmethod + def create_add_vectors(cls, hnsw_index): + label_to_vec_list = [] + for i, vector in enumerate(cls.data): + hnsw_index.add_vector(vector, i) + label_to_vec_list.append((i, vector)) + return label_to_vec_list + + @classmethod + def get_cached_single_L2_index(cls): + if cls.cache_hnsw_index_L2_single is None: + cls.cache_hnsw_index_L2_single = cls.create_index() + cls.cached_label_to_vec_list = cls.create_add_vectors(cls.cache_hnsw_index_L2_single) + return cls.cache_hnsw_index_L2_single, cls.cached_label_to_vec_list + + @staticmethod + def compute_correct(res_labels, res_dist, gt_labels, gt_dist_label_list): + correct = 0 + for i, label in enumerate(res_labels): + for j, correct_label in enumerate(gt_labels): + if label == correct_label: + correct += 1 + assert math.isclose(res_dist[i], gt_dist_label_list[j]["dist"], rel_tol=1e-5) + break + + return correct + + @staticmethod + def fp32_expand_and_calc_cosine_dist(a, b): + # stupid numpy doesn't make any intermediate conversions when handling small types + # so we might get overflow. We need to convert to float32 ourselves. 
+ a_float32 = a.astype(np.float32) + b_float32 = b.astype(np.float32) + return spatial.distance.cosine(a_float32, b_float32) + + def test_serialization(self): + hnsw_index, label_to_vec_list = self.get_cached_single_L2_index() + k = 10 + + correct = 0 + correct_labels = [] # cache these + for target_vector in self.query_data: + hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, k) + results, keys = get_ground_truth_results(spatial.distance.sqeuclidean, target_vector, label_to_vec_list, k) + + correct_labels.append(keys) + correct += self.compute_correct(hnswlib_labels[0], hnswlib_distances[0], keys, results) + + # Measure recall + recall = float(correct) / (k * self.num_queries) + print("\nrecall is: \n", recall) + + # Persist, delete and restore index. + file_name = os.getcwd() + "/dump" + hnsw_index.save_index(file_name) + + new_hnsw_index = HNSWIndex(file_name) + os.remove(file_name) + assert new_hnsw_index.index_size() == self.num_elements + assert new_hnsw_index.index_type() == VecSimType_INT8 + assert new_hnsw_index.check_integrity() + + # Check recall + correct_after = 0 + for i, target_vector in enumerate(self.query_data): + hnswlib_labels, _ = new_hnsw_index.knn_query(target_vector, k) + correct_labels_cur = correct_labels[i] + for label in hnswlib_labels[0]: + for correct_label in correct_labels_cur: + if label == correct_label: + correct_after += 1 + break + + # Compare recall after reloading the index + recall_after = float(correct_after) / (k * self.num_queries) + print("\nrecall after is: \n", recall_after) + assert recall == recall_after + + def knn(self, hnsw_index, label_vec_list, dist_func): + k = 10 + + correct = 0 + for target_vector in self.query_data: + hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, k) + results, keys = get_ground_truth_results(dist_func, target_vector, label_vec_list, k) + + correct += self.compute_correct(hnswlib_labels[0], hnswlib_distances[0], keys, results) + + # Measure 
recall + recall = recall = float(correct) / (k * self.num_queries) + print("\nrecall is: \n", recall) + assert (recall > 0.9) + def test_L2(self): - hnsw_index = self.create_index(VecSimMetric_L2) + hnsw_index, label_to_vec_list = self.get_cached_single_L2_index() + + self.knn(hnsw_index, label_to_vec_list, spatial.distance.sqeuclidean) + + def test_Cosine(self): + hnsw_index = self.create_index(VecSimMetric_Cosine) + label_to_vec_list = self.create_add_vectors(hnsw_index) + + self.knn(hnsw_index, label_to_vec_list, TestINT8.fp32_expand_and_calc_cosine_dist) + + def test_batch_iterator(self): + hnsw_index, _ = self.get_cached_single_L2_index() + + batch_size = 10 + + efRuntime = 180 + hnsw_index.set_ef(efRuntime) + + batch_iterator = hnsw_index.create_batch_iterator(self.query_data) + labels_first_batch, distances_first_batch = batch_iterator.get_next_results(batch_size, BY_ID) + for i, _ in enumerate(labels_first_batch[0][:-1]): + # Assert sorting by id + assert (labels_first_batch[0][i] < labels_first_batch[0][i + 1]) + + _, distances_second_batch = batch_iterator.get_next_results(batch_size, BY_SCORE) + should_have_return_in_first_batch = [] + for i, dist in enumerate(distances_second_batch[0][:-1]): + # Assert sorting by score + assert (distances_second_batch[0][i] < distances_second_batch[0][i + 1]) + # Assert that every distance in the second batch is higher than any distance of the first batch + if len(distances_first_batch[0][np.where(distances_first_batch[0] > dist)]) != 0: + should_have_return_in_first_batch.append(dist) + assert (len(should_have_return_in_first_batch) <= 2) + + # Verify that runtime args are sent properly to the batch iterator. 
+ query_params = VecSimQueryParams() + query_params.hnswRuntimeParams.efRuntime = 5 + batch_iterator_new = hnsw_index.create_batch_iterator(self.query_data, query_params) + _, distances_first_batch_new = batch_iterator_new.get_next_results(batch_size, BY_ID) + # Verify that accuracy is worse with the new lower ef_runtime. + assert (sum(distances_first_batch[0]) < sum(distances_first_batch_new[0])) + + # reset efRuntime + hnsw_index.set_ef(self.efRuntime) + + def test_range_query(self): + hnsw_index = self.create_index(VecSimMetric_Cosine) + label_to_vec_list = self.create_add_vectors(hnsw_index) + radius = 0.7 + recalls = {} + + for epsilon_rt in [0.001, 0.01, 0.1]: + query_params = VecSimQueryParams() + query_params.hnswRuntimeParams.epsilon = epsilon_rt + start = time.time() + hnsw_labels, hnsw_distances = hnsw_index.range_query(self.query_data[0], radius=radius, query_param=query_params) + end = time.time() + res_num = len(hnsw_labels[0]) + + dists = sorted([(key, TestINT8.fp32_expand_and_calc_cosine_dist(self.query_data[0], vec)) for key, vec in label_to_vec_list]) + actual_results = [(key, dist) for key, dist in dists if dist <= radius] + + print( + f'\nlookup time for {self.num_elements} vectors with dim={self.dim} took {end - start} seconds with epsilon={epsilon_rt},' + f' got {res_num} results, which are {res_num / len(actual_results)} of the entire results in the range.') + + # Compare the number of vectors that are actually within the range to the returned results. + assert np.all(np.isin(hnsw_labels, np.array([label for label, _ in actual_results]))) + + assert max(hnsw_distances[0]) <= radius + recall = res_num / len(actual_results) + assert recall > 0.9 + recalls[epsilon_rt] = res_num / len(actual_results) + + # Expect higher recalls for higher epsilon values. 
+ assert recalls[0.001] <= recalls[0.01] <= recalls[0.1] + + # Expect zero results for radius==0 + hnsw_labels, hnsw_distances = hnsw_index.range_query(self.query_data[0], radius=0) + assert len(hnsw_labels[0]) == 0 + + def test_multi_value(self): + num_per_label = 5 + num_labels = self.num_elements // num_per_label + + # efConstruction = 100 + num_queries = 10 k = 10 + hnsw_index = self.create_index(is_multi=True) + + data = self.rng.integers(low=-128, high=127, size=(num_labels, self.dim), dtype=np.int8) + vectors = [] - for i, vector in enumerate(self.data): - hnsw_index.add_vector(vector, i) - vectors.append((i, vector)) + for i, vector in enumerate(data): + for _ in range(num_per_label): + hnsw_index.add_vector(vector, i) + vectors.append((i, vector)) + query_data = self.rng.integers(low=-128, high=127, size=(num_queries, self.dim), dtype=np.int8) correct = 0 - for target_vector in self.query_data: - hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, 10) + for target_vector in query_data: + hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, k) + assert (len(hnswlib_labels[0]) == len(np.unique(hnswlib_labels[0]))) + + # sort distances of every vector from the target vector and get actual k nearest vectors + dists = {} + for key, vec in vectors: + # Setting or updating the score for each label. + # If it's the first time we calculate a score for a label dists.get(key, dist) + # will return dist so we will choose the actual score the first time. 
+ dist = spatial.distance.sqeuclidean(target_vector, vec) + dists[key] = min(dist, dists.get(key, dist)) + + dists = list(dists.items()) + dists = sorted(dists, key=lambda pair: pair[1])[:k] + keys = [key for key, _ in dists] - results, keys = get_ground_truth_results(spatial.distance.sqeuclidean, target_vector, vectors, k) for i, label in enumerate(hnswlib_labels[0]): for j, correct_label in enumerate(keys): if label == correct_label: correct += 1 - assert math.isclose(hnswlib_distances[0][i], results[j]["dist"], rel_tol=1e-5) + assert math.isclose(hnswlib_distances[0][i], dists[j][1], rel_tol=1e-5) break # Measure recall - recall = float(correct) / (k * self.num_queries) + recall = float(correct) / (k * num_queries) print("\nrecall is: \n", recall) assert (recall > 0.9) From 2c463bd4153f6b8f9a3049ae937e02e6ed747d53 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 22 Dec 2024 12:09:59 +0000 Subject: [PATCH 3/8] general tests class --- tests/flow/common.py | 4 ++ tests/flow/test_hnsw.py | 134 ++++++++++++++++++++++++---------------- 2 files changed, 85 insertions(+), 53 deletions(-) diff --git a/tests/flow/common.py b/tests/flow/common.py index ea5333884..dfb05136a 100644 --- a/tests/flow/common.py +++ b/tests/flow/common.py @@ -61,6 +61,10 @@ def vec_to_bfloat16(vec): def vec_to_float16(vec): return vec.astype(np.float16) +def create_int8_vectors(num_elements, dim, rng: np.random.Generator = None): + rng = np.random.default_rng(seed=42) if rng is None else rng + return rng.integers(low=-128, high=127, size=(num_elements, dim), dtype=np.int8) + def get_ground_truth_results(dist_func, query, vectors, k): results = [{"dist": dist_func(query, vec), "label": key} for key, vec in vectors] results = sorted(results, key=lambda x: x["dist"]) diff --git a/tests/flow/test_hnsw.py b/tests/flow/test_hnsw.py index 2f9251d9e..19b5841ae 100644 --- a/tests/flow/test_hnsw.py +++ b/tests/flow/test_hnsw.py @@ -861,34 +861,48 @@ def test_hnsw_float16_multi_value(): print("\nrecall 
is: \n", recall) assert (recall > 0.9) -class TestINT8(): +''' +A Class to run common tests for HNSW index + +The following tests will *automatically* run if the class is inherited: +* test_serialization - single L2 index +* test_L2 - single L2 index +* test_batch_iterator - single L2 index + +The following tests should be *explicitly* called from a method prefixed with test_* +# range_query(dist_func) - single cosine index + +@param create_data_func is a function expects num_elements, dim, [and optional np.random.Generator] as input and +returns a (num_elements, dim) numpy array of vectors +uses multi L2 index +# multi_value(create_data_func, num_per_label) - +''' +class GeneralTest(): dim = 50 num_elements = 10_000 + num_queries = 10 M = 32 efConstruction = 200 - efRuntime = 100 - data_type = VecSimType_INT8 + efRuntime = 50 + data_type = None rng = np.random.default_rng(seed=42) + data = None + query_data = None - #### Create vectors - data = rng.integers(low=-128, high=127, size=(num_elements, dim), dtype=np.int8) - - #### Create queries - num_queries = 10 - query_data = rng.integers(low=-128, high=127, size=(num_queries, dim), dtype=np.int8) - - # single HNSW index with L2 metric populated with INT8 vectors + # single HNSW index with L2 metric cache_hnsw_index_L2_single = None cached_label_to_vec_list = None @classmethod def create_index(cls, metric = VecSimMetric_L2, is_multi=False): - hnsw_index = create_hnsw_index(cls.dim, 0, metric, VecSimType_INT8, cls.efConstruction, cls.M, cls.efRuntime, is_multi=is_multi) + assert cls.data_type is not None + hnsw_index = create_hnsw_index(cls.dim, 0, metric, cls.data_type, cls.efConstruction, cls.M, cls.efRuntime, is_multi=is_multi) return hnsw_index @classmethod def create_add_vectors(cls, hnsw_index): + assert cls.data is not None label_to_vec_list = [] for i, vector in enumerate(cls.data): hnsw_index.add_vector(vector, i) @@ -914,15 +928,24 @@ def compute_correct(res_labels, res_dist, gt_labels, gt_dist_label_list): 
return correct - @staticmethod - def fp32_expand_and_calc_cosine_dist(a, b): - # stupid numpy doesn't make any intermediate conversions when handling small types - # so we might get overflow. We need to convert to float32 ourselves. - a_float32 = a.astype(np.float32) - b_float32 = b.astype(np.float32) - return spatial.distance.cosine(a_float32, b_float32) + @classmethod + def knn(cls, hnsw_index, label_vec_list, dist_func): + k = 10 + + correct = 0 + for target_vector in cls.query_data: + hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, k) + results, keys = get_ground_truth_results(dist_func, target_vector, label_vec_list, k) + + correct += cls.compute_correct(hnswlib_labels[0], hnswlib_distances[0], keys, results) + + # Measure recall + recall = recall = float(correct) / (k * cls.num_queries) + print("\nrecall is: \n", recall) + assert (recall > 0.9) def test_serialization(self): + assert self.data_type is not None hnsw_index, label_to_vec_list = self.get_cached_single_L2_index() k = 10 @@ -946,7 +969,7 @@ def test_serialization(self): new_hnsw_index = HNSWIndex(file_name) os.remove(file_name) assert new_hnsw_index.index_size() == self.num_elements - assert new_hnsw_index.index_type() == VecSimType_INT8 + assert new_hnsw_index.index_type() == self.data_type assert new_hnsw_index.check_integrity() # Check recall @@ -965,32 +988,10 @@ def test_serialization(self): print("\nrecall after is: \n", recall_after) assert recall == recall_after - def knn(self, hnsw_index, label_vec_list, dist_func): - k = 10 - - correct = 0 - for target_vector in self.query_data: - hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, k) - results, keys = get_ground_truth_results(dist_func, target_vector, label_vec_list, k) - - correct += self.compute_correct(hnswlib_labels[0], hnswlib_distances[0], keys, results) - - # Measure recall - recall = recall = float(correct) / (k * self.num_queries) - print("\nrecall is: \n", recall) - assert (recall > 
0.9) - def test_L2(self): hnsw_index, label_to_vec_list = self.get_cached_single_L2_index() - self.knn(hnsw_index, label_to_vec_list, spatial.distance.sqeuclidean) - def test_Cosine(self): - hnsw_index = self.create_index(VecSimMetric_Cosine) - label_to_vec_list = self.create_add_vectors(hnsw_index) - - self.knn(hnsw_index, label_to_vec_list, TestINT8.fp32_expand_and_calc_cosine_dist) - def test_batch_iterator(self): hnsw_index, _ = self.get_cached_single_L2_index() @@ -1026,7 +1027,8 @@ def test_batch_iterator(self): # reset efRuntime hnsw_index.set_ef(self.efRuntime) - def test_range_query(self): + ##### Should be explicitly called ##### + def range_query(self, dist_func): hnsw_index = self.create_index(VecSimMetric_Cosine) label_to_vec_list = self.create_add_vectors(hnsw_index) radius = 0.7 @@ -1040,7 +1042,7 @@ def test_range_query(self): end = time.time() res_num = len(hnsw_labels[0]) - dists = sorted([(key, TestINT8.fp32_expand_and_calc_cosine_dist(self.query_data[0], vec)) for key, vec in label_to_vec_list]) + dists = sorted([(key, dist_func(self.query_data[0], vec)) for key, vec in label_to_vec_list]) actual_results = [(key, dist) for key, dist in dists if dist <= radius] print( @@ -1062,27 +1064,23 @@ def test_range_query(self): hnsw_labels, hnsw_distances = hnsw_index.range_query(self.query_data[0], radius=0) assert len(hnsw_labels[0]) == 0 - def test_multi_value(self): + def multi_value(self, create_data_func, num_per_label = 5): num_per_label = 5 num_labels = self.num_elements // num_per_label - - # efConstruction = 100 - num_queries = 10 k = 10 - hnsw_index = self.create_index(is_multi=True) + data = create_data_func(num_labels, self.dim, self.rng) - data = self.rng.integers(low=-128, high=127, size=(num_labels, self.dim), dtype=np.int8) + hnsw_index = self.create_index(is_multi=True) vectors = [] for i, vector in enumerate(data): for _ in range(num_per_label): hnsw_index.add_vector(vector, i) vectors.append((i, vector)) - query_data = 
self.rng.integers(low=-128, high=127, size=(num_queries, self.dim), dtype=np.int8) correct = 0 - for target_vector in query_data: + for target_vector in self.query_data: hnswlib_labels, hnswlib_distances = hnsw_index.knn_query(target_vector, k) assert (len(hnswlib_labels[0]) == len(np.unique(hnswlib_labels[0]))) @@ -1107,6 +1105,36 @@ def test_multi_value(self): break # Measure recall - recall = float(correct) / (k * num_queries) + recall = float(correct) / (k * self.num_queries) print("\nrecall is: \n", recall) assert (recall > 0.9) + +class TestINT8(GeneralTest): + + GeneralTest.data_type = VecSimType_INT8 + + #### Create vectors + GeneralTest.data = create_int8_vectors(GeneralTest.num_elements, GeneralTest.dim, GeneralTest.rng) + + #### Create queries + GeneralTest.query_data = create_int8_vectors(GeneralTest.num_queries, GeneralTest.dim, GeneralTest.rng) + + @staticmethod + def fp32_expand_and_calc_cosine_dist(a, b): + # stupid numpy doesn't make any intermediate conversions when handling small types + # so we might get overflow. We need to convert to float32 ourselves. 
+ a_float32 = a.astype(np.float32) + b_float32 = b.astype(np.float32) + return spatial.distance.cosine(a_float32, b_float32) + + def test_Cosine(self): + hnsw_index = self.create_index(VecSimMetric_Cosine) + label_to_vec_list = self.create_add_vectors(hnsw_index) + + self.knn(hnsw_index, label_to_vec_list, self.fp32_expand_and_calc_cosine_dist) + + def test_range_query(self): + self.range_query(self.fp32_expand_and_calc_cosine_dist) + + def test_multi_value(self): + self.multi_value(create_int8_vectors) From adae1758b4b84ee4390ed13c3428a9af56e98418 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 22 Dec 2024 16:44:24 +0000 Subject: [PATCH 4/8] flow_bruteforce.py: introduce GeneralTest call from TestINT8 common.py: introduce create_flat_index create_add_vectors move fp32_expand_and_calc_cosine_dist to common.py --- tests/flow/common.py | 25 +++++ tests/flow/test_bruteforce.py | 204 +++++++++++++++++++++++++++++++++- tests/flow/test_hnsw.py | 19 +--- 3 files changed, 231 insertions(+), 17 deletions(-) diff --git a/tests/flow/common.py b/tests/flow/common.py index dfb05136a..c6ec3d6d7 100644 --- a/tests/flow/common.py +++ b/tests/flow/common.py @@ -24,6 +24,7 @@ def create_hnsw_params(dim, num_elements, metric, data_type, ef_construction=200 hnsw_params.multi = is_multi return hnsw_params + # Helper function for creating an index,uses the default HNSW parameters if not specified. def create_hnsw_index(dim, num_elements, metric, data_type, ef_construction=200, m=16, ef_runtime=10, epsilon=0.01, is_multi=False): @@ -40,6 +41,23 @@ def create_hnsw_index(dim, num_elements, metric, data_type, ef_construction=200, return HNSWIndex(hnsw_params) +# Helper function for creating an index, uses the default flat parameters if not specified. 
+def create_flat_index(dim, metric, data_type, is_multi=False): + bfparams = BFParams() + + bfparams.dim = dim + bfparams.type = data_type + bfparams.metric = metric + bfparams.multi = is_multi + + return BFIndex(bfparams) + +def create_add_vectors(index, vectors): + label_to_vec_list = [] + for i, vector in enumerate(vectors): + index.add_vector(vector, i) + label_to_vec_list.append((i, vector)) + return label_to_vec_list # Compute the expected speedup as a function of the expected parallel section rate of the code by Amdahl's law def expected_speedup(expected_parallel_rate, n_threads): @@ -71,3 +89,10 @@ def get_ground_truth_results(dist_func, query, vectors, k): keys = [res["label"] for res in results[:k]] return results, keys + +def fp32_expand_and_calc_cosine_dist(a, b): + # stupid numpy doesn't make any intermediate conversions when handling small types + # so we might get overflow. We need to convert to float32 ourselves. + a_float32 = a.astype(np.float32) + b_float32 = b.astype(np.float32) + return spatial.distance.cosine(a_float32, b_float32) diff --git a/tests/flow/test_bruteforce.py b/tests/flow/test_bruteforce.py index b3492b7fe..4b812ecfc 100644 --- a/tests/flow/test_bruteforce.py +++ b/tests/flow/test_bruteforce.py @@ -31,13 +31,17 @@ def __init__(self, data_type, metric, dist_func, np_fuc, dim=16, num_labels=10, self.index.add_vector(vector, i % num_labels) self.vectors.append((i % num_labels, vector)) - def measure_dists(self, k): - dists = [(self.dist_func(self.query.flat, vec), key) for key, vec in self.vectors] + @staticmethod + def calculate_dists(query, vectors, k, dist_func): + dists = [(dist_func(query, vec), key) for key, vec in vectors] dists = sorted(dists)[:k] keys = [key for _, key in dists] dists = [dist for dist, _ in dists] return (keys, dists) + def measure_dists(self, k): + return self.calculate_dists(self.query[0], self.vectors, k, self.dist_func) + def test_sanity_bf(): test_datas = [] @@ -536,3 +540,199 @@ def 
test_bf_float16_multivalue(): assert_allclose(bf_labels, [keys], rtol=1e-5, atol=0) assert_allclose(bf_distances, [dists], rtol=1e-5, atol=0) + +''' +A Class to run common tests for BF index + +The following tests will *automatically* run if the class is inherited: +* test_serialization - single L2 index +* test_L2 - single L2 index +* test_batch_iterator - single L2 index + +The following tests should be *explicitly* called from a method prefixed with test_* +# range_query(dist_func) - single cosine index + +@param create_data_func is a function expects num_elements, dim, [and optional np.random.Generator] as input and +returns a (num_elements, dim) numpy array of vectors +uses multi L2 index +# multi_value(create_data_func, num_per_label) - +''' +class GeneralTest(): + dim = 128 + num_elements = 10_000 + num_queries = 1 + + data_type = None + + rng = np.random.default_rng(seed=42) + vectors_data = None + query_data = None + + # single FLAT index with L2 metric + cache_flat_index_L2_single = None + cached_label_to_vec_list = None + + @classmethod + def create_index(cls, metric = VecSimMetric_L2, is_multi=False): + assert cls.data_type is not None + return create_flat_index(cls.dim, metric, cls.data_type, is_multi=is_multi) + + @classmethod + def create_add_vectors(cls, index): + assert cls.vectors_data is not None + return create_add_vectors(index, cls.vectors_data) + + @classmethod + def get_cached_single_L2_index(cls): + if cls.cache_flat_index_L2_single is None: + cls.cache_flat_index_L2_single = cls.create_index() + cls.cached_label_to_vec_list = cls.create_add_vectors(cls.cache_flat_index_L2_single) + return cls.cache_flat_index_L2_single, cls.cached_label_to_vec_list + + @staticmethod + def compute_correct(res_labels, res_dist, gt_labels, gt_dist_label_list): + correct = 0 + for i, label in enumerate(res_labels): + for j, correct_label in enumerate(gt_labels): + if label == correct_label: + correct += 1 + assert math.isclose(res_dist[i], 
gt_dist_label_list[j]["dist"], rel_tol=1e-5) + break + + return correct + + @classmethod + def knn(cls, index, label_vec_list, dist_func): + k = 10 + keys, dists = Data.calculate_dists(cls.query_data[0], label_vec_list, k, dist_func) + bf_labels, bf_distances = index.knn_query(cls.query_data, k=k) + assert_allclose(bf_labels, [keys], rtol=1e-5, atol=0) + assert_allclose(bf_distances, [dists], rtol=1e-5, atol=0) + print(f"\nsanity test for L2 and {cls.data_type} pass") + + def test_L2(self): + index, label_to_vec_list = self.get_cached_single_L2_index() + self.knn(index, label_to_vec_list, spatial.distance.sqeuclidean) + + def test_batch_iterator(self): + index, _ = self.get_cached_single_L2_index() + # num_elements = self.num_labels + batch_size = 10 + + + batch_iterator = index.create_batch_iterator(self.query_data) + labels_first_batch, distances_first_batch = batch_iterator.get_next_results(batch_size, BY_ID) + for i, _ in enumerate(labels_first_batch[0][:-1]): + # assert sorting by id + assert(labels_first_batch[0][i] < labels_first_batch[0][i+1]) + + _, distances_second_batch = batch_iterator.get_next_results(batch_size, BY_SCORE) + for i, dist in enumerate(distances_second_batch[0][:-1]): + # assert sorting by score + assert(distances_second_batch[0][i] < distances_second_batch[0][i+1]) + # assert that every distance in the second batch is higher than any distance of the first batch + assert(len(distances_first_batch[0][np.where(distances_first_batch[0] > dist)]) == 0) + + # reset + batch_iterator.reset() + + # Run again in batches until depleted + batch_size = 1500 + returned_results_num = 0 + iterations = 0 + start = time.time() + while batch_iterator.has_next(): + iterations += 1 + labels, distances = batch_iterator.get_next_results(batch_size, BY_SCORE) + returned_results_num += len(labels[0]) + + print(f'Total search time for running batches of size {batch_size} for index with {self.num_elements} of dim={self.dim}: {time.time() - start}') + assert 
(returned_results_num == self.num_elements) + assert (iterations == np.ceil(self.num_elements/batch_size)) + + ##### Should be explicitly called ##### + def range_query(self, dist_func): + bfindex = self.create_index(VecSimMetric_Cosine) + label_to_vec_list = self.create_add_vectors(bfindex) + radius = 0.7 + + start = time.time() + bf_labels, bf_distances = bfindex.range_query(self.query_data[0], radius=radius) + end = time.time() + res_num = len(bf_labels[0]) + print(f'\nlookup time for {self.num_elements} vectors with dim={self.dim} took {end - start} seconds, got {res_num} results') + + # Verify that we got exactly all vectors within the range + results, keys = get_ground_truth_results(dist_func, self.query_data[0], label_to_vec_list, res_num) + + assert_allclose(max(bf_distances[0]), results[res_num-1]["dist"], rtol=1e-05) + assert np.array_equal(np.array(bf_labels[0]), np.array(keys)) + assert max(bf_distances[0]) <= radius + # Verify that the next closest vector that hasn't returned is not within the range + assert results[res_num]["dist"] > radius + + # Expect zero results for radius==0 + bf_labels, bf_distances = bfindex.range_query(self.query_data[0], radius=0) + assert len(bf_labels[0]) == 0 + + def multi_value(self, create_data_func, num_per_label = 5): + # num_labels=5_000 + # num_per_label=20 + # num_elements = num_labels * num_per_label + num_labels = self.num_elements // num_per_label + k = 10 + + data = create_data_func(num_labels, self.dim, self.rng) + + index = self.create_index(is_multi=True) + + vectors = [] + for i, vector in enumerate(data): + for _ in range(num_per_label): + index.add_vector(vector, i) + vectors.append((i, vector)) + + dists = {} + for key, vec in vectors: + # Setting or updating the score for each label. + # If it's the first time we calculate a score for a label dists.get(key, dist) + # will return dist so we will choose the actual score the first time. 
+ dist = spatial.distance.sqeuclidean(self.query_data[0], vec) + dists[key] = min(dist, dists.get(key, dist)) + + dists = list(dists.items()) + dists = sorted(dists, key=lambda pair: pair[1])[:k] + keys = [key for key, _ in dists[:k]] + dists = [dist for _, dist in dists[:k]] + + start = time.time() + bf_labels, bf_distances = index.knn_query(self.query_data[0], k=10) + end = time.time() + + print(f'\nlookup time for {self.num_elements} vectors ({num_labels} labels and {num_per_label} vectors per label) with dim={self.dim} took {end - start} seconds') + + assert_allclose(bf_labels, [keys], rtol=1e-5, atol=0) + assert_allclose(bf_distances, [dists], rtol=1e-5, atol=0) + +class TestINT8(GeneralTest): + + GeneralTest.data_type = VecSimType_INT8 + + #### Create vectors + GeneralTest.vectors_data = create_int8_vectors(GeneralTest.num_elements, GeneralTest.dim, GeneralTest.rng) + + #### Create queries + GeneralTest.query_data = create_int8_vectors(GeneralTest.num_queries, GeneralTest.dim, GeneralTest.rng) + + def test_Cosine(self): + + index = self.create_index(VecSimMetric_Cosine) + label_to_vec_list = self.create_add_vectors(index) + + self.knn(index, label_to_vec_list, fp32_expand_and_calc_cosine_dist) + + def test_range_query(self): + self.range_query(fp32_expand_and_calc_cosine_dist) + + def test_multi_value(self): + self.multi_value(create_int8_vectors) diff --git a/tests/flow/test_hnsw.py b/tests/flow/test_hnsw.py index 19b5841ae..9e97ed683 100644 --- a/tests/flow/test_hnsw.py +++ b/tests/flow/test_hnsw.py @@ -903,11 +903,7 @@ def create_index(cls, metric = VecSimMetric_L2, is_multi=False): @classmethod def create_add_vectors(cls, hnsw_index): assert cls.data is not None - label_to_vec_list = [] - for i, vector in enumerate(cls.data): - hnsw_index.add_vector(vector, i) - label_to_vec_list.append((i, vector)) - return label_to_vec_list + return create_add_vectors(hnsw_index, cls.data) @classmethod def get_cached_single_L2_index(cls): @@ -1028,6 +1024,7 @@ def 
test_batch_iterator(self): hnsw_index.set_ef(self.efRuntime) ##### Should be explicitly called ##### + def range_query(self, dist_func): hnsw_index = self.create_index(VecSimMetric_Cosine) label_to_vec_list = self.create_add_vectors(hnsw_index) @@ -1119,22 +1116,14 @@ class TestINT8(GeneralTest): #### Create queries GeneralTest.query_data = create_int8_vectors(GeneralTest.num_queries, GeneralTest.dim, GeneralTest.rng) - @staticmethod - def fp32_expand_and_calc_cosine_dist(a, b): - # stupid numpy doesn't make any intermediate conversions when handling small types - # so we might get overflow. We need to convert to float32 ourselves. - a_float32 = a.astype(np.float32) - b_float32 = b.astype(np.float32) - return spatial.distance.cosine(a_float32, b_float32) - def test_Cosine(self): hnsw_index = self.create_index(VecSimMetric_Cosine) label_to_vec_list = self.create_add_vectors(hnsw_index) - self.knn(hnsw_index, label_to_vec_list, self.fp32_expand_and_calc_cosine_dist) + self.knn(hnsw_index, label_to_vec_list, fp32_expand_and_calc_cosine_dist) def test_range_query(self): - self.range_query(self.fp32_expand_and_calc_cosine_dist) + self.range_query(fp32_expand_and_calc_cosine_dist) def test_multi_value(self): self.multi_value(create_int8_vectors) From 25b3b7a2be38d5c060ae9fa36b477582a1189853 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Sun, 22 Dec 2024 17:23:22 +0000 Subject: [PATCH 5/8] tiered flow tests: * add optional create_data_func to IndexCtx, use for special datatypes * introduce test_create_int8 and test_search_insert_int8 create_int8_vectors expects shape (tuple) --- tests/flow/common.py | 4 +-- tests/flow/test_bruteforce.py | 6 ++-- tests/flow/test_hnsw.py | 6 ++-- tests/flow/test_hnsw_tiered.py | 64 +++++++++++++++++++++++++--------- 4 files changed, 56 insertions(+), 24 deletions(-) diff --git a/tests/flow/common.py b/tests/flow/common.py index c6ec3d6d7..036862065 100644 --- a/tests/flow/common.py +++ b/tests/flow/common.py @@ -79,9 +79,9 @@ def
vec_to_bfloat16(vec): def vec_to_float16(vec): return vec.astype(np.float16) -def create_int8_vectors(num_elements, dim, rng: np.random.Generator = None): +def create_int8_vectors(shape, rng: np.random.Generator = None): rng = np.random.default_rng(seed=42) if rng is None else rng - return rng.integers(low=-128, high=127, size=(num_elements, dim), dtype=np.int8) + return rng.integers(low=-128, high=127, size=shape, dtype=np.int8) def get_ground_truth_results(dist_func, query, vectors, k): results = [{"dist": dist_func(query, vec), "label": key} for key, vec in vectors] diff --git a/tests/flow/test_bruteforce.py b/tests/flow/test_bruteforce.py index 4b812ecfc..c47c4cd7e 100644 --- a/tests/flow/test_bruteforce.py +++ b/tests/flow/test_bruteforce.py @@ -682,7 +682,7 @@ def multi_value(self, create_data_func, num_per_label = 5): num_labels = self.num_elements // num_per_label k = 10 - data = create_data_func(num_labels, self.dim, self.rng) + data = create_data_func((num_labels, self.dim), self.rng) index = self.create_index(is_multi=True) @@ -719,10 +719,10 @@ class TestINT8(GeneralTest): GeneralTest.data_type = VecSimType_INT8 #### Create vectors - GeneralTest.vectors_data = create_int8_vectors(GeneralTest.num_elements, GeneralTest.dim, GeneralTest.rng) + GeneralTest.vectors_data = create_int8_vectors((GeneralTest.num_elements, GeneralTest.dim), GeneralTest.rng) #### Create queries - GeneralTest.query_data = create_int8_vectors(GeneralTest.num_queries, GeneralTest.dim, GeneralTest.rng) + GeneralTest.query_data = create_int8_vectors((GeneralTest.num_queries, GeneralTest.dim), GeneralTest.rng) def test_Cosine(self): diff --git a/tests/flow/test_hnsw.py b/tests/flow/test_hnsw.py index 9e97ed683..f5d6e3fb6 100644 --- a/tests/flow/test_hnsw.py +++ b/tests/flow/test_hnsw.py @@ -1066,7 +1066,7 @@ def multi_value(self, create_data_func, num_per_label = 5): num_labels = self.num_elements // num_per_label k = 10 - data = create_data_func(num_labels, self.dim, self.rng) + data = 
create_data_func((num_labels, self.dim), self.rng) hnsw_index = self.create_index(is_multi=True) @@ -1111,10 +1111,10 @@ class TestINT8(GeneralTest): GeneralTest.data_type = VecSimType_INT8 #### Create vectors - GeneralTest.data = create_int8_vectors(GeneralTest.num_elements, GeneralTest.dim, GeneralTest.rng) + GeneralTest.data = create_int8_vectors((GeneralTest.num_elements, GeneralTest.dim), GeneralTest.rng) #### Create queries - GeneralTest.query_data = create_int8_vectors(GeneralTest.num_queries, GeneralTest.dim, GeneralTest.rng) + GeneralTest.query_data = create_int8_vectors((GeneralTest.num_queries, GeneralTest.dim), GeneralTest.rng) def test_Cosine(self): hnsw_index = self.create_index(VecSimMetric_Cosine) diff --git a/tests/flow/test_hnsw_tiered.py b/tests/flow/test_hnsw_tiered.py index 569c52493..73ac88c45 100644 --- a/tests/flow/test_hnsw_tiered.py +++ b/tests/flow/test_hnsw_tiered.py @@ -12,7 +12,20 @@ def create_tiered_hnsw_params(swap_job_threshold = 0): return tiered_hnsw_params class IndexCtx: - array_conversion_func = {VecSimType_FLOAT32: np.float32, VecSimType_BFLOAT16: vec_to_bfloat16, VecSimType_FLOAT16: vec_to_float16} + array_conversion_func = { + VecSimType_FLOAT32: np.float32, + VecSimType_BFLOAT16: vec_to_bfloat16, + VecSimType_FLOAT16: vec_to_float16, + } + + type_to_dtype = { + VecSimType_FLOAT32: np.float32, + VecSimType_FLOAT64: np.float64, + VecSimType_BFLOAT16: bfloat16, + VecSimType_FLOAT16: np.float16, + VecSimType_INT8: np.int8 + } + def __init__(self, data_size=10000, dim=16, M=16, @@ -23,7 +36,8 @@ def __init__(self, data_size=10000, is_multi=False, num_per_label=1, swap_job_threshold=0, - flat_buffer_size=1024): + flat_buffer_size=1024, + create_data_func = None): self.num_vectors = data_size self.dim = dim self.M = M @@ -38,12 +52,17 @@ def __init__(self, data_size=10000, self.num_labels = int(self.num_vectors/num_per_label) self.rng = np.random.default_rng(seed=47) + self.create_data_func = self.rng.random if create_data_func 
is None else create_data_func data_shape = (self.num_labels, num_per_label, self.dim) if is_multi else (self.num_labels, self.dim) - data = self.rng.random(data_shape) - if self.data_type != VecSimType_FLOAT64: - self.data = self.array_conversion_func[self.data_type](data) - print("data type = ", self.data.dtype) + + + self.data = self.create_data_func(data_shape) + if self.data_type in self.array_conversion_func.keys(): + self.data = self.array_conversion_func[self.data_type](self.data) + print("data type = ", self.data.dtype) + assert self.data.dtype == self.type_to_dtype[self.data_type] + self.hnsw_params = create_hnsw_params(dim = self.dim, num_elements = self.num_vectors, metric = self.metric, @@ -102,18 +121,23 @@ def init_and_populate_hnsw_index(self): return hnsw_index def generate_queries(self, num_queries): - queries = self.rng.random((num_queries, self.dim)) - if self.data_type != VecSimType_FLOAT64: + queries = self.create_data_func((num_queries, self.dim)) + if self.data_type in self.array_conversion_func.keys(): queries = self.array_conversion_func[self.data_type](queries) return queries def get_vectors_memory_size(self): - memory_size = {VecSimType_FLOAT32:4, VecSimType_FLOAT64:8, VecSimType_BFLOAT16:2, VecSimType_FLOAT16:2} + memory_size = { + VecSimType_FLOAT32: 4, + VecSimType_FLOAT64: 8, + VecSimType_BFLOAT16: 2, + VecSimType_FLOAT16: 2, + VecSimType_INT8: 1 + } return bytes_to_mega(self.num_vectors * self.dim * memory_size[self.data_type]) - -def create_tiered_index(is_multi: bool, num_per_label=1, data_type=VecSimType_FLOAT32): - indices_ctx = IndexCtx(data_size=50000, is_multi=is_multi, num_per_label=num_per_label, data_type=data_type) +def create_tiered_index(is_multi: bool, num_per_label=1, data_type=VecSimType_FLOAT32, create_data_func=None): + indices_ctx = IndexCtx(data_size=50000, is_multi=is_multi, num_per_label=num_per_label, data_type=data_type, create_data_func=create_data_func) num_elements = indices_ctx.num_labels index = 
indices_ctx.tiered_index @@ -152,10 +176,10 @@ def create_tiered_index(is_multi: bool, num_per_label=1, data_type=VecSimType_FL print(f"with {threads_num} threads, insertion runtime is {round_(execution_time_ratio)} times better \n") -def search_insert(is_multi: bool, num_per_label=1, data_type=VecSimType_FLOAT32): +def search_insert(is_multi: bool, num_per_label=1, data_type=VecSimType_FLOAT32, create_data_func=None): data_size = 100000 indices_ctx = IndexCtx(data_size=data_size, is_multi=is_multi, num_per_label=num_per_label, - flat_buffer_size=data_size, M=64, data_type=data_type) + flat_buffer_size=data_size, M=64, data_type=data_type, create_data_func=create_data_func) index = indices_ctx.tiered_index num_labels = indices_ctx.num_labels @@ -226,13 +250,17 @@ def test_create_multi(): create_tiered_index(is_multi=True, num_per_label=5) def test_create_bf16(): - print("Test create multi label tiered hnsw index") + print("Test create BFLOAT16 tiered hnsw index") create_tiered_index(is_multi=False, data_type=VecSimType_BFLOAT16) def test_create_fp16(): - print("Test create multi label tiered hnsw index") + print("Test create FLOAT16 tiered hnsw index") create_tiered_index(is_multi=False, data_type=VecSimType_FLOAT16) +def test_create_int8(): + print("Test create INT8 tiered hnsw index") + create_tiered_index(is_multi=False, data_type=VecSimType_INT8, create_data_func=create_int8_vectors) + def test_search_insert(): print(f"\nStart insert & search test") search_insert(is_multi=False) @@ -245,6 +273,10 @@ def test_search_insert_fp16(): print(f"\nStart insert & search test") search_insert(is_multi=False, data_type=VecSimType_FLOAT16) +def test_search_insert_int8(): + print(f"\nStart insert & search test") + search_insert(is_multi=False, data_type=VecSimType_INT8, create_data_func=create_int8_vectors) + def test_search_insert_multi_index(): print(f"\nStart insert & search test for multi index") From 6fd0aed3df494e365f83186959f1c39b4e88bd2b Mon Sep 17 00:00:00 2001 
From: meiravgri Date: Mon, 23 Dec 2024 05:44:15 +0000 Subject: [PATCH 6/8] use query.flat --- tests/flow/common.py | 8 +++++--- tests/flow/test_bruteforce.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/flow/common.py b/tests/flow/common.py index 036862065..4fc00ae8e 100644 --- a/tests/flow/common.py +++ b/tests/flow/common.py @@ -84,7 +84,7 @@ def create_int8_vectors(shape, rng: np.random.Generator = None): return rng.integers(low=-128, high=127, size=shape, dtype=np.int8) def get_ground_truth_results(dist_func, query, vectors, k): - results = [{"dist": dist_func(query, vec), "label": key} for key, vec in vectors] + results = [{"dist": dist_func(query.flat, vec), "label": key} for key, vec in vectors] results = sorted(results, key=lambda x: x["dist"]) keys = [res["label"] for res in results[:k]] @@ -93,6 +93,8 @@ def get_ground_truth_results(dist_func, query, vectors, k): def fp32_expand_and_calc_cosine_dist(a, b): # stupid numpy doesn't make any intermediate conversions when handling small types # so we might get overflow. We need to convert to float32 ourselves. 
- a_float32 = a.astype(np.float32) - b_float32 = b.astype(np.float32) + # a_float32 = a.astype(np.float32) + # b_float32 = b.astype(np.float32) + a_float32 = a + b_float32 = b return spatial.distance.cosine(a_float32, b_float32) diff --git a/tests/flow/test_bruteforce.py b/tests/flow/test_bruteforce.py index c47c4cd7e..ae68a3b38 100644 --- a/tests/flow/test_bruteforce.py +++ b/tests/flow/test_bruteforce.py @@ -33,7 +33,7 @@ def __init__(self, data_type, metric, dist_func, np_fuc, dim=16, num_labels=10, @staticmethod def calculate_dists(query, vectors, k, dist_func): - dists = [(dist_func(query, vec), key) for key, vec in vectors] + dists = [(dist_func(query.flat, vec), key) for key, vec in vectors] dists = sorted(dists)[:k] keys = [key for _, key in dists] dists = [dist for dist, _ in dists] From 358361a0de55a2d55237f304aca40da4a419d572 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Mon, 23 Dec 2024 06:44:05 +0000 Subject: [PATCH 7/8] revert using flat (not helping in int8) fix float16 calling query.flat --- tests/flow/common.py | 8 +++----- tests/flow/test_bruteforce.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/flow/common.py b/tests/flow/common.py index 4fc00ae8e..036862065 100644 --- a/tests/flow/common.py +++ b/tests/flow/common.py @@ -84,7 +84,7 @@ def create_int8_vectors(shape, rng: np.random.Generator = None): return rng.integers(low=-128, high=127, size=shape, dtype=np.int8) def get_ground_truth_results(dist_func, query, vectors, k): - results = [{"dist": dist_func(query.flat, vec), "label": key} for key, vec in vectors] + results = [{"dist": dist_func(query, vec), "label": key} for key, vec in vectors] results = sorted(results, key=lambda x: x["dist"]) keys = [res["label"] for res in results[:k]] @@ -93,8 +93,6 @@ def get_ground_truth_results(dist_func, query, vectors, k): def fp32_expand_and_calc_cosine_dist(a, b): # stupid numpy doesn't make any intermediate conversions when handling small types # so we might get 
overflow. We need to convert to float32 ourselves. - # a_float32 = a.astype(np.float32) - # b_float32 = b.astype(np.float32) - a_float32 = a - b_float32 = b + a_float32 = a.astype(np.float32) + b_float32 = b.astype(np.float32) return spatial.distance.cosine(a_float32, b_float32) diff --git a/tests/flow/test_bruteforce.py b/tests/flow/test_bruteforce.py index ae68a3b38..21f74143b 100644 --- a/tests/flow/test_bruteforce.py +++ b/tests/flow/test_bruteforce.py @@ -495,7 +495,7 @@ def test_bf_float16_range_query(self): print(f'\nlookup time for {self.num_labels} vectors with dim={self.dim} took {end - start} seconds, got {res_num} results') # Verify that we got exactly all vectors within the range - results, keys = get_ground_truth_results(spatial.distance.sqeuclidean, query_data.flat, self.data.vectors, res_num) + results, keys = get_ground_truth_results(spatial.distance.sqeuclidean, query_data[0], self.data.vectors, res_num) assert_allclose(max(bf_distances[0]), results[res_num-1]["dist"], rtol=1e-05) assert np.array_equal(np.array(bf_labels[0]), np.array(keys)) From 9ac3449c3247936371dd8d06ecba9478dbee42a2 Mon Sep 17 00:00:00 2001 From: meiravgri Date: Mon, 23 Dec 2024 07:38:31 +0000 Subject: [PATCH 8/8] revert changes in Data class in bf tests revert test_bf_float16_range_query change --- tests/flow/test_bruteforce.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/flow/test_bruteforce.py b/tests/flow/test_bruteforce.py index 21f74143b..ee7b7a5b0 100644 --- a/tests/flow/test_bruteforce.py +++ b/tests/flow/test_bruteforce.py @@ -31,17 +31,13 @@ def __init__(self, data_type, metric, dist_func, np_fuc, dim=16, num_labels=10, self.index.add_vector(vector, i % num_labels) self.vectors.append((i % num_labels, vector)) - @staticmethod - def calculate_dists(query, vectors, k, dist_func): - dists = [(dist_func(query.flat, vec), key) for key, vec in vectors] + def measure_dists(self, k): + dists = [(self.dist_func(self.query.flat, 
vec), key) for key, vec in self.vectors] dists = sorted(dists)[:k] keys = [key for _, key in dists] dists = [dist for dist, _ in dists] return (keys, dists) - def measure_dists(self, k): - return self.calculate_dists(self.query[0], self.vectors, k, self.dist_func) - def test_sanity_bf(): test_datas = [] @@ -495,7 +491,7 @@ def test_bf_float16_range_query(self): print(f'\nlookup time for {self.num_labels} vectors with dim={self.dim} took {end - start} seconds, got {res_num} results') # Verify that we got exactly all vectors within the range - results, keys = get_ground_truth_results(spatial.distance.sqeuclidean, query_data[0], self.data.vectors, res_num) + results, keys = get_ground_truth_results(spatial.distance.sqeuclidean, query_data.flat, self.data.vectors, res_num) assert_allclose(max(bf_distances[0]), results[res_num-1]["dist"], rtol=1e-05) assert np.array_equal(np.array(bf_labels[0]), np.array(keys)) @@ -604,10 +600,12 @@ def compute_correct(res_labels, res_dist, gt_labels, gt_dist_label_list): @classmethod def knn(cls, index, label_vec_list, dist_func): k = 10 - keys, dists = Data.calculate_dists(cls.query_data[0], label_vec_list, k, dist_func) + + results, keys = get_ground_truth_results(dist_func, cls.query_data[0], label_vec_list, k) + dists = [res["dist"] for res in results] bf_labels, bf_distances = index.knn_query(cls.query_data, k=k) assert_allclose(bf_labels, [keys], rtol=1e-5, atol=0) - assert_allclose(bf_distances, [dists], rtol=1e-5, atol=0) + assert_allclose(bf_distances, [dists[:k]], rtol=1e-5, atol=0) print(f"\nsanity test for L2 and {cls.data_type} pass") def test_L2(self):