From 5adcf9308627cbc27d62919e334d67032fbf33e7 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Tue, 29 Apr 2025 12:26:42 +0200 Subject: [PATCH 1/6] Real vectors for test --- tests/congruence_tests/test_common.py | 6 ++--- tests/congruence_tests/test_discovery.py | 6 ++++- tests/congruence_tests/test_recommendation.py | 6 ++++- tests/congruence_tests/test_search.py | 11 +++++++--- tests/fixtures/points.py | 22 ++++++++++++++----- 5 files changed, 37 insertions(+), 14 deletions(-) diff --git a/tests/congruence_tests/test_common.py b/tests/congruence_tests/test_common.py index d5d7252f4..fe251989a 100644 --- a/tests/congruence_tests/test_common.py +++ b/tests/congruence_tests/test_common.py @@ -14,9 +14,9 @@ COLLECTION_NAME = "congruence_test_collection" # dense vectors sizes -text_vector_size = 50 -image_vector_size = 100 -code_vector_size = 80 +text_vector_size = 384 +image_vector_size = 384 +code_vector_size = 384 # sparse vectors sizes sparse_text_vector_size = 100 diff --git a/tests/congruence_tests/test_discovery.py b/tests/congruence_tests/test_discovery.py index f3803dacf..e584442f3 100644 --- a/tests/congruence_tests/test_discovery.py +++ b/tests/congruence_tests/test_discovery.py @@ -21,7 +21,11 @@ def random_vector(dims: int) -> list[float]: - return np.random.random(dims).round(3).tolist() + _text_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) + _text_vectors_unique = np.unique(_text_vectors, axis=0) + _text_vectors = _text_vectors_unique.tolist() + sampled_vectors = np.random.choice(len(_text_vectors), size=1, replace=False) + return sampled_vectors[0].tolist() @pytest.fixture(scope="module") diff --git a/tests/congruence_tests/test_recommendation.py b/tests/congruence_tests/test_recommendation.py index 2ad388572..1eccc11d4 100644 --- a/tests/congruence_tests/test_recommendation.py +++ b/tests/congruence_tests/test_recommendation.py @@ -24,7 +24,11 @@ class TestSimpleRecommendation: __test__ = False def __init__(self): - self.query_image = np.random.random(image_vector_size).tolist() + _text_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) + _text_vectors_unique = np.unique(_text_vectors, axis=0) + _text_vectors = _text_vectors_unique.tolist() + sampled_vectors = np.random.choice(len(_text_vectors), size=1, replace=False) + self.query_image = sampled_vectors[0].tolist() @classmethod def simple_recommend_image(cls, client: QdrantBase) -> list[models.ScoredPoint]: diff --git a/tests/congruence_tests/test_search.py b/tests/congruence_tests/test_search.py index 28213a935..887170cd3 100644 --- a/tests/congruence_tests/test_search.py +++ b/tests/congruence_tests/test_search.py @@ -22,9 +22,14 @@ class TestSimpleSearcher: __test__ = False def __init__(self): - self.query_text = np.random.random(text_vector_size).tolist() - self.query_image = np.random.random(image_vector_size).tolist() - self.query_code = np.random.random(code_vector_size).tolist() + _text_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) + _text_vectors_unique = np.unique(_text_vectors, axis=0) + _text_vectors = _text_vectors_unique.tolist() + sampled_vectors = np.random.choice(len(_text_vectors), size=3, replace=False) + + self.query_text = _text_vectors[sampled_vectors[0]] + self.query_image = _text_vectors[sampled_vectors[1]] + self.query_code = _text_vectors[sampled_vectors[2]] def simple_search_text(self, client: QdrantBase) -> list[models.ScoredPoint]: return client.search( diff --git a/tests/fixtures/points.py b/tests/fixtures/points.py index acb9c0c2c..cb75f424a 100644 --- a/tests/fixtures/points.py +++ b/tests/fixtures/points.py @@ -10,16 +10,24 @@ from qdrant_client.local.sparse import validate_sparse_vector from tests.fixtures.payload import one_random_payload_please +_text_vectors = np.load("data/text.npy") +_text_vectors_unique = np.unique(_text_vectors, axis=0) +_text_vectors = _text_vectors_unique.tolist() -def random_vectors( - vector_sizes: Union[dict[str, int], int], -) -> models.VectorStruct: + +def random_vectors(vector_sizes: Union[dict[str, int], int], idx=None) -> models.VectorStruct: if isinstance(vector_sizes, int): - return np.random.random(vector_sizes).round(3).tolist() + if idx: + return _text_vectors[idx] + else: + return np.random.random(vector_sizes).round(3).tolist() elif isinstance(vector_sizes, dict): vectors = {} for vector_name, vector_size in vector_sizes.items(): - vectors[vector_name] = np.random.random(vector_size).round(3).tolist() + if idx: + vectors[vector_name] = _text_vectors[idx] + else: + vectors[vector_name] = np.random.random(vector_size).round(3).tolist() return vectors else: raise ValueError("vector_sizes must be int or dict") @@ -100,7 +108,9 @@ def generate_points( if skip_vectors and isinstance(vector_sizes, int): raise ValueError("skip_vectors is not supported for single vector") + sampled_vectors = np.random.choice(len(_text_vectors), size=num_points, replace=False) points = [] + for i in range(num_points): payload = None if with_payload: @@ -115,7 +125,7 @@ def generate_points( elif multivector: vectors = random_multivectors(vector_sizes) else: - vectors = random_vectors(vector_sizes) + vectors = random_vectors(vector_sizes, sampled_vectors[i]) if skip_vectors: if random.random() > 0.8: From 1073e87d7b51449338a65cbfcb564c7f7336de90 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Tue, 6 May 2025 11:30:59 +0200 Subject: [PATCH 2/6] Lone vectors as real vectors --- tests/congruence_tests/test_collections.py | 2 +- tests/congruence_tests/test_common.py | 6 ++--- tests/congruence_tests/test_discovery.py | 7 ++--- tests/congruence_tests/test_group_search.py | 19 +++++++------- tests/congruence_tests/test_query.py | 18 ++++++++----- tests/congruence_tests/test_query_batch.py | 26 +++++++++---------- tests/congruence_tests/test_recommendation.py | 8 ++---- 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/tests/congruence_tests/test_collections.py b/tests/congruence_tests/test_collections.py index 48aca6198..1b65bef83 100644 --- a/tests/congruence_tests/test_collections.py +++ b/tests/congruence_tests/test_collections.py @@ -101,7 +101,7 @@ def test_collection_exists(): def test_init_from(): - vector_size = 2 + vector_size = 384 remote_client = init_remote() local_client = init_local() diff --git a/tests/congruence_tests/test_common.py b/tests/congruence_tests/test_common.py index fe251989a..5b2b3b004 100644 --- a/tests/congruence_tests/test_common.py +++ b/tests/congruence_tests/test_common.py @@ -19,9 +19,9 @@ code_vector_size = 384 # sparse vectors sizes -sparse_text_vector_size = 100 -sparse_image_vector_size = 1_000 -sparse_code_vector_size = 10_000 +sparse_text_vector_size = 384 +sparse_image_vector_size = 384 +sparse_code_vector_size = 384 # number of vectors to generate NUM_VECTORS = 1000 diff --git a/tests/congruence_tests/test_discovery.py b/tests/congruence_tests/test_discovery.py index e584442f3..23b75b289 100644 --- a/tests/congruence_tests/test_discovery.py +++ b/tests/congruence_tests/test_discovery.py @@ -16,16 +16,13 @@ init_remote, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import sample_queries secondary_collection_name = "congruence_secondary_collection" def random_vector(dims: int) -> list[float]: - _text_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) - _text_vectors_unique = np.unique(_text_vectors, axis=0) - _text_vectors = _text_vectors_unique.tolist() - sampled_vectors = np.random.choice(len(_text_vectors), size=1, replace=False) - return sampled_vectors[0].tolist() + return sample_queries(1)[0] @pytest.fixture(scope="module") diff --git a/tests/congruence_tests/test_group_search.py b/tests/congruence_tests/test_group_search.py index dbe01f1ca..4c3252584 100644 --- a/tests/congruence_tests/test_group_search.py +++ b/tests/congruence_tests/test_group_search.py @@ -18,6 +18,7 @@ text_vector_size, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import sample_queries LOOKUP_COLLECTION_NAME = "lookup_collection" @@ -26,9 +27,10 @@ class TestGroupSearcher: __test__ = False def __init__(self): - self.query_text = np.random.random(text_vector_size).tolist() - self.query_image = np.random.random(image_vector_size).tolist() - self.query_code = np.random.random(code_vector_size).tolist() + queries = sample_queries(3) + self.query_text = queries[0] + self.query_image = queries[1] + self.query_code = queries[2] self.group_by = "rand_digit" self.group_size = 1 self.limit = 10 @@ -217,8 +219,8 @@ def group_by_keys(): def test_group_search_types(): - fixture_points = generate_fixtures(vectors_sizes=50) - vectors_config = models.VectorParams(size=50, distance=models.Distance.EUCLID) + fixture_points = generate_fixtures(vectors_sizes=text_vector_size) + vectors_config = models.VectorParams(size=text_vector_size, distance=models.Distance.EUCLID) searcher = TestGroupSearcher() @@ -228,17 +230,16 @@ def test_group_search_types(): remote_client = init_remote() init_client(remote_client, fixture_points, vectors_config=vectors_config) - query_vector_np = np.random.random(text_vector_size) + query_vector_np = sample_queries(1)[0] compare_client_results( local_client, remote_client, searcher.group_search, - query_vector=query_vector_np, + query_vector=np.array(query_vector_np), ) - query_vector_list = query_vector_np.tolist() compare_client_results( - local_client, remote_client, searcher.group_search, query_vector=query_vector_list + local_client, remote_client, searcher.group_search, query_vector=query_vector_np ) delete_fixture_collection(local_client) diff --git a/tests/congruence_tests/test_query.py b/tests/congruence_tests/test_query.py index 85d270bd2..eef3b961b 100644 --- a/tests/congruence_tests/test_query.py +++ b/tests/congruence_tests/test_query.py @@ -33,6 +33,7 @@ generate_random_multivector, ) from tests.utils import read_version +from tests.fixtures.points import sample_queries SECONDARY_COLLECTION_NAME = "congruence_secondary_collection" @@ -46,12 +47,15 @@ def __init__(self): self.group_size = 3 self.limit = 2 # number of groups + sampled_queries = sample_queries(4) + self.query_image = sampled_queries[0] + # dense query vectors - self.dense_vector_query_text = np.random.random(text_vector_size).tolist() - self.dense_vector_query_text_bis = self.dense_vector_query_text + self.dense_vector_query_text = sampled_queries[1] + self.dense_vector_query_text_bis = sampled_queries[1] self.dense_vector_query_text_bis[0] += 42.0 # slightly different vector - self.dense_vector_query_image = np.random.random(image_vector_size).tolist() - self.dense_vector_query_code = np.random.random(code_vector_size).tolist() + self.dense_vector_query_image = sampled_queries[2] + self.dense_vector_query_code = sampled_queries[3] # sparse query vectors self.sparse_vector_query_text = generate_random_sparse_vector( @@ -1458,9 +1462,11 @@ def test_original_input_persistence(): # the reason was that we were replacing point id with a sparse vector, and then, when we needed a dense vector # from the same point id, we already had point id replaced with a sparse vector num_points = 50 - vectors_config = {"text": models.VectorParams(size=50, distance=models.Distance.COSINE)} + vectors_config = { + "text": models.VectorParams(size=text_vector_size, distance=models.Distance.COSINE) + } sparse_vectors_config = {"sparse-text": models.SparseVectorParams()} - fixture_points = generate_fixtures(vectors_sizes={"text": 50}, num=num_points) + fixture_points = generate_fixtures(vectors_sizes={"text": text_vector_size}, num=num_points) sparse_fixture_points = generate_sparse_fixtures(num=num_points) points = [ models.PointStruct( diff --git a/tests/congruence_tests/test_query_batch.py b/tests/congruence_tests/test_query_batch.py index 2f8376a73..09342072d 100644 --- a/tests/congruence_tests/test_query_batch.py +++ b/tests/congruence_tests/test_query_batch.py @@ -19,7 +19,11 @@ generate_multivector_fixtures, multi_vector_config, ) -from tests.fixtures.points import generate_random_sparse_vector, generate_random_multivector +from tests.fixtures.points import ( + generate_random_sparse_vector, + generate_random_multivector, + sample_queries, +) class TestQueryBatchSearcher: @@ -39,12 +43,11 @@ def __init__(self): self.multivector_query_batch_code = [] for _ in range(4): + vecs = sample_queries(4) self.dense_vector_query_batch_text.append( models.QueryRequest( - query=np.random.random(text_vector_size).tolist(), - prefetch=models.Prefetch( - query=np.random.random(text_vector_size).tolist(), limit=5, using="text" - ), + query=vecs[0], + prefetch=models.Prefetch(query=vecs[1], limit=5, using="text"), limit=5, using="text", with_payload=True, @@ -52,7 +55,7 @@ def __init__(self): ) self.dense_vector_query_batch_image.append( models.QueryRequest( - query=np.random.random(image_vector_size).tolist(), + query=vecs[2], limit=5, using="image", with_payload=True, @@ -60,7 +63,7 @@ def __init__(self): ) self.dense_vector_query_batch_code.append( models.QueryRequest( - query=np.random.random(code_vector_size).tolist(), + query=vecs[3], limit=5, using="code", with_payload=True, @@ -101,16 +104,13 @@ def __init__(self): ) ) + vecs = sample_queries(2) self.dense_vector_query_batch_text_dbsf = [ models.QueryRequest( query=models.FusionQuery(fusion=models.Fusion.DBSF), prefetch=[ - models.Prefetch( - query=np.random.random(text_vector_size).tolist(), using="text" - ), - models.Prefetch( - query=np.random.random(text_vector_size).tolist(), using="text" - ), + models.Prefetch(query=vecs[0], using="text"), + models.Prefetch(query=vecs[1], using="text"), ], with_payload=True, ) diff --git a/tests/congruence_tests/test_recommendation.py b/tests/congruence_tests/test_recommendation.py index 1eccc11d4..dba1c3bba 100644 --- a/tests/congruence_tests/test_recommendation.py +++ b/tests/congruence_tests/test_recommendation.py @@ -16,6 +16,7 @@ init_remote, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import sample_queries secondary_collection_name = "congruence_secondary_collection" @@ -24,11 +25,7 @@ class TestSimpleRecommendation: __test__ = False def __init__(self): - _text_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) - _text_vectors_unique = np.unique(_text_vectors, axis=0) - _text_vectors = _text_vectors_unique.tolist() - sampled_vectors = np.random.choice(len(_text_vectors), size=1, replace=False) - self.query_image = sampled_vectors[0].tolist() + self.query_image = sample_queries(1)[0] @classmethod def simple_recommend_image(cls, client: QdrantBase) -> list[models.ScoredPoint]: @@ -295,7 +292,6 @@ def test_recommend_from_another_collection(): def test_simple_recommend() -> None: fixture_points = generate_fixtures() - secondary_collection_points = generate_fixtures(100) searcher = TestSimpleRecommendation() From 5319ed777454a7b0fc5aeceed38aa84c91a12bc2 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Tue, 6 May 2025 11:34:14 +0200 Subject: [PATCH 3/6] Real vectors data --- tests/fixtures/points.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/fixtures/points.py b/tests/fixtures/points.py index cb75f424a..4830cd401 100644 --- a/tests/fixtures/points.py +++ b/tests/fixtures/points.py @@ -10,22 +10,30 @@ from qdrant_client.local.sparse import validate_sparse_vector from tests.fixtures.payload import one_random_payload_please -_text_vectors = np.load("data/text.npy") +_text_vectors = np.load("data/text.npy", mmap_mode="r") _text_vectors_unique = np.unique(_text_vectors, axis=0) -_text_vectors = _text_vectors_unique.tolist() +_text_vectors_clean = _text_vectors_unique[~np.isnan(_text_vectors_unique).any(axis=1)].tolist() + + +def sample_queries(n: int) -> list[np.array]: + _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) + _query_vectors_unique = np.unique(_query_vectors, axis=0) + _query_vectors = _query_vectors_unique.tolist() + sampled_vectors = np.random.choice(len(_query_vectors), size=n, replace=False) + return [_query_vectors[i] for i in sampled_vectors] def random_vectors(vector_sizes: Union[dict[str, int], int], idx=None) -> models.VectorStruct: if isinstance(vector_sizes, int): if idx: - return _text_vectors[idx] + return _text_vectors_clean[idx] else: return np.random.random(vector_sizes).round(3).tolist() elif isinstance(vector_sizes, dict): vectors = {} for vector_name, vector_size in vector_sizes.items(): if idx: - vectors[vector_name] = _text_vectors[idx] + vectors[vector_name] = _text_vectors_clean[idx] else: vectors[vector_name] = np.random.random(vector_size).round(3).tolist() return vectors @@ -36,12 +44,12 @@ def random_vectors(vector_sizes: Union[dict[str, int], int], idx=None) -> models def random_multivectors(vector_sizes: Union[dict[str, int], int]) -> models.VectorStruct: if isinstance(vector_sizes, int): vec_count = random.randint(1, 10) - return generate_random_multivector(vector_sizes, vec_count) + return sample_random_multivector(vector_sizes, vec_count) elif isinstance(vector_sizes, dict): vectors = {} for vector_name, vector_size in vector_sizes.items(): vec_count = random.randint(1, 10) - vectors[vector_name] = generate_random_multivector(vector_size, vec_count) + vectors[vector_name] = sample_random_multivector(vector_size, vec_count) return vectors else: raise ValueError("vector_sizes must be int or dict") @@ -54,6 +62,11 @@ def generate_random_multivector(vec_size: int, vec_count: int) -> list[list[floa return multivec +def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]: + sampled_vectors = np.random.choice(len(_text_vectors_clean), size=vec_count, replace=False) + return [_text_vectors_clean[i] for i in sampled_vectors] + + # Generate random sparse vector with given size and density # The density is the probability of non-zero value over the whole vector def generate_random_sparse_vector(size: int, density: float) -> SparseVector: @@ -108,7 +121,7 @@ def generate_points( if skip_vectors and isinstance(vector_sizes, int): raise ValueError("skip_vectors is not supported for single vector") - sampled_vectors = np.random.choice(len(_text_vectors), size=num_points, replace=False) + sampled_vectors = np.random.choice(len(_text_vectors_clean), size=num_points, replace=False) points = [] for i in range(num_points): From 888d94e9b43a23de8c2d3543e028b5abd61ee8c6 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Tue, 6 May 2025 14:51:25 +0200 Subject: [PATCH 4/6] Multivector draft --- tests/congruence_tests/test_common.py | 6 +++--- tests/congruence_tests/test_group_search.py | 5 +++-- .../test_multivector_discovery_queries.py | 16 ++++++---------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/tests/congruence_tests/test_common.py b/tests/congruence_tests/test_common.py index 5b2b3b004..fe251989a 100644 --- a/tests/congruence_tests/test_common.py +++ b/tests/congruence_tests/test_common.py @@ -19,9 +19,9 @@ code_vector_size = 384 # sparse vectors sizes -sparse_text_vector_size = 384 -sparse_image_vector_size = 384 -sparse_code_vector_size = 384 +sparse_text_vector_size = 100 +sparse_image_vector_size = 1_000 +sparse_code_vector_size = 10_000 # number of vectors to generate NUM_VECTORS = 1000 diff --git a/tests/congruence_tests/test_group_search.py b/tests/congruence_tests/test_group_search.py index 4c3252584..cad1155b2 100644 --- a/tests/congruence_tests/test_group_search.py +++ b/tests/congruence_tests/test_group_search.py @@ -230,12 +230,13 @@ def test_group_search_types(): remote_client = init_remote() init_client(remote_client, fixture_points, vectors_config=vectors_config) - query_vector_np = sample_queries(1)[0] + query_vector_list = sample_queries(1)[0] + query_vector_np = np.array(query_vector_list) compare_client_results( local_client, remote_client, searcher.group_search, - query_vector=np.array(query_vector_np), + query_vector=query_vector_np, ) compare_client_results( diff --git a/tests/congruence_tests/test_multivector_discovery_queries.py b/tests/congruence_tests/test_multivector_discovery_queries.py index 74dede340..12285a11b 100644 --- a/tests/congruence_tests/test_multivector_discovery_queries.py +++ b/tests/congruence_tests/test_multivector_discovery_queries.py @@ -17,7 +17,7 @@ init_remote, multi_vector_config, ) -from tests.fixtures.points import generate_random_multivector +from tests.fixtures.points import sample_random_multivector secondary_collection_name = "congruence_secondary_collection" @@ -126,10 +126,10 @@ def test_context_many_pairs( http_client, grpc_client, ): - random_image_multivector_1 = generate_random_multivector( + random_image_multivector_1 = sample_random_multivector( image_vector_size, random.randint(2, 30) ) - random_image_multivector_2 = generate_random_multivector( + random_image_multivector_2 = sample_random_multivector( image_vector_size, random.randint(2, 30) ) @@ -227,9 +227,7 @@ def test_discover_raw_target( http_client, grpc_client, ): - random_image_multivector = generate_random_multivector( - image_vector_size, random.randint(2, 30) - ) + random_image_multivector = sample_random_multivector(image_vector_size, random.randint(2, 30)) def f(client: QdrantBase, **kwargs: dict[str, Any]) -> list[models.ScoredPoint]: return client.query_points( @@ -253,9 +251,7 @@ def test_context_raw_positive( http_client, grpc_client, ): - random_image_multivector = generate_random_multivector( - image_vector_size, random.randint(2, 30) - ) + random_image_multivector = sample_random_multivector(image_vector_size, random.randint(2, 30)) def f(client: QdrantBase, **kwargs: dict[str, Any]) -> list[models.ScoredPoint]: return client.query_points( @@ -416,7 +412,7 @@ def f(client: QdrantBase, **kwargs: dict[str, Any]) -> list[list[models.ScoredPo def test_query_with_nan(): fixture_points = generate_multivector_fixtures(20) - vector = generate_random_multivector(image_vector_size, random.randint(2, 30)) + vector = sample_random_multivector(image_vector_size, random.randint(2, 30)) vector[0][1] = np.nan using = "multi-image" From 5d304e86e640e979b7376b802bb134cb1fa4fe88 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 22 May 2025 12:46:21 +0200 Subject: [PATCH 5/6] Multivector draft --- tests/congruence_tests/test_collections.py | 4 +-- tests/congruence_tests/test_common.py | 8 +++-- tests/embed_tests/test_local_inference.py | 4 +-- tests/fixtures/points.py | 39 +++++++++++++++------- tests/test_fastembed.py | 5 +-- 5 files changed, 39 insertions(+), 21 deletions(-) diff --git a/tests/congruence_tests/test_collections.py b/tests/congruence_tests/test_collections.py index 1b65bef83..5f8e69e77 100644 --- a/tests/congruence_tests/test_collections.py +++ b/tests/congruence_tests/test_collections.py @@ -12,7 +12,7 @@ init_local, init_remote, ) - +from tests.congruence_tests.test_common import text_vector_size COLLECTION_NAME = "test_collection" @@ -101,7 +101,7 @@ def test_collection_exists(): def test_init_from(): - vector_size = 384 + vector_size = text_vector_size remote_client = init_remote() local_client = init_local() diff --git a/tests/congruence_tests/test_common.py b/tests/congruence_tests/test_common.py index fe251989a..9221ae7fd 100644 --- a/tests/congruence_tests/test_common.py +++ b/tests/congruence_tests/test_common.py @@ -11,12 +11,14 @@ from tests.congruence_tests.settings import TIMEOUT from tests.fixtures.points import generate_points +from tests.fixtures.points import text_vector_size + COLLECTION_NAME = "congruence_test_collection" # dense vectors sizes -text_vector_size = 384 -image_vector_size = 384 -code_vector_size = 384 +text_vector_size = text_vector_size # todo 384 +image_vector_size = text_vector_size # todo 384 +code_vector_size = text_vector_size # todo 384 # sparse vectors sizes sparse_text_vector_size = 100 diff --git a/tests/embed_tests/test_local_inference.py b/tests/embed_tests/test_local_inference.py index eb8353caa..4a3cab324 100644 --- a/tests/embed_tests/test_local_inference.py +++ b/tests/embed_tests/test_local_inference.py @@ -17,11 +17,11 @@ LateInteractionTextEmbedding, ImageEmbedding, ) - +from tests.congruence_tests.test_common import text_vector_size COLLECTION_NAME = "inference_collection" DENSE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" -DENSE_DIM = 384 +DENSE_DIM = text_vector_size SPARSE_MODEL_NAME = "Qdrant/bm42-all-minilm-l6-v2-attentions" COLBERT_MODEL_NAME = "answerdotai/answerai-colbert-small-v1" COLBERT_DIM = 96 diff --git a/tests/fixtures/points.py b/tests/fixtures/points.py index 4830cd401..f00e622ca 100644 --- a/tests/fixtures/points.py +++ b/tests/fixtures/points.py @@ -10,32 +10,36 @@ from qdrant_client.local.sparse import validate_sparse_vector from tests.fixtures.payload import one_random_payload_please -_text_vectors = np.load("data/text.npy", mmap_mode="r") +text_vector_size = 20 + +_text_vectors = np.load("data/text.npy", mmap_mode="r")[..., :text_vector_size] _text_vectors_unique = np.unique(_text_vectors, axis=0) _text_vectors_clean = _text_vectors_unique[~np.isnan(_text_vectors_unique).any(axis=1)].tolist() def sample_queries(n: int) -> list[np.array]: - _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) + _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)[ + ..., :text_vector_size + ] _query_vectors_unique = np.unique(_query_vectors, axis=0) _query_vectors = _query_vectors_unique.tolist() sampled_vectors = np.random.choice(len(_query_vectors), size=n, replace=False) - return [_query_vectors[i] for i in sampled_vectors] + return [_query_vectors[i].copy() for i in sampled_vectors] def random_vectors(vector_sizes: Union[dict[str, int], int], idx=None) -> models.VectorStruct: if isinstance(vector_sizes, int): if idx: - return _text_vectors_clean[idx] + return _text_vectors_clean[idx].copy() else: - return np.random.random(vector_sizes).round(3).tolist() + return np.random.random(vector_sizes).tolist() # .round(3) elif isinstance(vector_sizes, dict): vectors = {} for vector_name, vector_size in vector_sizes.items(): if idx: - vectors[vector_name] = _text_vectors_clean[idx] + vectors[vector_name] = _text_vectors_clean[idx].copy() else: - vectors[vector_name] = np.random.random(vector_size).round(3).tolist() + vectors[vector_name] = np.random.random(vector_size).tolist() # .round(3) return vectors else: raise ValueError("vector_sizes must be int or dict") @@ -58,13 +62,14 @@ def random_multivectors(vector_sizes: Union[dict[str, int], int]) -> models.Vect def generate_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]: multivec = [] for _ in range(vec_count): - multivec.append(np.random.random(vec_size).round(3).tolist()) + multivec.append(np.random.random(vec_size).tolist()) # .round(3). return multivec def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]: - sampled_vectors = np.random.choice(len(_text_vectors_clean), size=vec_count, replace=False) - return [_text_vectors_clean[i] for i in sampled_vectors] + doc_vectors = _text_vectors_clean.copy() + sampled_vectors = np.random.choice(len(doc_vectors), size=vec_count, replace=False) + return [np.array(doc_vectors[i]).astype(np.float32).tolist() for i in sampled_vectors] # Generate random sparse vector with given size and density @@ -72,7 +77,8 @@ def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float] def generate_random_sparse_vector(size: int, density: float) -> SparseVector: num_non_zero = int(size * density) indices: list[int] = random.sample(range(size), num_non_zero) - values: list[float] = [round(random.random(), 6) for _ in range(num_non_zero)] + values: list[float] = [random.random() for _ in range(num_non_zero)] + sparse_vector = SparseVector(indices=indices, values=values) validate_sparse_vector(sparse_vector) return sparse_vector @@ -121,7 +127,8 @@ def generate_points( if skip_vectors and isinstance(vector_sizes, int): raise ValueError("skip_vectors is not supported for single vector") - sampled_vectors = np.random.choice(len(_text_vectors_clean), size=num_points, replace=False) + doc_vectors = _text_vectors_clean.copy() + sampled_vectors = np.random.choice(len(doc_vectors), size=num_points, replace=False) points = [] for i in range(num_points): @@ -137,6 +144,14 @@ def generate_points( vectors = random_sparse_vectors(vector_sizes, even=even_sparse) elif multivector: vectors = random_multivectors(vector_sizes) + if isinstance(vectors, dict): + for name, vec in vectors.items(): + assert np.array(vec).dtype.kind in ("f", "i") # float or int + assert not np.isnan(np.array(vec)).any() + else: + for name, vec in enumerate(vectors): + assert np.array(vec).dtype.kind in ("f", "i") # float or int + assert not np.isnan(np.array(vec)).any() else: vectors = random_vectors(vector_sizes, sampled_vectors[i]) diff --git a/tests/test_fastembed.py b/tests/test_fastembed.py index d089dd9bc..ef8216bcd 100644 --- a/tests/test_fastembed.py +++ b/tests/test_fastembed.py @@ -5,6 +5,7 @@ from tests.congruence_tests.test_common import compare_client_results from tests.utils import read_version +from tests.congruence_tests.test_common import text_vector_size DOCS_EXAMPLE = { @@ -126,7 +127,7 @@ def test_set_model(): # Check if the model is initialized & cls.embeddings_models is set with expected values dim, dist = local_client._get_model_params(embedding_model_name) - assert dim == 384 + assert dim == text_vector_size # Use the initialized model to add documents with vector embeddings local_client.add(collection_name=collection_name, **DOCS_EXAMPLE) @@ -209,7 +210,7 @@ def test_get_embedding_size(): if not local_client._FASTEMBED_INSTALLED: pytest.skip("FastEmbed is not installed, skipping test") - assert local_client.get_embedding_size() == 384 + assert local_client.get_embedding_size() == text_vector_size assert local_client.get_embedding_size(model_name="BAAI/bge-base-en-v1.5") == 768 From 7c577db7a1e8955f28e13161a018f7f6c84422d9 Mon Sep 17 00:00:00 2001 From: "d.rudenko" Date: Thu, 22 May 2025 13:14:11 +0200 Subject: [PATCH 6/6] Fixed size inconsistency --- tests/congruence_tests/test_search.py | 5 ++--- tests/fixtures/points.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/congruence_tests/test_search.py b/tests/congruence_tests/test_search.py index 887170cd3..3b95bc2d4 100644 --- a/tests/congruence_tests/test_search.py +++ b/tests/congruence_tests/test_search.py @@ -16,15 +16,14 @@ text_vector_size, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import generate_vectors class TestSimpleSearcher: __test__ = False def __init__(self): - _text_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32) - _text_vectors_unique = np.unique(_text_vectors, axis=0) - _text_vectors = _text_vectors_unique.tolist() + _text_vectors = generate_vectors() sampled_vectors = np.random.choice(len(_text_vectors), size=3, replace=False) self.query_text = _text_vectors[sampled_vectors[0]] diff --git a/tests/fixtures/points.py b/tests/fixtures/points.py index f00e622ca..5c7453418 100644 --- a/tests/fixtures/points.py +++ b/tests/fixtures/points.py @@ -12,9 +12,17 @@ text_vector_size = 20 -_text_vectors = np.load("data/text.npy", mmap_mode="r")[..., :text_vector_size] -_text_vectors_unique = np.unique(_text_vectors, axis=0) -_text_vectors_clean = _text_vectors_unique[~np.isnan(_text_vectors_unique).any(axis=1)].tolist() + +def generate_vectors(): + _text_vectors = np.load("data/text.npy", mmap_mode="r")[..., :text_vector_size] + _text_vectors_unique = np.unique(_text_vectors, axis=0) + _text_vectors_clean = _text_vectors_unique[ + ~np.isnan(_text_vectors_unique).any(axis=1) + ].tolist() + return _text_vectors_clean + + +_text_vectors_clean = generate_vectors() def sample_queries(n: int) -> list[np.array]: