diff --git a/tests/congruence_tests/test_collections.py b/tests/congruence_tests/test_collections.py index 48aca619..5f8e69e7 100644 --- a/tests/congruence_tests/test_collections.py +++ b/tests/congruence_tests/test_collections.py @@ -12,7 +12,7 @@ init_local, init_remote, ) - +from tests.congruence_tests.test_common import text_vector_size COLLECTION_NAME = "test_collection" @@ -101,7 +101,7 @@ def test_collection_exists(): def test_init_from(): - vector_size = 2 + vector_size = text_vector_size remote_client = init_remote() local_client = init_local() diff --git a/tests/congruence_tests/test_common.py b/tests/congruence_tests/test_common.py index d5d7252f..9221ae7f 100644 --- a/tests/congruence_tests/test_common.py +++ b/tests/congruence_tests/test_common.py @@ -11,12 +11,14 @@ from tests.congruence_tests.settings import TIMEOUT from tests.fixtures.points import generate_points +from tests.fixtures.points import text_vector_size + COLLECTION_NAME = "congruence_test_collection" # dense vectors sizes -text_vector_size = 50 -image_vector_size = 100 -code_vector_size = 80 +text_vector_size = text_vector_size # todo 384 +image_vector_size = text_vector_size # todo 384 +code_vector_size = text_vector_size # todo 384 # sparse vectors sizes sparse_text_vector_size = 100 diff --git a/tests/congruence_tests/test_discovery.py b/tests/congruence_tests/test_discovery.py index f3803dac..23b75b28 100644 --- a/tests/congruence_tests/test_discovery.py +++ b/tests/congruence_tests/test_discovery.py @@ -16,12 +16,13 @@ init_remote, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import sample_queries secondary_collection_name = "congruence_secondary_collection" def random_vector(dims: int) -> list[float]: - return np.random.random(dims).round(3).tolist() + return sample_queries(1)[0] @pytest.fixture(scope="module") diff --git a/tests/congruence_tests/test_group_search.py b/tests/congruence_tests/test_group_search.py index dbe01f1c..cad1155b 100644 --- a/tests/congruence_tests/test_group_search.py +++ b/tests/congruence_tests/test_group_search.py @@ -18,6 +18,7 @@ text_vector_size, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import sample_queries LOOKUP_COLLECTION_NAME = "lookup_collection" @@ -26,9 +27,10 @@ class TestGroupSearcher: __test__ = False def __init__(self): - self.query_text = np.random.random(text_vector_size).tolist() - self.query_image = np.random.random(image_vector_size).tolist() - self.query_code = np.random.random(code_vector_size).tolist() + queries = sample_queries(3) + self.query_text = queries[0] + self.query_image = queries[1] + self.query_code = queries[2] self.group_by = "rand_digit" self.group_size = 1 self.limit = 10 @@ -217,8 +219,8 @@ def group_by_keys(): def test_group_search_types(): - fixture_points = generate_fixtures(vectors_sizes=50) - vectors_config = models.VectorParams(size=50, distance=models.Distance.EUCLID) + fixture_points = generate_fixtures(vectors_sizes=text_vector_size) + vectors_config = models.VectorParams(size=text_vector_size, distance=models.Distance.EUCLID) searcher = TestGroupSearcher() @@ -228,7 +230,8 @@ def test_group_search_types(): remote_client = init_remote() init_client(remote_client, fixture_points, vectors_config=vectors_config) - query_vector_np = np.random.random(text_vector_size) + query_vector_list = sample_queries(1)[0] + query_vector_np = np.array(query_vector_list) compare_client_results( local_client, remote_client, @@ -236,9 +239,8 @@ def test_group_search_types(): query_vector=query_vector_np, ) - query_vector_list = query_vector_np.tolist() compare_client_results( - local_client, remote_client, searcher.group_search, query_vector=query_vector_list + local_client, remote_client, searcher.group_search, query_vector=query_vector_np ) delete_fixture_collection(local_client) diff --git a/tests/congruence_tests/test_multivector_discovery_queries.py b/tests/congruence_tests/test_multivector_discovery_queries.py index 74dede34..12285a11 100644 --- a/tests/congruence_tests/test_multivector_discovery_queries.py +++ b/tests/congruence_tests/test_multivector_discovery_queries.py @@ -17,7 +17,7 @@ init_remote, multi_vector_config, ) -from tests.fixtures.points import generate_random_multivector +from tests.fixtures.points import sample_random_multivector secondary_collection_name = "congruence_secondary_collection" @@ -126,10 +126,10 @@ def test_context_many_pairs( http_client, grpc_client, ): - random_image_multivector_1 = generate_random_multivector( + random_image_multivector_1 = sample_random_multivector( image_vector_size, random.randint(2, 30) ) - random_image_multivector_2 = generate_random_multivector( + random_image_multivector_2 = sample_random_multivector( image_vector_size, random.randint(2, 30) ) @@ -227,9 +227,7 @@ def test_discover_raw_target( http_client, grpc_client, ): - random_image_multivector = generate_random_multivector( - image_vector_size, random.randint(2, 30) - ) + random_image_multivector = sample_random_multivector(image_vector_size, random.randint(2, 30)) def f(client: QdrantBase, **kwargs: dict[str, Any]) -> list[models.ScoredPoint]: return client.query_points( @@ -253,9 +251,7 @@ def test_context_raw_positive( http_client, grpc_client, ): - random_image_multivector = generate_random_multivector( - image_vector_size, random.randint(2, 30) - ) + random_image_multivector = sample_random_multivector(image_vector_size, random.randint(2, 30)) def f(client: QdrantBase, **kwargs: dict[str, Any]) -> list[models.ScoredPoint]: return client.query_points( @@ -416,7 +412,7 @@ def f(client: QdrantBase, **kwargs: dict[str, Any]) -> list[list[models.ScoredPo def test_query_with_nan(): fixture_points = generate_multivector_fixtures(20) - vector = generate_random_multivector(image_vector_size, random.randint(2, 30)) + vector = sample_random_multivector(image_vector_size, random.randint(2, 30)) vector[0][1] = np.nan using = "multi-image" diff --git a/tests/congruence_tests/test_query.py b/tests/congruence_tests/test_query.py index 85d270bd..eef3b961 100644 --- a/tests/congruence_tests/test_query.py +++ b/tests/congruence_tests/test_query.py @@ -33,6 +33,7 @@ generate_random_multivector, ) from tests.utils import read_version +from tests.fixtures.points import sample_queries SECONDARY_COLLECTION_NAME = "congruence_secondary_collection" @@ -46,12 +47,15 @@ def __init__(self): self.group_size = 3 self.limit = 2 # number of groups + sampled_queries = sample_queries(4) + self.query_image = sampled_queries[0] + # dense query vectors - self.dense_vector_query_text = np.random.random(text_vector_size).tolist() - self.dense_vector_query_text_bis = self.dense_vector_query_text + self.dense_vector_query_text = sampled_queries[1] + self.dense_vector_query_text_bis = sampled_queries[1] self.dense_vector_query_text_bis[0] += 42.0 # slightly different vector - self.dense_vector_query_image = np.random.random(image_vector_size).tolist() - self.dense_vector_query_code = np.random.random(code_vector_size).tolist() + self.dense_vector_query_image = sampled_queries[2] + self.dense_vector_query_code = sampled_queries[3] # sparse query vectors self.sparse_vector_query_text = generate_random_sparse_vector( @@ -1458,9 +1462,11 @@ def test_original_input_persistence(): # the reason was that we were replacing point id with a sparse vector, and then, when we needed a dense vector # from the same point id, we already had point id replaced with a sparse vector num_points = 50 - vectors_config = {"text": models.VectorParams(size=50, distance=models.Distance.COSINE)} + vectors_config = { + "text": models.VectorParams(size=text_vector_size, distance=models.Distance.COSINE) + } sparse_vectors_config = {"sparse-text": models.SparseVectorParams()} - fixture_points = generate_fixtures(vectors_sizes={"text": 50}, num=num_points) + fixture_points = generate_fixtures(vectors_sizes={"text": text_vector_size}, num=num_points) sparse_fixture_points = generate_sparse_fixtures(num=num_points) points = [ models.PointStruct( diff --git a/tests/congruence_tests/test_query_batch.py b/tests/congruence_tests/test_query_batch.py index 2f8376a7..09342072 100644 --- a/tests/congruence_tests/test_query_batch.py +++ b/tests/congruence_tests/test_query_batch.py @@ -19,7 +19,11 @@ generate_multivector_fixtures, multi_vector_config, ) -from tests.fixtures.points import generate_random_sparse_vector, generate_random_multivector +from tests.fixtures.points import ( + generate_random_sparse_vector, + generate_random_multivector, + sample_queries, +) class TestQueryBatchSearcher: @@ -39,12 +43,11 @@ def __init__(self): self.multivector_query_batch_code = [] for _ in range(4): + vecs = sample_queries(4) self.dense_vector_query_batch_text.append( models.QueryRequest( - query=np.random.random(text_vector_size).tolist(), - prefetch=models.Prefetch( - query=np.random.random(text_vector_size).tolist(), limit=5, using="text" - ), + query=vecs[0], + prefetch=models.Prefetch(query=vecs[1], limit=5, using="text"), limit=5, using="text", with_payload=True, @@ -52,7 +55,7 @@ def __init__(self): ) self.dense_vector_query_batch_image.append( models.QueryRequest( - query=np.random.random(image_vector_size).tolist(), + query=vecs[2], limit=5, using="image", with_payload=True, @@ -60,7 +63,7 @@ def __init__(self): ) self.dense_vector_query_batch_code.append( models.QueryRequest( - query=np.random.random(code_vector_size).tolist(), + query=vecs[3], limit=5, using="code", with_payload=True, @@ -101,16 +104,13 @@ def __init__(self): ) ) + vecs = sample_queries(2) self.dense_vector_query_batch_text_dbsf = [ models.QueryRequest( query=models.FusionQuery(fusion=models.Fusion.DBSF), prefetch=[ - models.Prefetch( - query=np.random.random(text_vector_size).tolist(), using="text" - ), - models.Prefetch( - query=np.random.random(text_vector_size).tolist(), using="text" - ), + models.Prefetch(query=vecs[0], using="text"), + models.Prefetch(query=vecs[1], using="text"), ], with_payload=True, ) diff --git a/tests/congruence_tests/test_recommendation.py b/tests/congruence_tests/test_recommendation.py index 2ad38857..dba1c3bb 100644 --- a/tests/congruence_tests/test_recommendation.py +++ b/tests/congruence_tests/test_recommendation.py @@ -16,6 +16,7 @@ init_remote, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import sample_queries secondary_collection_name = "congruence_secondary_collection" @@ -24,7 +25,7 @@ class TestSimpleRecommendation: __test__ = False def __init__(self): - self.query_image = np.random.random(image_vector_size).tolist() + self.query_image = sample_queries(1)[0] @classmethod def simple_recommend_image(cls, client: QdrantBase) -> list[models.ScoredPoint]: @@ -291,7 +292,6 @@ def test_recommend_from_another_collection(): def test_simple_recommend() -> None: fixture_points = generate_fixtures() - secondary_collection_points = generate_fixtures(100) searcher = TestSimpleRecommendation() diff --git a/tests/congruence_tests/test_search.py b/tests/congruence_tests/test_search.py index 28213a93..3b95bc2d 100644 --- a/tests/congruence_tests/test_search.py +++ b/tests/congruence_tests/test_search.py @@ -16,15 +16,19 @@ text_vector_size, ) from tests.fixtures.filters import one_random_filter_please +from tests.fixtures.points import generate_vectors class TestSimpleSearcher: __test__ = False def __init__(self): - self.query_text = np.random.random(text_vector_size).tolist() - self.query_image = np.random.random(image_vector_size).tolist() - self.query_code = np.random.random(code_vector_size).tolist() + _text_vectors = generate_vectors() + sampled_vectors = np.random.choice(len(_text_vectors), size=3, replace=False) + + self.query_text = _text_vectors[sampled_vectors[0]] + self.query_image = _text_vectors[sampled_vectors[1]] + self.query_code = _text_vectors[sampled_vectors[2]] def simple_search_text(self, client: QdrantBase) -> list[models.ScoredPoint]: return client.search( diff --git a/tests/embed_tests/test_local_inference.py b/tests/embed_tests/test_local_inference.py index eb8353ca..4a3cab32 100644 --- a/tests/embed_tests/test_local_inference.py +++ b/tests/embed_tests/test_local_inference.py @@ -17,11 +17,11 @@ LateInteractionTextEmbedding, ImageEmbedding, ) - +from tests.congruence_tests.test_common import text_vector_size COLLECTION_NAME = "inference_collection" DENSE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" -DENSE_DIM = 384 +DENSE_DIM = text_vector_size SPARSE_MODEL_NAME = "Qdrant/bm42-all-minilm-l6-v2-attentions" COLBERT_MODEL_NAME = "answerdotai/answerai-colbert-small-v1" COLBERT_DIM = 96 diff --git a/tests/fixtures/points.py b/tests/fixtures/points.py index acb9c0c2..5c745341 100644 --- a/tests/fixtures/points.py +++ b/tests/fixtures/points.py @@ -10,16 +10,44 @@ from qdrant_client.local.sparse import validate_sparse_vector from tests.fixtures.payload import one_random_payload_please +text_vector_size = 20 -def random_vectors( - vector_sizes: Union[dict[str, int], int], -) -> models.VectorStruct: + +def generate_vectors(): + _text_vectors = np.load("data/text.npy", mmap_mode="r")[..., :text_vector_size] + _text_vectors_unique = np.unique(_text_vectors, axis=0) + _text_vectors_clean = _text_vectors_unique[ + ~np.isnan(_text_vectors_unique).any(axis=1) + ].tolist() + return _text_vectors_clean + + +_text_vectors_clean = generate_vectors() + + +def sample_queries(n: int) -> list[np.array]: + _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)[ + ..., :text_vector_size + ] + _query_vectors_unique = np.unique(_query_vectors, axis=0) + _query_vectors = _query_vectors_unique.tolist() + sampled_vectors = np.random.choice(len(_query_vectors), size=n, replace=False) + return [_query_vectors[i].copy() for i in sampled_vectors] + + +def random_vectors(vector_sizes: Union[dict[str, int], int], idx=None) -> models.VectorStruct: if isinstance(vector_sizes, int): - return np.random.random(vector_sizes).round(3).tolist() + if idx: + return _text_vectors_clean[idx].copy() + else: + return np.random.random(vector_sizes).tolist() # .round(3) elif isinstance(vector_sizes, dict): vectors = {} for vector_name, vector_size in vector_sizes.items(): - vectors[vector_name] = np.random.random(vector_size).round(3).tolist() + if idx: + vectors[vector_name] = _text_vectors_clean[idx].copy() + else: + vectors[vector_name] = np.random.random(vector_size).tolist() # .round(3) return vectors else: raise ValueError("vector_sizes must be int or dict") @@ -28,12 +56,12 @@ def random_vectors( def random_multivectors(vector_sizes: Union[dict[str, int], int]) -> models.VectorStruct: if isinstance(vector_sizes, int): vec_count = random.randint(1, 10) - return generate_random_multivector(vector_sizes, vec_count) + return sample_random_multivector(vector_sizes, vec_count) elif isinstance(vector_sizes, dict): vectors = {} for vector_name, vector_size in vector_sizes.items(): vec_count = random.randint(1, 10) - vectors[vector_name] = generate_random_multivector(vector_size, vec_count) + vectors[vector_name] = sample_random_multivector(vector_size, vec_count) return vectors else: raise ValueError("vector_sizes must be int or dict") @@ -42,16 +70,23 @@ def random_multivectors(vector_sizes: Union[dict[str, int], int]) -> models.Vect def generate_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]: multivec = [] for _ in range(vec_count): - multivec.append(np.random.random(vec_size).round(3).tolist()) + multivec.append(np.random.random(vec_size).tolist()) # .round(3). return multivec +def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]: + doc_vectors = _text_vectors_clean.copy() + sampled_vectors = np.random.choice(len(doc_vectors), size=vec_count, replace=False) + return [np.array(doc_vectors[i]).astype(np.float32).tolist() for i in sampled_vectors] + + # Generate random sparse vector with given size and density # The density is the probability of non-zero value over the whole vector def generate_random_sparse_vector(size: int, density: float) -> SparseVector: num_non_zero = int(size * density) indices: list[int] = random.sample(range(size), num_non_zero) - values: list[float] = [round(random.random(), 6) for _ in range(num_non_zero)] + values: list[float] = [random.random() for _ in range(num_non_zero)] + sparse_vector = SparseVector(indices=indices, values=values) validate_sparse_vector(sparse_vector) return sparse_vector @@ -100,7 +135,10 @@ def generate_points( if skip_vectors and isinstance(vector_sizes, int): raise ValueError("skip_vectors is not supported for single vector") + doc_vectors = _text_vectors_clean.copy() + sampled_vectors = np.random.choice(len(doc_vectors), size=num_points, replace=False) points = [] + for i in range(num_points): payload = None if with_payload: @@ -114,8 +152,16 @@ def generate_points( vectors = random_sparse_vectors(vector_sizes, even=even_sparse) elif multivector: vectors = random_multivectors(vector_sizes) + if isinstance(vectors, dict): + for name, vec in vectors.items(): + assert np.array(vec).dtype.kind in ("f", "i") # float or int + assert not np.isnan(np.array(vec)).any() + else: + for name, vec in enumerate(vectors): + assert np.array(vec).dtype.kind in ("f", "i") # float or int + assert not np.isnan(np.array(vec)).any() else: - vectors = random_vectors(vector_sizes) + vectors = random_vectors(vector_sizes, sampled_vectors[i]) if skip_vectors: if random.random() > 0.8: diff --git a/tests/test_fastembed.py b/tests/test_fastembed.py index d089dd9b..ef8216bc 100644 --- a/tests/test_fastembed.py +++ b/tests/test_fastembed.py @@ -5,6 +5,7 @@ from tests.congruence_tests.test_common import compare_client_results from tests.utils import read_version +from tests.congruence_tests.test_common import text_vector_size DOCS_EXAMPLE = { @@ -126,7 +127,7 @@ def test_set_model(): # Check if the model is initialized & cls.embeddings_models is set with expected values dim, dist = local_client._get_model_params(embedding_model_name) - assert dim == 384 + assert dim == text_vector_size # Use the initialized model to add documents with vector embeddings local_client.add(collection_name=collection_name, **DOCS_EXAMPLE) @@ -209,7 +210,7 @@ def test_get_embedding_size(): if not local_client._FASTEMBED_INSTALLED: pytest.skip("FastEmbed is not installed, skipping test") - assert local_client.get_embedding_size() == 384 + assert local_client.get_embedding_size() == text_vector_size assert local_client.get_embedding_size(model_name="BAAI/bge-base-en-v1.5") == 768