Draft: Real vectors for tests #980

Open · wants to merge 10 commits into base: dev

Replaces randomly generated test vectors with real, precomputed embeddings sampled from data/text.npy and data/queries.npy, and unifies the dense test vector sizes at 384.

Changes from 3 commits
2 changes: 1 addition & 1 deletion tests/congruence_tests/test_collections.py
@@ -101,7 +101,7 @@ def test_collection_exists():


 def test_init_from():
-    vector_size = 2
+    vector_size = 384

     remote_client = init_remote()
     local_client = init_local()
12 changes: 6 additions & 6 deletions tests/congruence_tests/test_common.py
@@ -14,14 +14,14 @@
 COLLECTION_NAME = "congruence_test_collection"

 # dense vectors sizes
-text_vector_size = 50
-image_vector_size = 100
-code_vector_size = 80
+text_vector_size = 384
+image_vector_size = 384
+code_vector_size = 384

 # sparse vectors sizes
-sparse_text_vector_size = 100
-sparse_image_vector_size = 1_000
-sparse_code_vector_size = 10_000
+sparse_text_vector_size = 384
+sparse_image_vector_size = 384
+sparse_code_vector_size = 384

 # number of vectors to generate
 NUM_VECTORS = 1000
3 changes: 2 additions & 1 deletion tests/congruence_tests/test_discovery.py
@@ -16,12 +16,13 @@
     init_remote,
 )
 from tests.fixtures.filters import one_random_filter_please
+from tests.fixtures.points import sample_queries

 secondary_collection_name = "congruence_secondary_collection"


 def random_vector(dims: int) -> list[float]:
-    return np.random.random(dims).round(3).tolist()
+    return sample_queries(1)[0]


 @pytest.fixture(scope="module")
19 changes: 10 additions & 9 deletions tests/congruence_tests/test_group_search.py
@@ -18,6 +18,7 @@
     text_vector_size,
 )
 from tests.fixtures.filters import one_random_filter_please
+from tests.fixtures.points import sample_queries

 LOOKUP_COLLECTION_NAME = "lookup_collection"

@@ -26,9 +27,10 @@ class TestGroupSearcher:
     __test__ = False

     def __init__(self):
-        self.query_text = np.random.random(text_vector_size).tolist()
-        self.query_image = np.random.random(image_vector_size).tolist()
-        self.query_code = np.random.random(code_vector_size).tolist()
+        queries = sample_queries(3)
+        self.query_text = queries[0]
+        self.query_image = queries[1]
+        self.query_code = queries[2]
         self.group_by = "rand_digit"
         self.group_size = 1
         self.limit = 10

@@ -217,8 +219,8 @@ def group_by_keys():


 def test_group_search_types():
-    fixture_points = generate_fixtures(vectors_sizes=50)
-    vectors_config = models.VectorParams(size=50, distance=models.Distance.EUCLID)
+    fixture_points = generate_fixtures(vectors_sizes=text_vector_size)
+    vectors_config = models.VectorParams(size=text_vector_size, distance=models.Distance.EUCLID)

     searcher = TestGroupSearcher()

@@ -228,17 +230,16 @@ def test_group_search_types():
     remote_client = init_remote()
     init_client(remote_client, fixture_points, vectors_config=vectors_config)

-    query_vector_np = np.random.random(text_vector_size)
+    query_vector_np = sample_queries(1)[0]
     compare_client_results(
         local_client,
         remote_client,
         searcher.group_search,
-        query_vector=query_vector_np,
+        query_vector=np.array(query_vector_np),
     )

-    query_vector_list = query_vector_np.tolist()
     compare_client_results(
-        local_client, remote_client, searcher.group_search, query_vector=query_vector_list
+        local_client, remote_client, searcher.group_search, query_vector=query_vector_np
     )

     delete_fixture_collection(local_client)
18 changes: 12 additions & 6 deletions tests/congruence_tests/test_query.py
@@ -33,6 +33,7 @@
     generate_random_multivector,
 )
 from tests.utils import read_version
+from tests.fixtures.points import sample_queries

 SECONDARY_COLLECTION_NAME = "congruence_secondary_collection"

@@ -46,12 +47,15 @@ def __init__(self):
         self.group_size = 3
         self.limit = 2  # number of groups

+        sampled_queries = sample_queries(4)
+        self.query_image = sampled_queries[0]
+
         # dense query vectors
-        self.dense_vector_query_text = np.random.random(text_vector_size).tolist()
-        self.dense_vector_query_text_bis = self.dense_vector_query_text
+        self.dense_vector_query_text = sampled_queries[1]
+        self.dense_vector_query_text_bis = sampled_queries[1]
         self.dense_vector_query_text_bis[0] += 42.0  # slightly different vector
-        self.dense_vector_query_image = np.random.random(image_vector_size).tolist()
-        self.dense_vector_query_code = np.random.random(code_vector_size).tolist()
+        self.dense_vector_query_image = sampled_queries[2]
+        self.dense_vector_query_code = sampled_queries[3]

         # sparse query vectors
         self.sparse_vector_query_text = generate_random_sparse_vector(

@@ -1458,9 +1462,11 @@ def test_original_input_persistence():
     # the reason was that we were replacing point id with a sparse vector, and then, when we needed a dense vector
     # from the same point id, we already had point id replaced with a sparse vector
     num_points = 50
-    vectors_config = {"text": models.VectorParams(size=50, distance=models.Distance.COSINE)}
+    vectors_config = {
+        "text": models.VectorParams(size=text_vector_size, distance=models.Distance.COSINE)
+    }
     sparse_vectors_config = {"sparse-text": models.SparseVectorParams()}
-    fixture_points = generate_fixtures(vectors_sizes={"text": 50}, num=num_points)
+    fixture_points = generate_fixtures(vectors_sizes={"text": text_vector_size}, num=num_points)
     sparse_fixture_points = generate_sparse_fixtures(num=num_points)
     points = [
         models.PointStruct(
26 changes: 13 additions & 13 deletions tests/congruence_tests/test_query_batch.py
@@ -19,7 +19,11 @@
     generate_multivector_fixtures,
     multi_vector_config,
 )
-from tests.fixtures.points import generate_random_sparse_vector, generate_random_multivector
+from tests.fixtures.points import (
+    generate_random_sparse_vector,
+    generate_random_multivector,
+    sample_queries,
+)


 class TestQueryBatchSearcher:

@@ -39,28 +43,27 @@ def __init__(self):
         self.multivector_query_batch_code = []

         for _ in range(4):
+            vecs = sample_queries(4)
             self.dense_vector_query_batch_text.append(
                 models.QueryRequest(
-                    query=np.random.random(text_vector_size).tolist(),
-                    prefetch=models.Prefetch(
-                        query=np.random.random(text_vector_size).tolist(), limit=5, using="text"
-                    ),
+                    query=vecs[0],
+                    prefetch=models.Prefetch(query=vecs[1], limit=5, using="text"),
                     limit=5,
                     using="text",
                     with_payload=True,
                 )
             )
             self.dense_vector_query_batch_image.append(
                 models.QueryRequest(
-                    query=np.random.random(image_vector_size).tolist(),
+                    query=vecs[2],
                     limit=5,
                     using="image",
                     with_payload=True,
                 )
             )
             self.dense_vector_query_batch_code.append(
                 models.QueryRequest(
-                    query=np.random.random(code_vector_size).tolist(),
+                    query=vecs[3],
                     limit=5,
                     using="code",
                     with_payload=True,

@@ -101,16 +104,13 @@ def __init__(self):
                 )
             )

+        vecs = sample_queries(2)
         self.dense_vector_query_batch_text_dbsf = [
             models.QueryRequest(
                 query=models.FusionQuery(fusion=models.Fusion.DBSF),
                 prefetch=[
-                    models.Prefetch(
-                        query=np.random.random(text_vector_size).tolist(), using="text"
-                    ),
-                    models.Prefetch(
-                        query=np.random.random(text_vector_size).tolist(), using="text"
-                    ),
+                    models.Prefetch(query=vecs[0], using="text"),
+                    models.Prefetch(query=vecs[1], using="text"),
                 ],
                 with_payload=True,
             )
4 changes: 2 additions & 2 deletions tests/congruence_tests/test_recommendation.py
@@ -16,6 +16,7 @@
     init_remote,
 )
 from tests.fixtures.filters import one_random_filter_please
+from tests.fixtures.points import sample_queries

 secondary_collection_name = "congruence_secondary_collection"

@@ -24,7 +25,7 @@ class TestSimpleRecommendation:
     __test__ = False

     def __init__(self):
-        self.query_image = np.random.random(image_vector_size).tolist()
+        self.query_image = sample_queries(1)[0]

     @classmethod
     def simple_recommend_image(cls, client: QdrantBase) -> list[models.ScoredPoint]:

@@ -291,7 +292,6 @@ def test_recommend_from_another_collection():

 def test_simple_recommend() -> None:
     fixture_points = generate_fixtures()
-
     secondary_collection_points = generate_fixtures(100)

     searcher = TestSimpleRecommendation()
11 changes: 8 additions & 3 deletions tests/congruence_tests/test_search.py
@@ -22,9 +22,14 @@ class TestSimpleSearcher:
     __test__ = False

     def __init__(self):
-        self.query_text = np.random.random(text_vector_size).tolist()
-        self.query_image = np.random.random(image_vector_size).tolist()
-        self.query_code = np.random.random(code_vector_size).tolist()
+        _text_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
+        _text_vectors_unique = np.unique(_text_vectors, axis=0)
+        _text_vectors = _text_vectors_unique.tolist()
+        sampled_vectors = np.random.choice(len(_text_vectors), size=3, replace=False)
+
+        self.query_text = _text_vectors[sampled_vectors[0]]
+        self.query_image = _text_vectors[sampled_vectors[1]]
+        self.query_code = _text_vectors[sampled_vectors[2]]

     def simple_search_text(self, client: QdrantBase) -> list[models.ScoredPoint]:
         return client.search(
39 changes: 31 additions & 8 deletions tests/fixtures/points.py
@@ -10,16 +10,32 @@
 from qdrant_client.local.sparse import validate_sparse_vector
 from tests.fixtures.payload import one_random_payload_please

+_text_vectors = np.load("data/text.npy", mmap_mode="r")
+_text_vectors_unique = np.unique(_text_vectors, axis=0)
+_text_vectors_clean = _text_vectors_unique[~np.isnan(_text_vectors_unique).any(axis=1)].tolist()

🛠️ Refactor suggestion

Add fallback mechanism for missing data files

Currently, if the data files are missing, the whole suite fails at import time. Consider adding a fallback mechanism that uses randomly generated vectors when the data files are not found.

Implement a try-except block to gracefully handle missing data files:

try:
    _text_vectors = np.load("data/text.npy", mmap_mode="r")
    _text_vectors_unique = np.unique(_text_vectors, axis=0)
    _text_vectors_clean = _text_vectors_unique[~np.isnan(_text_vectors_unique).any(axis=1)].tolist()
    _using_real_vectors = True
except FileNotFoundError:
    print("Warning: data/text.npy not found. Using randomly generated vectors instead.")
    _text_vectors_clean = []  # Empty list as fallback
    _using_real_vectors = False

try:
    _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
    _query_vectors_unique = np.unique(_query_vectors, axis=0)
    _query_vectors_clean = _query_vectors_unique.tolist()
except FileNotFoundError:
    print("Warning: data/queries.npy not found. Using randomly generated vectors instead.")
    _query_vectors_clean = []  # Empty list as fallback

Then modify the functions to check _using_real_vectors and fall back to random generation when needed:

def sample_queries(n: int) -> list[np.array]:
+    if not _using_real_vectors or len(_query_vectors_clean) == 0:
+        # Fallback to random generation
+        return [np.random.random(384).round(3).tolist() for _ in range(n)]
    sampled_vectors = np.random.choice(len(_query_vectors_clean), size=n, replace=False)
    return [_query_vectors_clean[i] for i in sampled_vectors]

This would make the tests more robust and still allow them to run without the data files.

Also applies to: 18-24, 65-68, 124-124

🧰 Tools
🪛 GitHub Actions: Integration tests

[error] 13-13: FileNotFoundError: No such file or directory: 'data/text.npy'. The test failed because the required data file 'data/text.npy' is missing.


⚠️ Potential issue

Fix the file path reference issue

The code is attempting to load data files with hardcoded paths, but the pipeline has failed with a FileNotFoundError: No such file or directory: 'data/text.npy'. This indicates that either:

  1. The data files are missing from the repository, or
  2. The path is incorrect relative to where the tests are executed

Consider one of these solutions:

  1. Include the data files in the PR or document how to obtain them
  2. Use a relative path that accounts for the project structure:
- _text_vectors = np.load("data/text.npy", mmap_mode="r")
+ _text_vectors = np.load(os.path.join(os.path.dirname(__file__), "../../data/text.npy"), mmap_mode="r")

Don't forget to add the import os statement at the top of the file.

📝 Committable suggestion


Suggested change
-_text_vectors = np.load("data/text.npy", mmap_mode="r")
-_text_vectors_unique = np.unique(_text_vectors, axis=0)
-_text_vectors_clean = _text_vectors_unique[~np.isnan(_text_vectors_unique).any(axis=1)].tolist()
+import os
+
+_text_vectors = np.load(
+    os.path.join(os.path.dirname(__file__), "../../data/text.npy"),
+    mmap_mode="r",
+)
+_text_vectors_unique = np.unique(_text_vectors, axis=0)
+_text_vectors_clean = _text_vectors_unique[
+    ~np.isnan(_text_vectors_unique).any(axis=1)
+].tolist()

-def random_vectors(
-    vector_sizes: Union[dict[str, int], int],
-) -> models.VectorStruct:
+def sample_queries(n: int) -> list[np.array]:
+    _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
+    _query_vectors_unique = np.unique(_query_vectors, axis=0)
+    _query_vectors = _query_vectors_unique.tolist()
+    sampled_vectors = np.random.choice(len(_query_vectors), size=n, replace=False)
+    return [_query_vectors[i] for i in sampled_vectors]


🛠️ Refactor suggestion

Improve data loading efficiency and error handling

The sample_queries function loads the data from disk on every call, which is inefficient. Additionally, it lacks error handling for the file loading operation.

Consider loading the query vectors once at module level, similar to how you handle _text_vectors:

+ _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
+ _query_vectors_unique = np.unique(_query_vectors, axis=0)
+ _query_vectors_clean = _query_vectors_unique.tolist()

def sample_queries(n: int) -> list[np.array]:
-    _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
-    _query_vectors_unique = np.unique(_query_vectors, axis=0)
-    _query_vectors = _query_vectors_unique.tolist()
-    sampled_vectors = np.random.choice(len(_query_vectors), size=n, replace=False)
-    return [_query_vectors[i] for i in sampled_vectors]
+    sampled_vectors = np.random.choice(len(_query_vectors_clean), size=n, replace=False)
+    return [_query_vectors_clean[i] for i in sampled_vectors]

Also, add exception handling to provide better error messages if the files can't be loaded.

📝 Committable suggestion


Suggested change
-def sample_queries(n: int) -> list[np.array]:
-    _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
-    _query_vectors_unique = np.unique(_query_vectors, axis=0)
-    _query_vectors = _query_vectors_unique.tolist()
-    sampled_vectors = np.random.choice(len(_query_vectors), size=n, replace=False)
-    return [_query_vectors[i] for i in sampled_vectors]
+# Module-level loading & preprocessing of query vectors
+_query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
+_query_vectors_unique = np.unique(_query_vectors, axis=0)
+_query_vectors_clean = _query_vectors_unique.tolist()
+
+
+def sample_queries(n: int) -> list[np.array]:
+    sampled_vectors = np.random.choice(len(_query_vectors_clean), size=n, replace=False)
+    return [_query_vectors_clean[i] for i in sampled_vectors]

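If import-time loading is undesirable (a missing file would break collection of every test that imports this module), a cached lazy loader is another option. A sketch only; _load_query_pool is a hypothetical helper:

from functools import lru_cache

import numpy as np


@lru_cache(maxsize=1)
def _load_query_pool() -> list:
    # Read and deduplicate once, on first use; later calls hit the cache.
    arr = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
    return np.unique(arr, axis=0).tolist()


def sample_queries(n: int) -> list:
    pool = _load_query_pool()
    picked = np.random.choice(len(pool), size=n, replace=False)
    return [pool[i] for i in picked]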

+def random_vectors(vector_sizes: Union[dict[str, int], int], idx=None) -> models.VectorStruct:
     if isinstance(vector_sizes, int):
-        return np.random.random(vector_sizes).round(3).tolist()
+        if idx:
+            return _text_vectors_clean[idx]
+        else:
+            return np.random.random(vector_sizes).round(3).tolist()
     elif isinstance(vector_sizes, dict):
         vectors = {}
         for vector_name, vector_size in vector_sizes.items():
-            vectors[vector_name] = np.random.random(vector_size).round(3).tolist()
+            if idx:
+                vectors[vector_name] = _text_vectors_clean[idx]
+            else:
+                vectors[vector_name] = np.random.random(vector_size).round(3).tolist()
         return vectors
     else:
         raise ValueError("vector_sizes must be int or dict")

@@ -28,12 +44,12 @@
 def random_multivectors(vector_sizes: Union[dict[str, int], int]) -> models.VectorStruct:
     if isinstance(vector_sizes, int):
         vec_count = random.randint(1, 10)
-        return generate_random_multivector(vector_sizes, vec_count)
+        return sample_random_multivector(vector_sizes, vec_count)
     elif isinstance(vector_sizes, dict):
         vectors = {}
         for vector_name, vector_size in vector_sizes.items():
             vec_count = random.randint(1, 10)
-            vectors[vector_name] = generate_random_multivector(vector_size, vec_count)
+            vectors[vector_name] = sample_random_multivector(vector_size, vec_count)
         return vectors
     else:
         raise ValueError("vector_sizes must be int or dict")
@@ -46,6 +62,11 @@ def generate_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]:
     return multivec


+def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]:
+    sampled_vectors = np.random.choice(len(_text_vectors_clean), size=vec_count, replace=False)
+    return [_text_vectors_clean[i] for i in sampled_vectors]
+
+


🛠️ Refactor suggestion

Add dimension validation to sample_random_multivector

The sample_random_multivector function doesn't validate that the dimensions of the preloaded vectors match the requested vec_size.

Add dimension validation:

def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]:
    sampled_vectors = np.random.choice(len(_text_vectors_clean), size=vec_count, replace=False)
+    # Verify vector dimensions match
+    for i in sampled_vectors:
+        if len(_text_vectors_clean[i]) != vec_size:
+            raise ValueError(f"Preloaded vector dimension {len(_text_vectors_clean[i])} does not match requested dimension {vec_size}")
    return [_text_vectors_clean[i] for i in sampled_vectors]

This will help catch dimension mismatches early.

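Since every row of a loaded .npy matrix has the same width, the check can also be done once against the preloaded array rather than per sampled vector. A sketch, assuming _text_vectors_unique is kept as a 2-D array at module level:

def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]:
    # One shape check covers the whole pool; all rows share one dimension.
    if _text_vectors_unique.shape[1] != vec_size:
        raise ValueError(
            f"Preloaded vectors are {_text_vectors_unique.shape[1]}-dimensional, "
            f"requested {vec_size}"
        )
    sampled_vectors = np.random.choice(len(_text_vectors_clean), size=vec_count, replace=False)
    return [_text_vectors_clean[i] for i in sampled_vectors]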


 # Generate random sparse vector with given size and density
 # The density is the probability of non-zero value over the whole vector
 def generate_random_sparse_vector(size: int, density: float) -> SparseVector:

@@ -100,7 +121,9 @@ def generate_points(
     if skip_vectors and isinstance(vector_sizes, int):
         raise ValueError("skip_vectors is not supported for single vector")

+    sampled_vectors = np.random.choice(len(_text_vectors_clean), size=num_points, replace=False)
     points = []
+
     for i in range(num_points):
         payload = None
         if with_payload:

@@ -115,7 +138,7 @@ def generate_points(
         elif multivector:
             vectors = random_multivectors(vector_sizes)
         else:
-            vectors = random_vectors(vector_sizes)
+            vectors = random_vectors(vector_sizes, sampled_vectors[i])

         if skip_vectors:
             if random.random() > 0.8: