-
Notifications
You must be signed in to change notification settings - Fork 146
Draft: Real vectors for tests #980
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from 3 commits
5adcf93
1073e87
5319ed7
888d94e
5d304e8
7c577db
71ad426
1e8c1cc
661e198
d62575d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -10,16 +10,32 @@ | |||||||||||||||||||||||||||||
from qdrant_client.local.sparse import validate_sparse_vector | ||||||||||||||||||||||||||||||
from tests.fixtures.payload import one_random_payload_please | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
# Pool of real text embeddings used by the fixtures in this module.
# The path is relative to the current working directory, so importing this
# module fails with FileNotFoundError when run from a directory that has no
# data/text.npy. mmap_mode="r" memory-maps the file read-only.
_text_vectors = np.load("data/text.npy", mmap_mode="r")
# Collapse duplicate rows (np.unique with axis=0 also returns the rows sorted).
_text_vectors_unique = np.unique(_text_vectors, axis=0)
# Drop every row that contains a NaN, then convert the result to plain Python lists.
_text_vectors_clean = _text_vectors_unique[~np.isnan(_text_vectors_unique).any(axis=1)].tolist()
|
||||||||||||||||||||||||||||||
def random_vectors( | ||||||||||||||||||||||||||||||
vector_sizes: Union[dict[str, int], int], | ||||||||||||||||||||||||||||||
) -> models.VectorStruct: | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
# Lazily populated by _load_query_vectors(); None until the first call.
_query_vectors_cache = None


def _load_query_vectors() -> list[list[float]]:
    """Load, de-duplicate and cache the query vectors from data/queries.npy."""
    global _query_vectors_cache
    if _query_vectors_cache is None:
        # NOTE(security): allow_pickle=True unpickles arbitrary objects while
        # loading. Keep it only as long as data/queries.npy is a trusted,
        # repository-controlled file.
        raw = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
        _query_vectors_cache = np.unique(raw, axis=0).tolist()
    return _query_vectors_cache


def sample_queries(n: int) -> list[list[float]]:
    """Return ``n`` distinct query vectors sampled without replacement.

    The vectors come from ``data/queries.npy`` (path relative to the current
    working directory). The file is read and de-duplicated on the first call
    only; later calls reuse the cached pool.

    Args:
        n: number of query vectors to return.

    Returns:
        A list of ``n`` vectors, each a fresh list of floats, so callers may
        mutate them without affecting the cached pool.

    Raises:
        ValueError: if ``n`` exceeds the number of unique query vectors.
        FileNotFoundError: if ``data/queries.npy`` does not exist.
    """
    pool = _load_query_vectors()
    if n > len(pool):
        raise ValueError(
            f"cannot sample {n} query vectors from a pool of {len(pool)} unique vectors"
        )
    picked = np.random.choice(len(pool), size=n, replace=False)
    # Copy each vector so the cached pool is never aliased by callers.
    return [list(pool[i]) for i in picked]
|
||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion: Improve data loading efficiency and error handling. The `sample_queries` function reloads `data/queries.npy` and recomputes the unique rows on every call. Consider loading the query vectors once at module level, similar to how you handle `_text_vectors`:
+ _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
+ _query_vectors_unique = np.unique(_query_vectors, axis=0)
+ _query_vectors_clean = _query_vectors_unique.tolist()
def sample_queries(n: int) -> list[np.array]:
- _query_vectors = np.load("data/queries.npy", allow_pickle=True).astype(np.float32)
- _query_vectors_unique = np.unique(_query_vectors, axis=0)
- _query_vectors = _query_vectors_unique.tolist()
- sampled_vectors = np.random.choice(len(_query_vectors), size=n, replace=False)
- return [_query_vectors[i] for i in sampled_vectors]
+ sampled_vectors = np.random.choice(len(_query_vectors_clean), size=n, replace=False)
+ return [_query_vectors_clean[i] for i in sampled_vectors]
Also, add exception handling to provide better error messages if the files can't be loaded. 📝 Committable suggestion
Suggested change
|
||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
def random_vectors(vector_sizes: Union[dict[str, int], int], idx=None) -> models.VectorStruct:
    """Build a dense vector, or a dict of named dense vectors, for one point.

    Args:
        vector_sizes: either a single dimension (unnamed vector) or a mapping
            of vector name to dimension (named vectors).
        idx: optional index into the preloaded real text vectors
            (``_text_vectors_clean``). When it is ``None`` the vectors are
            generated randomly with values rounded to 3 decimals.

    Returns:
        A list of floats when ``vector_sizes`` is an int, otherwise a dict
        mapping each vector name to a list of floats.

    Raises:
        ValueError: if ``vector_sizes`` is neither an int nor a dict.
    """
    # Compare against None explicitly: index 0 is a valid row, and a plain
    # truthiness check would silently replace it with a random vector.
    use_real_vector = idx is not None

    def _one_vector(size: int) -> list[float]:
        if use_real_vector:
            # NOTE(review): the preloaded vector is returned whatever `size`
            # is requested; confirm the dataset dimension matches the callers.
            # Copy so callers never alias the shared pool.
            return list(_text_vectors_clean[idx])
        return np.random.random(size).round(3).tolist()

    if isinstance(vector_sizes, int):
        return _one_vector(vector_sizes)
    if isinstance(vector_sizes, dict):
        return {
            vector_name: _one_vector(vector_size)
            for vector_name, vector_size in vector_sizes.items()
        }
    raise ValueError("vector_sizes must be int or dict")
|
@@ -28,12 +44,12 @@ def random_vectors( | |||||||||||||||||||||||||||||
def random_multivectors(vector_sizes: Union[dict[str, int], int]) -> models.VectorStruct:
    """Build a multivector, or a dict of named multivectors, for one point.

    Each multivector holds between 1 and 10 vectors (inclusive, chosen with
    ``random.randint``) obtained from ``sample_random_multivector``.

    Args:
        vector_sizes: either a single dimension (unnamed multivector) or a
            mapping of vector name to dimension (named multivectors).

    Returns:
        A list of vectors when ``vector_sizes`` is an int, otherwise a dict
        mapping each vector name to a list of vectors.

    Raises:
        ValueError: if ``vector_sizes`` is neither an int nor a dict.
    """
    if isinstance(vector_sizes, int):
        return sample_random_multivector(vector_sizes, random.randint(1, 10))
    if isinstance(vector_sizes, dict):
        # One independent vector count is drawn per named vector.
        return {
            vector_name: sample_random_multivector(vector_size, random.randint(1, 10))
            for vector_name, vector_size in vector_sizes.items()
        }
    raise ValueError("vector_sizes must be int or dict")
|
@@ -46,6 +62,11 @@ def generate_random_multivector(vec_size: int, vec_count: int) -> list[list[floa | |||||||||||||||||||||||||||||
return multivec | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]:
    """Assemble a multivector from ``vec_count`` distinct preloaded text vectors.

    Rows are drawn without replacement from ``_text_vectors_clean``.

    Args:
        vec_size: requested dimension. It is not used by this function, so
            the returned vectors keep the dimension of the preloaded data.
        vec_count: number of vectors to include in the multivector.

    Returns:
        A list of ``vec_count`` vectors taken from the preloaded pool.
    """
    pool_size = len(_text_vectors_clean)
    chosen_rows = np.random.choice(pool_size, size=vec_count, replace=False)
    multivector = []
    for row in chosen_rows:
        multivector.append(_text_vectors_clean[row])
    return multivector
|
||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion: Add dimension validation to `sample_random_multivector`. The function accepts `vec_size` but never uses it, so the sampled vectors may not have the requested dimension. Add dimension validation:
def sample_random_multivector(vec_size: int, vec_count: int) -> list[list[float]]:
sampled_vectors = np.random.choice(len(_text_vectors_clean), size=vec_count, replace=False)
+ # Verify vector dimensions match
+ for i in sampled_vectors:
+ if len(_text_vectors_clean[i]) != vec_size:
+ raise ValueError(f"Preloaded vector dimension {len(_text_vectors_clean[i])} does not match requested dimension {vec_size}")
return [_text_vectors_clean[i] for i in sampled_vectors]
This will help catch dimension mismatches early. 📝 Committable suggestion
Suggested change
|
||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
# Generate random sparse vector with given size and density | ||||||||||||||||||||||||||||||
# The density is the probability of non-zero value over the whole vector | ||||||||||||||||||||||||||||||
def generate_random_sparse_vector(size: int, density: float) -> SparseVector: | ||||||||||||||||||||||||||||||
|
@@ -100,7 +121,9 @@ def generate_points( | |||||||||||||||||||||||||||||
if skip_vectors and isinstance(vector_sizes, int): | ||||||||||||||||||||||||||||||
raise ValueError("skip_vectors is not supported for single vector") | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
sampled_vectors = np.random.choice(len(_text_vectors_clean), size=num_points, replace=False) | ||||||||||||||||||||||||||||||
points = [] | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
for i in range(num_points): | ||||||||||||||||||||||||||||||
payload = None | ||||||||||||||||||||||||||||||
if with_payload: | ||||||||||||||||||||||||||||||
|
@@ -115,7 +138,7 @@ def generate_points( | |||||||||||||||||||||||||||||
elif multivector: | ||||||||||||||||||||||||||||||
vectors = random_multivectors(vector_sizes) | ||||||||||||||||||||||||||||||
else: | ||||||||||||||||||||||||||||||
vectors = random_vectors(vector_sizes) | ||||||||||||||||||||||||||||||
vectors = random_vectors(vector_sizes, sampled_vectors[i]) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
if skip_vectors: | ||||||||||||||||||||||||||||||
if random.random() > 0.8: | ||||||||||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Add fallback mechanism for missing data files
Currently, if the data files are missing, the test will fail completely. Consider adding a fallback mechanism that uses randomly generated vectors when the data files are not found.
Implement a try-except block to gracefully handle missing data files:
Then modify the functions to check `_using_real_vectors` and fall back to random generation when needed. This would make the tests more robust and still allow them to run without the data files.
Also applies to: 18-24, 65-68, 124-124
🧰 Tools
🪛 GitHub Actions: Integration tests
[error] 13-13: FileNotFoundError: No such file or directory: 'data/text.npy'. The test failed because the required data file 'data/text.npy' is missing.
Fix the file path reference issue
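The code is attempting to load data files with hardcoded paths, but the pipeline has failed with a `FileNotFoundError: No such file or directory: 'data/text.npy'`. This indicates that either the data files are not present in the repository or the relative path does not resolve from the directory the tests are run in. Consider one of these solutions: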
Don't forget to add the `import os` statement at the top of the file. 📝 Committable suggestion
🧰 Tools
🪛 GitHub Actions: Integration tests
[error] 13-13: FileNotFoundError: No such file or directory: 'data/text.npy'. The test failed because the required data file 'data/text.npy' is missing.